GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32fc_index_max_16u.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32fc_index_max_16u
25  *
26  * \b Overview
27  *
28  * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
29  * maximum magnitude for complex points in the given vector.
30  *
31  * <b>Dispatcher Prototype</b>
32  * \code
33  * void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points)
34  * \endcode
35  *
36  * \b Inputs
37  * \li src0: The complex input vector.
38  * \li num_points: The number of samples.
39  *
40  * \b Outputs
41  * \li target: The index of the point with maximum magnitude.
42  *
43  * \b Example
44  * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
45  * the unit circle.
46  * \code
47  * int N = 10;
48  * unsigned int alignment = volk_get_alignment();
49  * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
50  * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
51  *
52  * for(unsigned int ii = 0; ii < N/2; ++ii){
53  * float real = 2.f * ((float)ii / (float)N) - 1.f;
54  * float imag = std::sqrt(1.f - real * real);
55  * in[ii] = lv_cmake(real, imag);
56  * in[ii] = in[ii] * in[ii] + in[ii];
57  * in[N-ii] = lv_cmake(real, imag);
58  * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
59  * }
60  *
61  * volk_32fc_index_max_16u(max, in, N);
62  *
63  * printf("index of max value = %u\n", *max);
64  *
65  * volk_free(in);
66  * volk_free(max);
67  * \endcode
68  */
69 
70 #ifndef INCLUDED_volk_32fc_index_max_16u_a_H
71 #define INCLUDED_volk_32fc_index_max_16u_a_H
72 
73 #include <volk/volk_common.h>
74 #include<inttypes.h>
75 #include<stdio.h>
76 #include<volk/volk_complex.h>
77 
78 #ifdef LV_HAVE_SSE3
79 #include<xmmintrin.h>
80 #include<pmmintrin.h>
81 
82 static inline void
83 volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
84  unsigned int num_points)
85 {
86  const unsigned int num_bytes = num_points*8;
87 
88  union bit128 holderf;
89  union bit128 holderi;
90  float sq_dist = 0.0;
91 
92  union bit128 xmm5, xmm4;
93  __m128 xmm1, xmm2, xmm3;
94  __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
95 
96  xmm5.int_vec = xmmfive = _mm_setzero_si128();
97  xmm4.int_vec = xmmfour = _mm_setzero_si128();
98  holderf.int_vec = holder0 = _mm_setzero_si128();
99  holderi.int_vec = holder1 = _mm_setzero_si128();
100 
101  int bound = num_bytes >> 5;
102  int leftovers0 = (num_bytes >> 4) & 1;
103  int leftovers1 = (num_bytes >> 3) & 1;
104  int i = 0;
105 
106  xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
107  xmm9 = xmm8 = _mm_setzero_si128();
108  xmm10 = _mm_set_epi32(4, 4, 4, 4);
109  xmm3 = _mm_setzero_ps();
110 
111  //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
112 
113  for(; i < bound; ++i) {
114  xmm1 = _mm_load_ps((float*)src0);
115  xmm2 = _mm_load_ps((float*)&src0[2]);
116 
117  src0 += 4;
118 
119  xmm1 = _mm_mul_ps(xmm1, xmm1);
120  xmm2 = _mm_mul_ps(xmm2, xmm2);
121 
122  xmm1 = _mm_hadd_ps(xmm1, xmm2);
123 
124  xmm3 = _mm_max_ps(xmm1, xmm3);
125 
126  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
127  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
128 
129  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
130  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
131 
132  xmm9 = _mm_add_epi32(xmm11, xmm12);
133 
134  xmm8 = _mm_add_epi32(xmm8, xmm10);
135 
136  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
137  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
138  }
139 
140 
141  for(i = 0; i < leftovers0; ++i) {
142  xmm2 = _mm_load_ps((float*)src0);
143 
144  xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
145  xmm8 = bit128_p(&xmm1)->int_vec;
146 
147  xmm2 = _mm_mul_ps(xmm2, xmm2);
148 
149  src0 += 2;
150 
151  xmm1 = _mm_hadd_ps(xmm2, xmm2);
152 
153  xmm3 = _mm_max_ps(xmm1, xmm3);
154 
155  xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
156 
157  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
158  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
159 
160  xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
161  xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
162 
163  xmm9 = _mm_add_epi32(xmm11, xmm12);
164 
165  xmm8 = _mm_add_epi32(xmm8, xmm10);
166  //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
167  }
168 
169  for(i = 0; i < leftovers1; ++i) {
170  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
171 
172  sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
173 
174  xmm2 = _mm_load1_ps(&sq_dist);
175 
176  xmm1 = xmm3;
177 
178  xmm3 = _mm_max_ss(xmm3, xmm2);
179 
180  xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
181  xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
182 
183  xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
184 
185  xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
186  xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
187 
188  xmm9 = _mm_add_epi32(xmm11, xmm12);
189  }
190 
191  //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
192  //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
193 
194  _mm_store_ps((float*)&(holderf.f), xmm3);
195  _mm_store_si128(&(holderi.int_vec), xmm9);
196 
197  target[0] = holderi.i[0];
198  sq_dist = holderf.f[0];
199  target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
200  sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
201  target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
202  sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
203  target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
204  sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
205 
206  /*
207  float placeholder = 0.0;
208  uint32_t temp0, temp1;
209  unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
210  unsigned int l0 = g0 ^ 1;
211 
212  unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
213  unsigned int l1 = g1 ^ 1;
214 
215  temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
216  temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
217  sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
218  placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
219 
220  g0 = (sq_dist > placeholder);
221  l0 = g0 ^ 1;
222  target[0] = g0 * temp0 + l0 * temp1;
223  */
224 }
225 
226 #endif /*LV_HAVE_SSE3*/
227 
228 #ifdef LV_HAVE_GENERIC
229 static inline void
230  volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0,
231  unsigned int num_points)
232 {
233  const unsigned int num_bytes = num_points*8;
234 
235  float sq_dist = 0.0;
236  float max = 0.0;
237  unsigned int index = 0;
238 
239  unsigned int i = 0;
240 
241  for(; i < num_bytes >> 3; ++i) {
242  sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
243 
244  index = sq_dist > max ? i : index;
245  max = sq_dist > max ? sq_dist : max;
246  }
247  target[0] = index;
248 }
249 
250 #endif /*LV_HAVE_GENERIC*/
251 
252 
253 #endif /*INCLUDED_volk_32fc_index_max_16u_a_H*/
#define bit128_p(x)
Definition: volk_common.h:94
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
Definition: volk_common.h:78
#define lv_cimag(x)
Definition: volk_complex.h:78
uint32_t i[4]
Definition: volk_common.h:80