GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
volk_32fc_x2_multiply_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

/*!
 * \page volk_32fc_x2_multiply_32fc
 *
 * \b Overview
 *
 * Multiplies two complex vectors element-wise and returns the complex result,
 * i.e. cVector[i] = aVector[i] * bVector[i].
 *
 * <b>Dispatcher Prototype</b>
 * \code
 * void volk_32fc_x2_multiply_32fc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
 * \endcode
 *
 * \b Inputs
 * \li aVector: The first input vector of complex floats.
 * \li bVector: The second input vector of complex floats.
 * \li num_points: The number of data points.
 *
 * \b Outputs
 * \li cVector: The output vector of complex floats.
 *
 * \b Example
 * Mix two complex tones with discrete frequencies of 0.3 and 0.1 rad/sample.
 * \code
 * unsigned int N = 10;
 * unsigned int alignment = volk_get_alignment();
 * lv_32fc_t* sig_1 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 * lv_32fc_t* sig_2 = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
 *
 * for(unsigned int ii = 0; ii < N; ++ii){
 *     // Generate two tones
 *     float real_1 = std::cos(0.3f * (float)ii);
 *     float imag_1 = std::sin(0.3f * (float)ii);
 *     sig_1[ii] = lv_cmake(real_1, imag_1);
 *     float real_2 = std::cos(0.1f * (float)ii);
 *     float imag_2 = std::sin(0.1f * (float)ii);
 *     sig_2[ii] = lv_cmake(real_2, imag_2);
 * }
 *
 * volk_32fc_x2_multiply_32fc(out, sig_1, sig_2, N);
 *
 * volk_free(sig_1);
 * volk_free(sig_2);
 * volk_free(out);
 * \endcode
 */
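
/*
 * Notes on the kernels below (general VOLK conventions; the scalar sketch
 * uses placeholder element values a and b purely for illustration):
 *
 * Every proto-kernel in this file computes the element-wise complex product
 *   (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br)
 * where lv_32fc_t is VOLK's float-complex type from volk_complex.h. In scalar
 * form, one output element is equivalent to
 *   lv_32fc_t prod = lv_cmake(lv_creal(a) * lv_creal(b) - lv_cimag(a) * lv_cimag(b),
 *                             lv_creal(a) * lv_cimag(b) + lv_cimag(a) * lv_creal(b));
 *
 * The *_u_* kernels in the first section tolerate unaligned buffers, while the
 * *_a_* kernels in the second section expect buffers aligned as returned by
 * volk_malloc()/volk_get_alignment(); the public dispatcher selects an
 * appropriate proto-kernel for the running machine.
 */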

#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#include <float.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < quarterPoints; number++){

    x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...

    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...

    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...

    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm256_storeu_ps((float*)c,z); // Store the results back into the C container

    a += 4;
    b += 4;
    c += 4;
  }

  number = quarterPoints * 4;

  for(; number < num_points; number++) {
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX */
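
/*
 * Note on the AVX/SSE3 pattern above (and repeated below): moveldup/movehdup
 * duplicate the real parts (yl) and imaginary parts (yh) of the second
 * operand, the 0xB1 shuffle swaps the real/imaginary halves of the first
 * operand, and addsub subtracts in the even (real) lanes while adding in the
 * odd (imaginary) lanes. The even/odd lanes therefore end up holding exactly
 * ar*br - ai*bi and ar*bi + ai*br, the real and imaginary parts of the
 * complex product. The SSE3 variants apply the same sequence to 128-bit
 * registers, processing two complex samples per iteration instead of four.
 */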


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                  const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  __m128 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < halfPoints; number++){

    x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm_storeu_ps((float*)c,z); // Store the results back into the C container

    a += 2;
    b += 2;
    c += 2;
  }

  if((num_points % 2) != 0) {
    *c = (*a) * (*b);
  }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                   const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */


#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>
#include <float.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < quarterPoints; number++){

    x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
    y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...

    yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...

    tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...

    x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...

    tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm256_store_ps((float*)c,z); // Store the results back into the C container

    a += 4;
    b += 4;
    c += 4;
  }

  number = quarterPoints * 4;

  for(; number < num_points; number++) {
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

static inline void
volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                  const lv_32fc_t* bVector, unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  __m128 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < halfPoints; number++){

    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm_store_ps((float*)c,z); // Store the results back into the C container

    a += 2;
    b += 2;
    c += 2;
  }

  if((num_points % 2) != 0) {
    *c = (*a) * (*b);
  }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                     const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
  unsigned int quarter_points = num_points / 4;
  float32x4x2_t a_val, b_val, c_val;
  float32x4x2_t tmp_real, tmp_imag;
  unsigned int number = 0;

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    __builtin_prefetch(a_ptr+4);
    __builtin_prefetch(b_ptr+4);

    // multiply the real*real and imag*imag to get real result
    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

    // Multiply cross terms to get the imaginary result
    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

    // store the results
    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    vst2q_f32((float*)cVector, c_val);

    a_ptr += 4;
    b_ptr += 4;
    cVector += 4;
  }

  for(number = quarter_points*4; number < num_points; number++){
    *cVector++ = (*a_ptr++) * (*b_ptr++);
  }
}
#endif /* LV_HAVE_NEON */
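
/*
 * Note on the NEON kernels: vld2q_f32() de-interleaves four complex samples
 * into a float32x4x2_t whose val[0] holds the real parts and val[1] the
 * imaginary parts. The product then follows the same identity as above:
 * real = ar*br - ai*bi (two multiplies and a subtract), imag = ar*bi + ai*br
 * (two multiplies and an add), and vst2q_f32() re-interleaves the result.
 * Any remainder beyond a multiple of four points is handled by the scalar
 * tail loop.
 */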


#ifdef LV_HAVE_NEON

static inline void
volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                         const lv_32fc_t* bVector, unsigned int num_points)
{
  lv_32fc_t *a_ptr = (lv_32fc_t*) aVector;
  lv_32fc_t *b_ptr = (lv_32fc_t*) bVector;
  unsigned int quarter_points = num_points / 4;
  float32x4x2_t a_val, b_val;
  float32x4x2_t tmp_imag;
  unsigned int number = 0;

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
    b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
    __builtin_prefetch(a_ptr+4);
    __builtin_prefetch(b_ptr+4);

    // do the first multiply
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);

    // use multiply accumulate/subtract to get result
    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);

    // store
    vst2q_f32((float*)cVector, tmp_imag);
    // increment pointers
    a_ptr += 4;
    b_ptr += 4;
    cVector += 4;
  }

  for(number = quarter_points*4; number < num_points; number++){
    *cVector++ = (*a_ptr++) * (*b_ptr++);
  }
}
#endif /* LV_HAVE_NEON */
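
/*
 * The _neon_opttests variant computes the same product with fewer vector
 * instructions: one multiply per output half plus vmlaq_f32/vmlsq_f32
 * multiply-accumulate/multiply-subtract, i.e.
 *   imag = ai*br + ar*bi  and  real = ar*br - ai*bi.
 * The "opttests" suffix marks it as an alternative proto-kernel; which NEON
 * variant actually wins is presumably decided by benchmarking (e.g. with
 * volk_profile) rather than fixed here.
 */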


#ifdef LV_HAVE_NEON

extern void
volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                   const lv_32fc_t* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_ORC

extern void
volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                      const lv_32fc_t* bVector, unsigned int num_points);

static inline void
volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector,
                                 const lv_32fc_t* bVector, unsigned int num_points)
{
  volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}

#endif /* LV_HAVE_ORC */
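
/*
 * The neonasm and ORC proto-kernels above are only declared here: the NEON
 * assembly body and the Orc-generated volk_32fc_x2_multiply_32fc_a_orc_impl()
 * are defined in separate sources built when the corresponding LV_HAVE_*
 * features are enabled. The unaligned _u_orc wrapper simply forwards to the
 * aligned Orc implementation, presumably because the generated code does not
 * depend on pointer alignment.
 */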

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */