63 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
64 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_a_H
69 #ifdef LV_HAVE_GENERIC
/*
 * Generic (portable C) kernel: accumulates the dot product of a complex input
 * vector with a real-valued tap vector.
 *
 * result     - destination for the complex sum
 * input      - num_points complex samples, read below as pairs of floats
 * taps       - num_points real coefficients, one per complex sample
 * num_points - number of complex samples / taps to process
 *
 * NOTE(review): this listing omits some source lines. The declaration of
 * `res`, any zeroing of the accumulators, the store into *result and the
 * closing braces are not visible here -- confirm against the full header.
 */
71 static inline void volk_32fc_32f_dot_prod_32fc_generic(
lv_32fc_t* result,
const lv_32fc_t* input,
const float *
taps,
unsigned int num_points) {
/* The two accumulators alias the first and second float of `res`. */
74 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
75 const float* aPtr = (
float*)input;
76 const float* bPtr=
taps;
77 unsigned int number = 0;
/* One iteration per complex sample: two input floats share a single tap. */
82 for(number = 0; number < num_points; number++){
/* First float of the sample times the tap; the tap pointer stays put. */
83 *realpt += ((*aPtr++) * (*bPtr));
/* Second float times the same tap, then advance to the next tap. */
84 *imagpt += ((*aPtr++) * (*bPtr++));
95 #include <immintrin.h>
/*
 * AVX kernel (aligned loads): dot product of a complex input vector with a
 * real-valued tap vector. The vector loop handles 16 complex samples per
 * iteration; a scalar loop handles the remaining num_points % 16 samples.
 *
 * result     - destination for the complex sum
 * input      - complex samples, read as interleaved floats; loaded with
 *              _mm256_load_ps, which requires 32-byte alignment
 * taps       - real coefficients, one per complex sample; also loaded with
 *              _mm256_load_ps
 * num_points - number of complex samples / taps to process
 *
 * NOTE(review): this listing omits some source lines. The declarations of
 * `res` and `dotProductVector`, the statements that advance aPtr/bPtr inside
 * the vector loop, the store into *result and the closing braces are not
 * visible here -- confirm against the full header.
 */
97 static inline void volk_32fc_32f_dot_prod_32fc_a_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
99 unsigned int number = 0;
/* Number of full 16-sample groups the vector loop will process. */
100 const unsigned int sixteenthPoints = num_points / 16;
/* The two scalar accumulators alias the first and second float of `res`. */
103 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
104 const float* aPtr = (
float*)input;
105 const float* bPtr =
taps;
107 __m256 a0Val, a1Val, a2Val, a3Val;
108 __m256 b0Val, b1Val, b2Val, b3Val;
109 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
110 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum registers, merged after the loop. */
112 __m256 dotProdVal0 = _mm256_setzero_ps();
113 __m256 dotProdVal1 = _mm256_setzero_ps();
114 __m256 dotProdVal2 = _mm256_setzero_ps();
115 __m256 dotProdVal3 = _mm256_setzero_ps();
117 for(;number < sixteenthPoints; number++){
/* 32 input floats = 16 complex samples. */
119 a0Val = _mm256_load_ps(aPtr);
120 a1Val = _mm256_load_ps(aPtr+8);
121 a2Val = _mm256_load_ps(aPtr+16);
122 a3Val = _mm256_load_ps(aPtr+24);
/* 16 taps, one for each of those samples. */
124 x0Val = _mm256_load_ps(bPtr);
125 x1Val = _mm256_load_ps(bPtr+8);
/* Unpacking a register with itself duplicates each tap (t,t), but only
   within each 128-bit half of the register. */
126 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
127 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
128 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
129 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* Recombine the 128-bit halves (0x20 = both low halves, 0x31 = both high
   halves) so the duplicated taps are back in sequential order, lining up
   with the two floats of each complex sample. */
132 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
133 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
134 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
135 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products: each input float times its sample's tap. */
137 c0Val = _mm256_mul_ps(a0Val, b0Val);
138 c1Val = _mm256_mul_ps(a1Val, b1Val);
139 c2Val = _mm256_mul_ps(a2Val, b2Val);
140 c3Val = _mm256_mul_ps(a3Val, b3Val);
142 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
143 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
144 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
145 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Merge the four partial sums into dotProdVal0. */
151 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
152 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
153 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store: dotProductVector must be 32-byte aligned. */
157 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal reduction: even lanes go to *realpt, odd lanes to *imagpt.
   The first pair assigns rather than adds, so it initialises both sums. */
159 *realpt = dotProductVector[0];
160 *imagpt = dotProductVector[1];
161 *realpt += dotProductVector[2];
162 *imagpt += dotProductVector[3];
163 *realpt += dotProductVector[4];
164 *imagpt += dotProductVector[5];
165 *realpt += dotProductVector[6];
166 *imagpt += dotProductVector[7];
/* Scalar tail for the samples left over after the 16-sample groups. */
168 number = sixteenthPoints*16;
169 for(;number < num_points; number++){
170 *realpt += ((*aPtr++) * (*bPtr));
171 *imagpt += ((*aPtr++) * (*bPtr++));
/*
 * SSE kernel (aligned loads): dot product of a complex input vector with a
 * real-valued tap vector. The vector loop handles 8 complex samples per
 * iteration; a scalar loop handles the remaining num_points % 8 samples.
 *
 * result     - destination for the complex sum
 * input      - complex samples, read as interleaved floats; loaded with
 *              _mm_load_ps, which requires 16-byte alignment
 * taps       - real coefficients, one per complex sample; also loaded with
 *              _mm_load_ps
 * num_points - number of complex samples / taps to process
 *
 * NOTE(review): this listing omits some source lines. The declarations of
 * `res` and `dotProductVector`, the statements that advance aPtr/bPtr inside
 * the vector loop, the store into *result and the closing braces are not
 * visible here -- confirm against the full header.
 */
185 static inline void volk_32fc_32f_dot_prod_32fc_a_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
187 unsigned int number = 0;
/* Despite the name, this is the number of 8-sample groups (num_points / 8). */
188 const unsigned int sixteenthPoints = num_points / 8;
/* The two scalar accumulators alias the first and second float of `res`. */
191 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
192 const float* aPtr = (
float*)input;
193 const float* bPtr =
taps;
195 __m128 a0Val, a1Val, a2Val, a3Val;
196 __m128 b0Val, b1Val, b2Val, b3Val;
197 __m128 x0Val, x1Val, x2Val, x3Val;
198 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum registers, merged after the loop. */
200 __m128 dotProdVal0 = _mm_setzero_ps();
201 __m128 dotProdVal1 = _mm_setzero_ps();
202 __m128 dotProdVal2 = _mm_setzero_ps();
203 __m128 dotProdVal3 = _mm_setzero_ps();
205 for(;number < sixteenthPoints; number++){
/* 16 input floats = 8 complex samples. */
207 a0Val = _mm_load_ps(aPtr);
208 a1Val = _mm_load_ps(aPtr+4);
209 a2Val = _mm_load_ps(aPtr+8);
210 a3Val = _mm_load_ps(aPtr+12);
/* Each group of four taps is loaded twice on purpose: interleaving two
   identical registers below yields every tap duplicated (t0,t0,t1,t1,...). */
212 x0Val = _mm_load_ps(bPtr);
213 x1Val = _mm_load_ps(bPtr);
214 x2Val = _mm_load_ps(bPtr+4);
215 x3Val = _mm_load_ps(bPtr+4);
/* unpacklo gives taps 0-1 doubled, unpackhi gives taps 2-3 doubled. */
216 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
217 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
218 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
219 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
/* Element-wise products: each input float times its sample's tap. */
221 c0Val = _mm_mul_ps(a0Val, b0Val);
222 c1Val = _mm_mul_ps(a1Val, b1Val);
223 c2Val = _mm_mul_ps(a2Val, b2Val);
224 c3Val = _mm_mul_ps(a3Val, b3Val);
226 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
227 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
228 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
229 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Merge the four partial sums into dotProdVal0. */
235 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
236 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
237 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* Aligned store: dotProductVector must be 16-byte aligned. */
241 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal reduction: even lanes go to *realpt, odd lanes to *imagpt.
   The first pair assigns rather than adds, so it initialises both sums. */
243 *realpt = dotProductVector[0];
244 *imagpt = dotProductVector[1];
245 *realpt += dotProductVector[2];
246 *imagpt += dotProductVector[3];
/* Scalar tail for the samples left over after the 8-sample groups. */
248 number = sixteenthPoints*8;
249 for(;number < num_points; number++){
250 *realpt += ((*aPtr++) * (*bPtr));
251 *imagpt += ((*aPtr++) * (*bPtr++));
263 #include <immintrin.h>
/*
 * AVX kernel (unaligned loads): dot product of a complex input vector with a
 * real-valued tap vector. The vector loop handles 16 complex samples per
 * iteration; a scalar loop handles the remaining num_points % 16 samples.
 *
 * result     - destination for the complex sum
 * input      - complex samples, read as interleaved floats; loaded with
 *              _mm256_loadu_ps, so no alignment is required
 * taps       - real coefficients, one per complex sample; also loaded with
 *              _mm256_loadu_ps
 * num_points - number of complex samples / taps to process
 *
 * NOTE(review): this listing omits some source lines. The declarations of
 * `res` and `dotProductVector`, the statements that advance aPtr/bPtr inside
 * the vector loop, the store into *result and the closing braces are not
 * visible here -- confirm against the full header.
 */
265 static inline void volk_32fc_32f_dot_prod_32fc_u_avx(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
267 unsigned int number = 0;
/* Number of full 16-sample groups the vector loop will process. */
268 const unsigned int sixteenthPoints = num_points / 16;
/* The two scalar accumulators alias the first and second float of `res`. */
271 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
272 const float* aPtr = (
float*)input;
273 const float* bPtr =
taps;
275 __m256 a0Val, a1Val, a2Val, a3Val;
276 __m256 b0Val, b1Val, b2Val, b3Val;
277 __m256 x0Val, x1Val, x0loVal, x0hiVal, x1loVal, x1hiVal;
278 __m256 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum registers, merged after the loop. */
280 __m256 dotProdVal0 = _mm256_setzero_ps();
281 __m256 dotProdVal1 = _mm256_setzero_ps();
282 __m256 dotProdVal2 = _mm256_setzero_ps();
283 __m256 dotProdVal3 = _mm256_setzero_ps();
285 for(;number < sixteenthPoints; number++){
/* 32 input floats = 16 complex samples. */
287 a0Val = _mm256_loadu_ps(aPtr);
288 a1Val = _mm256_loadu_ps(aPtr+8);
289 a2Val = _mm256_loadu_ps(aPtr+16);
290 a3Val = _mm256_loadu_ps(aPtr+24);
/* 16 taps, one for each of those samples. */
292 x0Val = _mm256_loadu_ps(bPtr);
293 x1Val = _mm256_loadu_ps(bPtr+8);
/* Unpacking a register with itself duplicates each tap (t,t), but only
   within each 128-bit half of the register. */
294 x0loVal = _mm256_unpacklo_ps(x0Val, x0Val);
295 x0hiVal = _mm256_unpackhi_ps(x0Val, x0Val);
296 x1loVal = _mm256_unpacklo_ps(x1Val, x1Val);
297 x1hiVal = _mm256_unpackhi_ps(x1Val, x1Val);
/* Recombine the 128-bit halves (0x20 = both low halves, 0x31 = both high
   halves) so the duplicated taps are back in sequential order, lining up
   with the two floats of each complex sample. */
300 b0Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x20);
301 b1Val = _mm256_permute2f128_ps(x0loVal, x0hiVal, 0x31);
302 b2Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x20);
303 b3Val = _mm256_permute2f128_ps(x1loVal, x1hiVal, 0x31);
/* Element-wise products: each input float times its sample's tap. */
305 c0Val = _mm256_mul_ps(a0Val, b0Val);
306 c1Val = _mm256_mul_ps(a1Val, b1Val);
307 c2Val = _mm256_mul_ps(a2Val, b2Val);
308 c3Val = _mm256_mul_ps(a3Val, b3Val);
310 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
311 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
312 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
313 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
/* Merge the four partial sums into dotProdVal0. */
319 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
320 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
321 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): this is an aligned store even in the unaligned kernel, so
   dotProductVector must be 32-byte aligned. Its declaration is not visible
   in this listing -- confirm it carries an alignment attribute. */
325 _mm256_store_ps(dotProductVector,dotProdVal0);
/* Horizontal reduction: even lanes go to *realpt, odd lanes to *imagpt.
   The first pair assigns rather than adds, so it initialises both sums. */
327 *realpt = dotProductVector[0];
328 *imagpt = dotProductVector[1];
329 *realpt += dotProductVector[2];
330 *imagpt += dotProductVector[3];
331 *realpt += dotProductVector[4];
332 *imagpt += dotProductVector[5];
333 *realpt += dotProductVector[6];
334 *imagpt += dotProductVector[7];
/* Scalar tail for the samples left over after the 16-sample groups. */
336 number = sixteenthPoints*16;
337 for(;number < num_points; number++){
338 *realpt += ((*aPtr++) * (*bPtr));
339 *imagpt += ((*aPtr++) * (*bPtr++));
347 #include <arm_neon.h>
/*
 * NEON kernel, unrolled by two: dot product of a complex input vector with a
 * real-valued tap vector. The vector loop handles 8 complex samples per
 * iteration (two groups of four); a scalar loop handles the remaining
 * num_points % 8 samples.
 *
 * result     - destination for the complex sum
 * input      - complex samples, read as interleaved floats
 * taps       - real coefficients, one per complex sample
 * num_points - number of complex samples / taps to process
 *
 * All three pointers are declared __restrict.
 *
 * NOTE(review): this listing omits some source lines. The declarations of
 * `number` and `res`, the statements that advance inputPtr/tapsPtr inside the
 * vector loop, the store into *result and the closing braces are not visible
 * here -- confirm against the full header.
 */
349 static inline void volk_32fc_32f_dot_prod_32fc_neon_unroll (
lv_32fc_t* __restrict result,
const lv_32fc_t* __restrict input,
const float* __restrict taps,
unsigned int num_points) {
/* Despite the name, this is the number of 8-sample groups (num_points / 8). */
352 const unsigned int quarterPoints = num_points / 8;
/* The two scalar accumulators alias the first and second float of `res`. */
355 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
356 const float* inputPtr = (
float*)input;
357 const float* tapsPtr =
taps;
/* Source of zeros used to clear the vector accumulators below. */
358 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch arrays the vector accumulators are spilled into for the final sum. */
359 float accVector_real[4];
360 float accVector_imag[4];
362 float32x4x2_t inputVector0, inputVector1;
363 float32x4_t tapsVector0, tapsVector1;
364 float32x4_t tmp_real0, tmp_imag0;
365 float32x4_t tmp_real1, tmp_imag1;
366 float32x4_t real_accumulator0, imag_accumulator0;
367 float32x4_t real_accumulator1, imag_accumulator1;
/* Clear all four accumulators. */
371 real_accumulator0 = vld1q_f32( zero );
372 imag_accumulator0 = vld1q_f32( zero );
373 real_accumulator1 = vld1q_f32( zero );
374 imag_accumulator1 = vld1q_f32( zero );
376 for(number=0 ;number < quarterPoints; number++){
/* Eight taps, four per unrolled half. */
378 tapsVector0 = vld1q_f32(tapsPtr );
379 tapsVector1 = vld1q_f32(tapsPtr+4 );
/* vld2q_f32 de-interleaves 8 floats: val[0] receives the even-indexed
   floats and val[1] the odd-indexed ones (four complex samples per load). */
382 inputVector0 = vld2q_f32(inputPtr );
383 inputVector1 = vld2q_f32(inputPtr+8 );
/* Multiply both halves of each sample by its tap. */
386 tmp_real0 = vmulq_f32(tapsVector0, inputVector0.val[0]);
387 tmp_imag0 = vmulq_f32(tapsVector0, inputVector0.val[1]);
389 tmp_real1 = vmulq_f32(tapsVector1, inputVector1.val[0]);
390 tmp_imag1 = vmulq_f32(tapsVector1, inputVector1.val[1]);
392 real_accumulator0 = vaddq_f32(real_accumulator0, tmp_real0);
393 imag_accumulator0 = vaddq_f32(imag_accumulator0, tmp_imag0);
395 real_accumulator1 = vaddq_f32(real_accumulator1, tmp_real1);
396 imag_accumulator1 = vaddq_f32(imag_accumulator1, tmp_imag1);
/* Merge the two unrolled accumulator pairs. */
402 real_accumulator0 = vaddq_f32( real_accumulator0, real_accumulator1);
403 imag_accumulator0 = vaddq_f32( imag_accumulator0, imag_accumulator1);
/* Spill to memory and add the four lanes of each accumulator; these
   assignments initialise *realpt and *imagpt. */
406 vst1q_f32(accVector_real, real_accumulator0);
407 vst1q_f32(accVector_imag, imag_accumulator0);
408 *realpt = accVector_real[0] + accVector_real[1] +
409 accVector_real[2] + accVector_real[3] ;
411 *imagpt = accVector_imag[0] + accVector_imag[1] +
412 accVector_imag[2] + accVector_imag[3] ;
/* Scalar tail for the samples left over after the 8-sample groups. */
415 for(number=quarterPoints*8; number < num_points; number++){
416 *realpt += ((*inputPtr++) * (*tapsPtr));
417 *imagpt += ((*inputPtr++) * (*tapsPtr++));
426 #include <arm_neon.h>
/*
 * NEON kernel: dot product of a complex input vector with a real-valued tap
 * vector. The vector loop handles 4 complex samples per iteration; a scalar
 * loop handles the remaining num_points % 4 samples.
 *
 * result     - destination for the complex sum
 * input      - complex samples, read as interleaved floats
 * taps       - real coefficients, one per complex sample
 * num_points - number of complex samples / taps to process
 *
 * All three pointers are declared __restrict.
 *
 * NOTE(review): this listing omits some source lines. The declarations of
 * `number` and `res`, the statements that advance inputPtr/tapsPtr inside the
 * vector loop, the store into *result and the closing braces are not visible
 * here -- confirm against the full header.
 */
428 static inline void volk_32fc_32f_dot_prod_32fc_a_neon (
lv_32fc_t* __restrict result,
const lv_32fc_t* __restrict input,
const float* __restrict taps,
unsigned int num_points) {
/* Number of full 4-sample groups the vector loop will process. */
431 const unsigned int quarterPoints = num_points / 4;
/* The two scalar accumulators alias the first and second float of `res`. */
434 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
435 const float* inputPtr = (
float*)input;
436 const float* tapsPtr =
taps;
/* Source of zeros used to clear the vector accumulators below. */
437 float zero[4] = {0.0f, 0.0f, 0.0f, 0.0f };
/* Scratch arrays the vector accumulators are spilled into for the final sum. */
438 float accVector_real[4];
439 float accVector_imag[4];
441 float32x4x2_t inputVector;
442 float32x4_t tapsVector;
443 float32x4_t tmp_real, tmp_imag;
444 float32x4_t real_accumulator, imag_accumulator;
/* Clear both accumulators. */
449 real_accumulator = vld1q_f32( zero );
450 imag_accumulator = vld1q_f32( zero );
452 for(number=0 ;number < quarterPoints; number++){
/* Four taps for this group. */
455 tapsVector = vld1q_f32(tapsPtr );
/* vld2q_f32 de-interleaves 8 floats: val[0] receives the even-indexed
   floats and val[1] the odd-indexed ones (four complex samples). */
458 inputVector = vld2q_f32(inputPtr );
/* Multiply both halves of each sample by its tap. */
460 tmp_real = vmulq_f32(tapsVector, inputVector.val[0]);
461 tmp_imag = vmulq_f32(tapsVector, inputVector.val[1]);
463 real_accumulator = vaddq_f32(real_accumulator, tmp_real);
464 imag_accumulator = vaddq_f32(imag_accumulator, tmp_imag);
/* Spill to memory and add the four lanes of each accumulator; these
   assignments initialise *realpt and *imagpt. */
473 vst1q_f32(accVector_real, real_accumulator);
474 vst1q_f32(accVector_imag, imag_accumulator);
475 *realpt = accVector_real[0] + accVector_real[1] +
476 accVector_real[2] + accVector_real[3] ;
478 *imagpt = accVector_imag[0] + accVector_imag[1] +
479 accVector_imag[2] + accVector_imag[3] ;
/* Scalar tail for the samples left over after the 4-sample groups. */
482 for(number=quarterPoints*4; number < num_points; number++){
483 *realpt += ((*inputPtr++) * (*tapsPtr));
484 *imagpt += ((*inputPtr++) * (*tapsPtr++));
/*
 * Prototype only: this kernel is declared extern and has no body in this
 * part of the file. It takes the same four parameters as the inline kernels
 * (result, input, taps, num_points).
 * NOTE(review): presumably implemented in a separate NEON assembly source --
 * confirm where it is defined.
 */
493 extern void volk_32fc_32f_dot_prod_32fc_a_neonasm (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
/*
 * Prototype only: this kernel is declared extern and has no body in this
 * part of the file. It takes the same four parameters as the inline kernels
 * (result, input, taps, num_points).
 * NOTE(review): presumably a software-pipelined NEON implementation defined
 * in a separate source -- confirm where it is defined.
 */
497 extern void volk_32fc_32f_dot_prod_32fc_a_neonpipeline (
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points);
/*
 * SSE kernel (unaligned loads): dot product of a complex input vector with a
 * real-valued tap vector. The vector loop handles 8 complex samples per
 * iteration; a scalar loop handles the remaining num_points % 8 samples.
 *
 * result     - destination for the complex sum
 * input      - complex samples, read as interleaved floats; loaded with
 *              _mm_loadu_ps, so no alignment is required
 * taps       - real coefficients, one per complex sample; also loaded with
 *              _mm_loadu_ps
 * num_points - number of complex samples / taps to process
 *
 * NOTE(review): this listing omits some source lines. The declarations of
 * `res` and `dotProductVector`, the statements that advance aPtr/bPtr inside
 * the vector loop, the store into *result and the closing braces are not
 * visible here -- confirm against the full header.
 */
502 static inline void volk_32fc_32f_dot_prod_32fc_u_sse(
lv_32fc_t* result,
const lv_32fc_t* input,
const float* taps,
unsigned int num_points) {
504 unsigned int number = 0;
/* Despite the name, this is the number of 8-sample groups (num_points / 8). */
505 const unsigned int sixteenthPoints = num_points / 8;
/* The two scalar accumulators alias the first and second float of `res`. */
508 float *realpt = &res[0], *imagpt = &res[1];
/* Walk the complex input as a flat float array: two floats per sample. */
509 const float* aPtr = (
float*)input;
510 const float* bPtr =
taps;
512 __m128 a0Val, a1Val, a2Val, a3Val;
513 __m128 b0Val, b1Val, b2Val, b3Val;
514 __m128 x0Val, x1Val, x2Val, x3Val;
515 __m128 c0Val, c1Val, c2Val, c3Val;
/* Four independent partial-sum registers, merged after the loop. */
517 __m128 dotProdVal0 = _mm_setzero_ps();
518 __m128 dotProdVal1 = _mm_setzero_ps();
519 __m128 dotProdVal2 = _mm_setzero_ps();
520 __m128 dotProdVal3 = _mm_setzero_ps();
522 for(;number < sixteenthPoints; number++){
/* 16 input floats = 8 complex samples. */
524 a0Val = _mm_loadu_ps(aPtr);
525 a1Val = _mm_loadu_ps(aPtr+4);
526 a2Val = _mm_loadu_ps(aPtr+8);
527 a3Val = _mm_loadu_ps(aPtr+12);
/* Each group of four taps is loaded twice on purpose: interleaving two
   identical registers below yields every tap duplicated (t0,t0,t1,t1,...). */
529 x0Val = _mm_loadu_ps(bPtr);
530 x1Val = _mm_loadu_ps(bPtr);
531 x2Val = _mm_loadu_ps(bPtr+4);
532 x3Val = _mm_loadu_ps(bPtr+4);
/* unpacklo gives taps 0-1 doubled, unpackhi gives taps 2-3 doubled. */
533 b0Val = _mm_unpacklo_ps(x0Val, x1Val);
534 b1Val = _mm_unpackhi_ps(x0Val, x1Val);
535 b2Val = _mm_unpacklo_ps(x2Val, x3Val);
536 b3Val = _mm_unpackhi_ps(x2Val, x3Val);
/* Element-wise products: each input float times its sample's tap. */
538 c0Val = _mm_mul_ps(a0Val, b0Val);
539 c1Val = _mm_mul_ps(a1Val, b1Val);
540 c2Val = _mm_mul_ps(a2Val, b2Val);
541 c3Val = _mm_mul_ps(a3Val, b3Val);
543 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
544 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
545 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
546 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
/* Merge the four partial sums into dotProdVal0. */
552 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
553 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
554 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
/* NOTE(review): this is an aligned store even in the unaligned kernel, so
   dotProductVector must be 16-byte aligned. Its declaration is not visible
   in this listing -- confirm it carries an alignment attribute. */
558 _mm_store_ps(dotProductVector,dotProdVal0);
/* Horizontal reduction: even lanes go to *realpt, odd lanes to *imagpt.
   The first pair assigns rather than adds, so it initialises both sums. */
560 *realpt = dotProductVector[0];
561 *imagpt = dotProductVector[1];
562 *realpt += dotProductVector[2];
563 *imagpt += dotProductVector[3];
/* Scalar tail for the samples left over after the 8-sample groups. */
565 number = sixteenthPoints*8;
566 for(;number < num_points; number++){
567 *realpt += ((*aPtr++) * (*bPtr));
568 *imagpt += ((*aPtr++) * (*bPtr++));
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56