73 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_u_H
74 #define INCLUDED_volk_32f_x2_dot_prod_32f_u_H
80 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar dot product of two float vectors.
 *
 * \param result     Receives the sum of input[i] * taps[i] over all points
 *                   (0 when num_points is 0).
 * \param input      First operand, num_points floats.
 * \param taps       Second operand, num_points floats.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_generic(float* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using SSE with unaligned loads.
 *
 * Sixteen points are consumed per iteration through four independent
 * 4-lane accumulators; the remaining (num_points % 16) points are handled
 * by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats (loaded with
 *                   _mm_loadu_ps, so no alignment is required).
 * \param taps       Second operand, num_points floats (no alignment required).
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[4];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        a0Val = _mm_loadu_ps(aPtr);
        a1Val = _mm_loadu_ps(aPtr + 4);
        a2Val = _mm_loadu_ps(aPtr + 8);
        a3Val = _mm_loadu_ps(aPtr + 12);
        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the four accumulators into a single vector. */
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* The scratch array is a plain local with no alignment guarantee, so
     * spill the lanes with the unaligned store before summing them. */
    _mm_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
172 #include <pmmintrin.h>
/*!
 * \brief Dot product of two float vectors (SSE3 build variant, unaligned loads).
 *
 * Sixteen points are consumed per iteration through four independent
 * 4-lane accumulators; the remaining (num_points % 16) points are handled
 * by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats (no alignment required).
 * \param taps       Second operand, num_points floats (no alignment required).
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse3(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[4];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        a0Val = _mm_loadu_ps(aPtr);
        a1Val = _mm_loadu_ps(aPtr + 4);
        a2Val = _mm_loadu_ps(aPtr + 8);
        a3Val = _mm_loadu_ps(aPtr + 12);
        b0Val = _mm_loadu_ps(bPtr);
        b1Val = _mm_loadu_ps(bPtr + 4);
        b2Val = _mm_loadu_ps(bPtr + 8);
        b3Val = _mm_loadu_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
        dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
        dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
        dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the four accumulators into a single vector. */
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* The scratch array is a plain local with no alignment guarantee, so
     * spill the lanes with the unaligned store before summing them. */
    _mm_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
238 #ifdef LV_HAVE_SSE4_1
240 #include <smmintrin.h>
/*!
 * \brief Dot product of two float vectors using the SSE4.1 DPPS instruction
 *        with unaligned loads.
 *
 * Sixteen points are consumed per iteration as four 4-wide dot products;
 * the remaining (num_points % 16) points are handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats (no alignment required).
 * \param taps       Second operand, num_points floats (no alignment required).
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_sse4_1(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[4];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 aVal1, bVal1, cVal1;
    __m128 aVal2, bVal2, cVal2;
    __m128 aVal3, bVal3, cVal3;
    __m128 aVal4, bVal4, cVal4;

    __m128 dotProdVal = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        aVal1 = _mm_loadu_ps(aPtr); aPtr += 4;
        aVal2 = _mm_loadu_ps(aPtr); aPtr += 4;
        aVal3 = _mm_loadu_ps(aPtr); aPtr += 4;
        aVal4 = _mm_loadu_ps(aPtr); aPtr += 4;

        bVal1 = _mm_loadu_ps(bPtr); bPtr += 4;
        bVal2 = _mm_loadu_ps(bPtr); bPtr += 4;
        bVal3 = _mm_loadu_ps(bPtr); bPtr += 4;
        bVal4 = _mm_loadu_ps(bPtr); bPtr += 4;

        /* High nibble 0xF multiplies all four lanes; the low nibble picks
         * the single output lane that receives the sum (others are zero),
         * so the four partial sums land in lanes 0..3 respectively. */
        cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
        cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
        cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
        cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

        /* Each vector is non-zero in a distinct lane, so a bitwise OR
         * merges them into one vector of four partial sums. */
        cVal1 = _mm_or_ps(cVal1, cVal2);
        cVal3 = _mm_or_ps(cVal3, cVal4);
        cVal1 = _mm_or_ps(cVal1, cVal3);

        dotProdVal = _mm_add_ps(dotProdVal, cVal1);
    }

    /* The scratch array is a plain local with no alignment guarantee, so
     * spill the lanes with the unaligned store before summing them. */
    _mm_storeu_ps(dotProductVector, dotProdVal);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
301 #include <immintrin.h>
/*!
 * \brief Dot product of two float vectors using AVX with unaligned loads.
 *
 * Sixteen points are consumed per iteration through two independent
 * 8-lane accumulators; the remaining (num_points % 16) points are handled
 * by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats (no alignment required).
 * \param taps       Second operand, num_points floats (no alignment required).
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_u_avx(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[8];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m256 a0Val, a1Val;
    __m256 b0Val, b1Val;
    __m256 c0Val, c1Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        a0Val = _mm256_loadu_ps(aPtr);
        a1Val = _mm256_loadu_ps(aPtr + 8);
        b0Val = _mm256_loadu_ps(bPtr);
        b1Val = _mm256_loadu_ps(bPtr + 8);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the two accumulators into a single vector. */
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

    /* Spill the eight lanes and sum them in index order. */
    _mm256_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];
    dotProduct += dotProductVector[4];
    dotProduct += dotProductVector[5];
    dotProduct += dotProductVector[6];
    dotProduct += dotProductVector[7];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
363 #ifndef INCLUDED_volk_32f_x2_dot_prod_32f_a_H
364 #define INCLUDED_volk_32f_x2_dot_prod_32f_a_H
370 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable scalar dot product of two float vectors (entry in the
 *        aligned kernel set; the scalar code itself needs no alignment).
 *
 * \param result     Receives the sum of input[i] * taps[i] over all points
 *                   (0 when num_points is 0).
 * \param input      First operand, num_points floats.
 * \param taps       Second operand, num_points floats.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_a_generic(float* result,
                                                      const float* input,
                                                      const float* taps,
                                                      unsigned int num_points)
{
    float dotProduct = 0;
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using SSE with aligned loads.
 *
 * Sixteen points are consumed per iteration through four independent
 * 4-lane accumulators; the remaining (num_points % 16) points are handled
 * by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats; read with
 *                   _mm_load_ps, so it must be 16-byte aligned.
 * \param taps       Second operand, num_points floats; must be 16-byte
 *                   aligned for the same reason.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[4];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        a0Val = _mm_load_ps(aPtr);
        a1Val = _mm_load_ps(aPtr + 4);
        a2Val = _mm_load_ps(aPtr + 8);
        a3Val = _mm_load_ps(aPtr + 12);
        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
        dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
        dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the four accumulators into a single vector. */
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* Only input and taps carry the alignment requirement; the scratch
     * array is a plain local, so it is written with the unaligned store. */
    _mm_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
462 #include <pmmintrin.h>
/*!
 * \brief Dot product of two float vectors (SSE3 build variant, aligned loads).
 *
 * Sixteen points are consumed per iteration through four independent
 * 4-lane accumulators; the remaining (num_points % 16) points are handled
 * by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats; read with
 *                   _mm_load_ps, so it must be 16-byte aligned.
 * \param taps       Second operand, num_points floats; must be 16-byte
 *                   aligned for the same reason.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse3(float* result,
                                                   const float* input,
                                                   const float* taps,
                                                   unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[4];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 a0Val, a1Val, a2Val, a3Val;
    __m128 b0Val, b1Val, b2Val, b3Val;
    __m128 c0Val, c1Val, c2Val, c3Val;

    __m128 dotProdVal0 = _mm_setzero_ps();
    __m128 dotProdVal1 = _mm_setzero_ps();
    __m128 dotProdVal2 = _mm_setzero_ps();
    __m128 dotProdVal3 = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        a0Val = _mm_load_ps(aPtr);
        a1Val = _mm_load_ps(aPtr + 4);
        a2Val = _mm_load_ps(aPtr + 8);
        a3Val = _mm_load_ps(aPtr + 12);
        b0Val = _mm_load_ps(bPtr);
        b1Val = _mm_load_ps(bPtr + 4);
        b2Val = _mm_load_ps(bPtr + 8);
        b3Val = _mm_load_ps(bPtr + 12);

        c0Val = _mm_mul_ps(a0Val, b0Val);
        c1Val = _mm_mul_ps(a1Val, b1Val);
        c2Val = _mm_mul_ps(a2Val, b2Val);
        c3Val = _mm_mul_ps(a3Val, b3Val);

        dotProdVal0 = _mm_add_ps(dotProdVal0, c0Val);
        dotProdVal1 = _mm_add_ps(dotProdVal1, c1Val);
        dotProdVal2 = _mm_add_ps(dotProdVal2, c2Val);
        dotProdVal3 = _mm_add_ps(dotProdVal3, c3Val);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the four accumulators into a single vector. */
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
    dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

    /* Only input and taps carry the alignment requirement; the scratch
     * array is a plain local, so it is written with the unaligned store. */
    _mm_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
528 #ifdef LV_HAVE_SSE4_1
530 #include <smmintrin.h>
/*!
 * \brief Dot product of two float vectors using the SSE4.1 DPPS instruction
 *        with aligned loads.
 *
 * Sixteen points are consumed per iteration as four 4-wide dot products;
 * the remaining (num_points % 16) points are handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats; read with
 *                   _mm_load_ps, so it must be 16-byte aligned.
 * \param taps       Second operand, num_points floats; must be 16-byte
 *                   aligned for the same reason.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_a_sse4_1(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[4];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m128 aVal1, bVal1, cVal1;
    __m128 aVal2, bVal2, cVal2;
    __m128 aVal3, bVal3, cVal3;
    __m128 aVal4, bVal4, cVal4;

    __m128 dotProdVal = _mm_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        aVal1 = _mm_load_ps(aPtr); aPtr += 4;
        aVal2 = _mm_load_ps(aPtr); aPtr += 4;
        aVal3 = _mm_load_ps(aPtr); aPtr += 4;
        aVal4 = _mm_load_ps(aPtr); aPtr += 4;

        bVal1 = _mm_load_ps(bPtr); bPtr += 4;
        bVal2 = _mm_load_ps(bPtr); bPtr += 4;
        bVal3 = _mm_load_ps(bPtr); bPtr += 4;
        bVal4 = _mm_load_ps(bPtr); bPtr += 4;

        /* High nibble 0xF multiplies all four lanes; the low nibble picks
         * the single output lane that receives the sum (others are zero),
         * so the four partial sums land in lanes 0..3 respectively. */
        cVal1 = _mm_dp_ps(aVal1, bVal1, 0xF1);
        cVal2 = _mm_dp_ps(aVal2, bVal2, 0xF2);
        cVal3 = _mm_dp_ps(aVal3, bVal3, 0xF4);
        cVal4 = _mm_dp_ps(aVal4, bVal4, 0xF8);

        /* Each vector is non-zero in a distinct lane, so a bitwise OR
         * merges them into one vector of four partial sums. */
        cVal1 = _mm_or_ps(cVal1, cVal2);
        cVal3 = _mm_or_ps(cVal3, cVal4);
        cVal1 = _mm_or_ps(cVal1, cVal3);

        dotProdVal = _mm_add_ps(dotProdVal, cVal1);
    }

    /* Only input and taps carry the alignment requirement; the scratch
     * array is a plain local, so it is written with the unaligned store. */
    _mm_storeu_ps(dotProductVector, dotProdVal);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
591 #include <immintrin.h>
/*!
 * \brief Dot product of two float vectors using AVX with aligned loads.
 *
 * Sixteen points are consumed per iteration through two independent
 * 8-lane accumulators; the remaining (num_points % 16) points are handled
 * by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats; read with
 *                   _mm256_load_ps, so it must be 32-byte aligned.
 * \param taps       Second operand, num_points floats; must be 32-byte
 *                   aligned for the same reason.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_a_avx(float* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float dotProduct = 0;
    float dotProductVector[8];
    const float* aPtr = input;
    const float* bPtr = taps;

    __m256 a0Val, a1Val;
    __m256 b0Val, b1Val;
    __m256 c0Val, c1Val;

    __m256 dotProdVal0 = _mm256_setzero_ps();
    __m256 dotProdVal1 = _mm256_setzero_ps();

    for (; number < sixteenthPoints; number++) {
        a0Val = _mm256_load_ps(aPtr);
        a1Val = _mm256_load_ps(aPtr + 8);
        b0Val = _mm256_load_ps(bPtr);
        b1Val = _mm256_load_ps(bPtr + 8);

        c0Val = _mm256_mul_ps(a0Val, b0Val);
        c1Val = _mm256_mul_ps(a1Val, b1Val);

        dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
        dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the two accumulators into a single vector. */
    dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);

    /* Only input and taps carry the alignment requirement; the scratch
     * array is a plain local, so it is written with the unaligned store. */
    _mm256_storeu_ps(dotProductVector, dotProdVal0);

    dotProduct = dotProductVector[0];
    dotProduct += dotProductVector[1];
    dotProduct += dotProductVector[2];
    dotProduct += dotProductVector[3];
    dotProduct += dotProductVector[4];
    dotProduct += dotProductVector[5];
    dotProduct += dotProductVector[6];
    dotProduct += dotProductVector[7];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
653 #include <arm_neon.h>
/*!
 * \brief Dot product of two float vectors using NEON, 16 points per
 *        iteration across four multiply-accumulate registers.
 *
 * The remaining (num_points % 16) points are handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats.
 * \param taps       Second operand, num_points floats.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_neonopts(float* result,
                                                     const float* input,
                                                     const float* taps,
                                                     unsigned int num_points)
{
    /* Despite the name, this counts groups of 16 points. */
    unsigned int quarter_points = num_points / 16;
    float dotProduct = 0;
    float accumulator[4];
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    float32x4x4_t a_val, b_val, accumulator0;
    accumulator0.val[0] = vdupq_n_f32(0);
    accumulator0.val[1] = vdupq_n_f32(0);
    accumulator0.val[2] = vdupq_n_f32(0);
    accumulator0.val[3] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        /* vld4q de-interleaves 16 floats into four registers; both
         * operands are shuffled the same way, so matching elements still
         * meet in the same lane. */
        a_val = vld4q_f32(aPtr);
        b_val = vld4q_f32(bPtr);
        accumulator0.val[0] = vmlaq_f32(accumulator0.val[0], a_val.val[0], b_val.val[0]);
        accumulator0.val[1] = vmlaq_f32(accumulator0.val[1], a_val.val[1], b_val.val[1]);
        accumulator0.val[2] = vmlaq_f32(accumulator0.val[2], a_val.val[2], b_val.val[2]);
        accumulator0.val[3] = vmlaq_f32(accumulator0.val[3], a_val.val[3], b_val.val[3]);

        /* Step past the 16 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 16;
        bPtr += 16;
    }

    /* Collapse the four accumulators into a single vector. */
    accumulator0.val[0] = vaddq_f32(accumulator0.val[0], accumulator0.val[1]);
    accumulator0.val[2] = vaddq_f32(accumulator0.val[2], accumulator0.val[3]);
    accumulator0.val[0] = vaddq_f32(accumulator0.val[2], accumulator0.val[0]);

    vst1q_f32(accumulator, accumulator0.val[0]);
    dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

    /* Scalar tail for the points that do not fill a 16-wide group. */
    for (number = quarter_points * 16; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
/*!
 * \brief Dot product of two float vectors using NEON, 8 points per
 *        iteration across two multiply-accumulate registers.
 *
 * The remaining (num_points % 8) points are handled by a scalar loop.
 *
 * \param result     Receives the sum of input[i] * taps[i].
 * \param input      First operand, num_points floats.
 * \param taps       Second operand, num_points floats.
 * \param num_points Number of elements in each vector.
 */
static inline void volk_32f_x2_dot_prod_32f_neon(float* result,
                                                 const float* input,
                                                 const float* taps,
                                                 unsigned int num_points)
{
    /* Despite the name, this counts groups of 8 points. */
    unsigned int quarter_points = num_points / 8;
    float dotProduct = 0;
    float accumulator[4];
    const float* aPtr = input;
    const float* bPtr = taps;
    unsigned int number = 0;

    float32x4x2_t a_val, b_val, accumulator_val;
    accumulator_val.val[0] = vdupq_n_f32(0);
    accumulator_val.val[1] = vdupq_n_f32(0);

    for (number = 0; number < quarter_points; ++number) {
        /* vld2q de-interleaves 8 floats into two registers; both operands
         * are shuffled the same way, so matching elements still meet in
         * the same lane. */
        a_val = vld2q_f32(aPtr);
        b_val = vld2q_f32(bPtr);
        accumulator_val.val[0] = vmlaq_f32(accumulator_val.val[0], a_val.val[0], b_val.val[0]);
        accumulator_val.val[1] = vmlaq_f32(accumulator_val.val[1], a_val.val[1], b_val.val[1]);

        /* Step past the 8 points just consumed so the scalar tail below
         * starts at the first unprocessed element. */
        aPtr += 8;
        bPtr += 8;
    }

    /* Collapse the two accumulators into a single vector. */
    accumulator_val.val[0] = vaddq_f32(accumulator_val.val[0], accumulator_val.val[1]);

    vst1q_f32(accumulator, accumulator_val.val[0]);
    dotProduct = accumulator[0] + accumulator[1] + accumulator[2] + accumulator[3];

    /* Scalar tail for the points that do not fill an 8-wide group. */
    for (number = quarter_points * 8; number < num_points; number++) {
        dotProduct += ((*aPtr++) * (*bPtr++));
    }

    *result = dotProduct;
}
/*!
 * \brief Declaration of a dot-product kernel whose definition lives outside
 *        this file.
 *
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the two num_points-long operands, mirroring the other
 * kernels declared here -- confirm against the external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points);
/*!
 * \brief Declaration of a second dot-product kernel whose definition lives
 *        outside this file.
 *
 * NOTE(review): presumably cVector receives the scalar result and
 * aVector/bVector are the two num_points-long operands, mirroring the other
 * kernels declared here -- confirm against the external implementation.
 */
extern void volk_32f_x2_dot_prod_32f_neonasm_opts(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points);
/*
 * Cross-references carried over from the generated documentation listing:
 *   #define __VOLK_ATTR_ALIGNED(x)
 *       Definition: volk_common.h:27
 *   static const float taps[NSTEPS+1][NTAPS]
 *       Definition: interpolator_taps.h:9
 */