58 #ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
59 #define INCLUDED_volk_32f_x2_dot_prod_16i_H
65 #ifdef LV_HAVE_GENERIC
/*
 * Portable C implementation: multiplies two float vectors element by
 * element, accumulates the products in single precision, and stores the
 * sum converted to a 16-bit signed integer.
 *
 * result     - receives the converted sum (one int16_t is written)
 * input      - first vector, num_points floats
 * taps       - second vector, num_points floats
 * num_points - number of element pairs to process; 0 stores 0
 *
 * The float-to-integer conversion discards the fractional part (truncates
 * toward zero).  A sum that does not fit in int16_t is not representable
 * and the C standard leaves that conversion undefined, so callers are
 * expected to scale their data so that the result stays in range.
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
  float dotProduct = 0;
  const float* aPtr = input;
  const float* bPtr = taps;
  unsigned int number = 0;

  for (number = 0; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (int16_t)dotProduct;
}
/*
 * SSE implementation for aligned buffers: computes the dot product of two
 * float vectors and stores the sum converted to a 16-bit signed integer.
 *
 * result     - receives the converted sum (one int16_t is written)
 * input      - first vector, num_points floats; read with _mm_load_ps,
 *              so it must be 16-byte aligned
 * taps       - second vector, num_points floats; same alignment requirement
 * num_points - number of element pairs to process; 0 stores 0
 *
 * Sixteen element pairs are handled per iteration using four independent
 * accumulators; the remaining (num_points % 16) pairs are handled by a
 * scalar loop.  The float-to-integer conversion truncates toward zero; a
 * sum outside the int16_t range is not representable (undefined by the C
 * standard), so callers are expected to keep the result in range.
 */
static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_load_ps(aPtr);
    a1Val = _mm_load_ps(aPtr + 4);
    a2Val = _mm_load_ps(aPtr + 8);
    a3Val = _mm_load_ps(aPtr + 12);
    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step to the next group of 16 so the scalar tail below starts at
       element sixteenthPoints*16. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the local scratch array carries no alignment
     guarantee of its own. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the elements that do not fill a group of 16. */
  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (short)dotProduct;
}
/*
 * SSE implementation for unaligned buffers: computes the dot product of
 * two float vectors and stores the sum converted to a 16-bit signed
 * integer.
 *
 * result     - receives the converted sum (one int16_t is written)
 * input      - first vector, num_points floats; read with _mm_loadu_ps,
 *              so no particular alignment is required
 * taps       - second vector, num_points floats; no alignment required
 * num_points - number of element pairs to process; 0 stores 0
 *
 * Sixteen element pairs are handled per iteration using four independent
 * accumulators; the remaining (num_points % 16) pairs are handled by a
 * scalar loop.  The float-to-integer conversion truncates toward zero; a
 * sum outside the int16_t range is not representable (undefined by the C
 * standard), so callers are expected to keep the result in range.
 */
static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
                                                  const float* input,
                                                  const float* taps,
                                                  unsigned int num_points)
{
  unsigned int number = 0;
  const unsigned int sixteenthPoints = num_points / 16;

  float dotProduct = 0;
  float dotProductVector[4];
  const float* aPtr = input;
  const float* bPtr = taps;

  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for (; number < sixteenthPoints; number++) {
    a0Val = _mm_loadu_ps(aPtr);
    a1Val = _mm_loadu_ps(aPtr + 4);
    a2Val = _mm_loadu_ps(aPtr + 8);
    a3Val = _mm_loadu_ps(aPtr + 12);
    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    /* Step to the next group of 16 so the scalar tail below starts at
       element sixteenthPoints*16. */
    aPtr += 16;
    bPtr += 16;
  }

  /* Fold the four accumulators into one vector of partial sums. */
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  /* Unaligned store: the local scratch array carries no alignment
     guarantee of its own. */
  _mm_storeu_ps(dotProductVector, dotProdVal0);

  dotProduct = dotProductVector[0];
  dotProduct += dotProductVector[1];
  dotProduct += dotProductVector[2];
  dotProduct += dotProductVector[3];

  /* Scalar tail for the elements that do not fill a group of 16. */
  number = sixteenthPoints * 16;
  for (; number < num_points; number++) {
    dotProduct += ((*aPtr++) * (*bPtr++));
  }

  *result = (short)dotProduct;
}
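/*
 * Cross-references emitted by the documentation generator:
 *
 *   int16_t
 *       signed short int16_t
 *       Definition: stdint.h:76
 *
 *   __VOLK_ATTR_ALIGNED
 *       #define __VOLK_ATTR_ALIGNED(x)
 *       Definition: volk_common.h:27
 *
 *   taps
 *       static const float taps[NSTEPS+1][NTAPS]
 *       Definition: interpolator_taps.h:9
 *       (Note: in the kernels in this file, `taps` is a function parameter
 *       of type `const float*`; this entry appears to refer to a different,
 *       identically named symbol.)
 */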