58 #ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
59 #define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
#ifdef LV_HAVE_GENERIC

/**
 * Portable dot product of 16-bit integer samples with complex float taps.
 *
 * Accumulates taps[k] * (float)input[k] for every k in [0, num_points) and
 * writes the complex sum to *result. When num_points is 0 the result is 0.
 *
 * @param result      receives the complex sum
 * @param input       num_points 16-bit samples
 * @param taps        num_points complex taps
 * @param num_points  number of sample/tap pairs to process
 */
static inline void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
  static const unsigned int N_UNROLL = 4;

  /* Four independent accumulators, one per unrolled lane. */
  lv_32fc_t acc0 = 0;
  lv_32fc_t acc1 = 0;
  lv_32fc_t acc2 = 0;
  lv_32fc_t acc3 = 0;

  unsigned int i = 0;
  /* Largest multiple of N_UNROLL that does not exceed num_points. */
  const unsigned int n = (num_points / N_UNROLL) * N_UNROLL;

  for(i = 0; i < n; i += N_UNROLL) {
    acc0 += taps[i + 0] * (float)input[i + 0];
    acc1 += taps[i + 1] * (float)input[i + 1];
    acc2 += taps[i + 2] * (float)input[i + 2];
    acc3 += taps[i + 3] * (float)input[i + 3];
  }

  /* Leftover 0..3 elements that did not fill a whole unrolled step. */
  for(; i < num_points; i++) {
    acc0 += taps[i] * (float)input[i];
  }

  *result = acc0 + acc1 + acc2 + acc3;
}

#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/**
 * NEON dot product of 16-bit integer samples with complex float taps.
 *
 * Processes four sample/tap pairs per vector iteration, then finishes the
 * remaining (num_points % 4) pairs with scalar arithmetic. The complex sum
 * of taps[k] * (float)input[k] is written to *result.
 *
 * @param result      receives the complex sum
 * @param input       num_points 16-bit samples
 * @param taps        num_points complex taps
 * @param num_points  number of sample/tap pairs to process
 */
static inline void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
  unsigned ii;
  const unsigned quarter_points = num_points / 4;
  const lv_32fc_t* tapsPtr = taps;
  const short* inputPtr = input;
  lv_32fc_t accumulator_vec[4];

  float32x4x2_t tapsVal, accumulator_val;
  int16x4_t input16;
  int32x4_t input32;
  float32x4_t input_float, prod_re, prod_im;

  accumulator_val.val[0] = vdupq_n_f32(0.0f);
  accumulator_val.val[1] = vdupq_n_f32(0.0f);

  for(ii = 0; ii < quarter_points; ++ii) {
    /* De-interleaving load: val[0] holds 4 real parts, val[1] 4 imaginary parts. */
    tapsVal = vld2q_f32((const float*)tapsPtr);
    input16 = vld1_s16(inputPtr);
    /* Widen 16-bit samples to 32-bit, then convert to float. */
    input32 = vmovl_s16(input16);
    input_float = vcvtq_f32_s32(input32);

    prod_re = vmulq_f32(input_float, tapsVal.val[0]);
    prod_im = vmulq_f32(input_float, tapsVal.val[1]);

    accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
    accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);

    /* Advance past the four pairs just consumed; the scalar tail below
       continues from these pointers. */
    tapsPtr += 4;
    inputPtr += 4;
  }

  /* Interleaving store turns the two lanes back into 4 complex partial sums. */
  vst2q_f32((float*)accumulator_vec, accumulator_val);
  accumulator_vec[0] += accumulator_vec[1];
  accumulator_vec[2] += accumulator_vec[3];
  accumulator_vec[0] += accumulator_vec[2];

  /* Scalar tail for the elements that did not fill a whole vector. */
  for(ii = quarter_points * 4; ii < num_points; ++ii) {
    accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
  }

  *result = accumulator_vec[0];
}

#endif /* LV_HAVE_NEON */
#if LV_HAVE_SSE && LV_HAVE_MMX

/**
 * SSE/MMX dot product of 16-bit integer samples with complex float taps,
 * using unaligned loads for the taps.
 *
 * Processes eight sample/tap pairs per vector iteration, then finishes the
 * remaining (num_points % 8) pairs with scalar arithmetic. The complex sum
 * of taps[k] * (float)input[k] is written to *result.
 *
 * @param result      receives the complex sum
 * @param input       num_points 16-bit samples
 * @param taps        num_points complex taps (no alignment requirement)
 * @param num_points  number of sample/tap pairs to process
 */
static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
  unsigned int number = 0;
  /* Each vector iteration consumes 8 samples and 8 complex taps (16 floats). */
  const unsigned int eighthPoints = num_points / 8;

  float realSum, imagSum;
  const short* aPtr = input;
  /* The taps are walked as interleaved (re, im) float pairs. */
  const float* bPtr = (const float*)taps;
  float* resultParts = (float*)result;

  __m64 m0, m1;
  __m128 f0, f1;
  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* _mm_store_ps requires a 16-byte aligned destination. */
  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(; number < eighthPoints; number++) {
    m0 = _mm_set_pi16(aPtr[3], aPtr[2], aPtr[1], aPtr[0]);
    m1 = _mm_set_pi16(aPtr[7], aPtr[6], aPtr[5], aPtr[4]);
    f0 = _mm_cvtpi16_ps(m0);
    f1 = _mm_cvtpi16_ps(m1);

    /* Unpacking a vector with itself duplicates every sample (s0 s0 s1 s1 ...)
       so each one lines up with both the real and imaginary part of its tap. */
    a0Val = _mm_unpacklo_ps(f0, f0);
    a1Val = _mm_unpackhi_ps(f0, f0);
    a2Val = _mm_unpacklo_ps(f1, f1);
    a3Val = _mm_unpackhi_ps(f1, f1);

    b0Val = _mm_loadu_ps(bPtr);
    b1Val = _mm_loadu_ps(bPtr + 4);
    b2Val = _mm_loadu_ps(bPtr + 8);
    b3Val = _mm_loadu_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 8;
    bPtr += 16;
  }

  /* The MMX intrinsics above share state with the x87 FPU; clear it before
     any scalar floating-point arithmetic runs. */
  _mm_empty();

  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  _mm_store_ps(dotProductVector, dotProdVal0);

  /* Lanes 0 and 2 carry real partial sums, lanes 1 and 3 imaginary ones. */
  realSum = dotProductVector[0];
  imagSum = dotProductVector[1];
  realSum += dotProductVector[2];
  imagSum += dotProductVector[3];

  /* Scalar tail for the elements that did not fill a whole vector step. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    realSum += ((*aPtr) * (*bPtr++));
    imagSum += ((*aPtr++) * (*bPtr++));
  }

  /* Write the result as a (re, im) float pair, mirroring how taps is read. */
  resultParts[0] = realSum;
  resultParts[1] = imagSum;
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
#if LV_HAVE_SSE && LV_HAVE_MMX

/**
 * SSE/MMX dot product of 16-bit integer samples with complex float taps,
 * using aligned loads for the taps.
 *
 * Processes eight sample/tap pairs per vector iteration, then finishes the
 * remaining (num_points % 8) pairs with scalar arithmetic. The complex sum
 * of taps[k] * (float)input[k] is written to *result.
 *
 * @param result      receives the complex sum
 * @param input       num_points 16-bit samples (read element-wise, so no
 *                    alignment requirement arises from this function)
 * @param taps        num_points complex taps; must be 16-byte aligned because
 *                    they are fetched with _mm_load_ps
 * @param num_points  number of sample/tap pairs to process
 */
static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result, const short* input, const lv_32fc_t* taps, unsigned int num_points) {
  unsigned int number = 0;
  /* Each vector iteration consumes 8 samples and 8 complex taps (16 floats). */
  const unsigned int eighthPoints = num_points / 8;

  float realSum, imagSum;
  const short* aPtr = input;
  /* The taps are walked as interleaved (re, im) float pairs. */
  const float* bPtr = (const float*)taps;
  float* resultParts = (float*)result;

  __m64 m0, m1;
  __m128 f0, f1;
  __m128 a0Val, a1Val, a2Val, a3Val;
  __m128 b0Val, b1Val, b2Val, b3Val;
  __m128 c0Val, c1Val, c2Val, c3Val;

  /* _mm_store_ps requires a 16-byte aligned destination. */
  __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];

  __m128 dotProdVal0 = _mm_setzero_ps();
  __m128 dotProdVal1 = _mm_setzero_ps();
  __m128 dotProdVal2 = _mm_setzero_ps();
  __m128 dotProdVal3 = _mm_setzero_ps();

  for(; number < eighthPoints; number++) {
    m0 = _mm_set_pi16(aPtr[3], aPtr[2], aPtr[1], aPtr[0]);
    m1 = _mm_set_pi16(aPtr[7], aPtr[6], aPtr[5], aPtr[4]);
    f0 = _mm_cvtpi16_ps(m0);
    f1 = _mm_cvtpi16_ps(m1);

    /* Unpacking a vector with itself duplicates every sample (s0 s0 s1 s1 ...)
       so each one lines up with both the real and imaginary part of its tap. */
    a0Val = _mm_unpacklo_ps(f0, f0);
    a1Val = _mm_unpackhi_ps(f0, f0);
    a2Val = _mm_unpacklo_ps(f1, f1);
    a3Val = _mm_unpackhi_ps(f1, f1);

    b0Val = _mm_load_ps(bPtr);
    b1Val = _mm_load_ps(bPtr + 4);
    b2Val = _mm_load_ps(bPtr + 8);
    b3Val = _mm_load_ps(bPtr + 12);

    c0Val = _mm_mul_ps(a0Val, b0Val);
    c1Val = _mm_mul_ps(a1Val, b1Val);
    c2Val = _mm_mul_ps(a2Val, b2Val);
    c3Val = _mm_mul_ps(a3Val, b3Val);

    dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
    dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
    dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
    dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);

    aPtr += 8;
    bPtr += 16;
  }

  /* The MMX intrinsics above share state with the x87 FPU; clear it before
     any scalar floating-point arithmetic runs. */
  _mm_empty();

  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
  dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);

  _mm_store_ps(dotProductVector, dotProdVal0);

  /* Lanes 0 and 2 carry real partial sums, lanes 1 and 3 imaginary ones. */
  realSum = dotProductVector[0];
  imagSum = dotProductVector[1];
  realSum += dotProductVector[2];
  imagSum += dotProductVector[3];

  /* Scalar tail for the elements that did not fill a whole vector step. */
  number = eighthPoints * 8;
  for(; number < num_points; number++) {
    realSum += ((*aPtr) * (*bPtr++));
    imagSum += ((*aPtr++) * (*bPtr++));
  }

  /* Write the result as a (re, im) float pair, mirroring how taps is read. */
  resultParts[0] = realSum;
  resultParts[1] = imagSum;
}

#endif /* LV_HAVE_SSE && LV_HAVE_MMX */
/*
 * Cross-reference notes:
 *   #define __VOLK_ATTR_ALIGNED(x)
 *       Definition: volk_common.h:27
 *   static const float taps[NSTEPS+1][NTAPS]
 *       Definition: interpolator_taps.h:9
 *   float complex lv_32fc_t
 *       Definition: volk_complex.h:56
 */