#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_u_H
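
/*
 * Element-wise complex multiply: cVector[i] = aVector[i] * bVector[i].
 *
 * Usage sketch (illustrative, not part of this header): callers normally go
 * through the generated volk_32fc_x2_multiply_32fc() dispatcher declared in
 * <volk/volk.h>, which picks one of the kernels below at runtime. The
 * volk_malloc()/volk_get_alignment()/volk_free() helpers come from the wider
 * VOLK API and are shown here only for illustration.
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* a = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t* b = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   lv_32fc_t* c = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   unsigned int i;
 *
 *   for(i = 0; i < N; i++){
 *     a[i] = lv_cmake((float)i, (float)-i);
 *     b[i] = lv_cmake(0.5f, 2.0f);
 *   }
 *
 *   volk_32fc_x2_multiply_32fc(c, a, b, N);  // c[i] = a[i] * b[i]
 *
 *   volk_free(a); volk_free(b); volk_free(c);
 */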

#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < quarterPoints; number++){
    x = _mm256_loadu_ps((float*)a); // ar, ai, br, bi
    y = _mm256_loadu_ps((float*)b); // cr, ci, dr, di

    yl = _mm256_moveldup_ps(y); // cr, cr, dr, dr
    yh = _mm256_movehdup_ps(y); // ci, ci, di, di

    tmp1 = _mm256_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr
    x = _mm256_shuffle_ps(x, x, 0xB1); // ai, ar, bi, br
    tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di
    z = _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm256_storeu_ps((float*)c, z); // store the four products

    a += 4;
    b += 4;
    c += 4;
  }

  number = quarterPoints * 4;
  for(; number < num_points; number++) {
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  __m128 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < halfPoints; number++){
    x = _mm_loadu_ps((float*)a); // ar, ai, br, bi
    y = _mm_loadu_ps((float*)b); // cr, ci, dr, di

    yl = _mm_moveldup_ps(y); // cr, cr, dr, dr
    yh = _mm_movehdup_ps(y); // ci, ci, di, di

    tmp1 = _mm_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr
    x = _mm_shuffle_ps(x, x, 0xB1); // ai, ar, bi, br
    tmp2 = _mm_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di
    z = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm_storeu_ps((float*)c, z); // store the two products

    a += 2;
    b += 2;
    c += 2;
  }

  if((num_points % 2) != 0) {
    *c = (*a) * (*b);
  }
}
#endif /* LV_HAVE_SSE3 */
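
/*
 * Dataflow of one SSE3 iteration above (the AVX kernels apply the same
 * pattern independently in each 128-bit lane), multiplying the pair
 * (ar,ai),(br,bi) by (cr,ci),(dr,di) in interleaved re/im order:
 *
 *   x    = [ar     ai     br     bi    ]   loadu
 *   y    = [cr     ci     dr     di    ]   loadu
 *   yl   = [cr     cr     dr     dr    ]   moveldup (duplicate real parts)
 *   yh   = [ci     ci     di     di    ]   movehdup (duplicate imag parts)
 *   tmp1 = [ar*cr  ai*cr  br*dr  bi*dr ]   x * yl
 *   x'   = [ai     ar     bi     br    ]   shuffle 0xB1 (swap re/im pairs)
 *   tmp2 = [ai*ci  ar*ci  bi*di  br*di ]   x' * yh
 *   z    = [ar*cr-ai*ci  ai*cr+ar*ci  br*dr-bi*di  bi*dr+br*di]   addsub
 *
 * i.e. exactly (ar + j*ai)*(cr + j*ci) and (br + j*bi)*(dr + j*di).
 */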
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_u_H */
#ifndef INCLUDED_volk_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_32fc_x2_multiply_32fc_a_H

#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;

  __m256 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < quarterPoints; number++){
    x = _mm256_load_ps((float*)a); // ar, ai, br, bi
    y = _mm256_load_ps((float*)b); // cr, ci, dr, di

    yl = _mm256_moveldup_ps(y); // cr, cr, dr, dr
    yh = _mm256_movehdup_ps(y); // ci, ci, di, di

    tmp1 = _mm256_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr
    x = _mm256_shuffle_ps(x, x, 0xB1); // ai, ar, bi, br
    tmp2 = _mm256_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di
    z = _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm256_store_ps((float*)c, z); // store the four products

    a += 4;
    b += 4;
    c += 4;
  }

  number = quarterPoints * 4;
  for(; number < num_points; number++) {
    *c++ = (*a++) * (*b++);
  }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int halfPoints = num_points / 2;

  __m128 x, y, yl, yh, z, tmp1, tmp2;
  lv_32fc_t* c = cVector;
  const lv_32fc_t* a = aVector;
  const lv_32fc_t* b = bVector;

  for(;number < halfPoints; number++){
    x = _mm_load_ps((float*)a); // ar, ai, br, bi
    y = _mm_load_ps((float*)b); // cr, ci, dr, di

    yl = _mm_moveldup_ps(y); // cr, cr, dr, dr
    yh = _mm_movehdup_ps(y); // ci, ci, di, di

    tmp1 = _mm_mul_ps(x, yl); // ar*cr, ai*cr, br*dr, bi*dr
    x = _mm_shuffle_ps(x, x, 0xB1); // ai, ar, bi, br
    tmp2 = _mm_mul_ps(x, yh); // ai*ci, ar*ci, bi*di, br*di
    z = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

    _mm_store_ps((float*)c, z); // store the two products

    a += 2;
    b += 2;
    c += 2;
  }

  if((num_points % 2) != 0) {
    *c = (*a) * (*b);
  }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  lv_32fc_t* cPtr = cVector;
  const lv_32fc_t* aPtr = aVector;
  const lv_32fc_t* bPtr = bVector;
  unsigned int number = 0;

  for(number = 0; number < num_points; number++){
    *cPtr++ = (*aPtr++) * (*bPtr++);
  }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32fc_x2_multiply_32fc_neon(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
  lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
  unsigned int quarter_points = num_points / 4;
  float32x4x2_t a_val, b_val, c_val;
  float32x4x2_t tmp_real, tmp_imag;
  unsigned int number = 0;

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // de-interleave: a0r..a3r | a0i..a3i
    b_val = vld2q_f32((float*)b_ptr); // de-interleave: b0r..b3r | b0i..b3i
    __builtin_prefetch(a_ptr + 4);
    __builtin_prefetch(b_ptr + 4);

    // partial products for the real part
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // ar*br
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]); // ai*bi

    // cross terms for the imaginary part
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]); // ar*bi
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // ai*br

    // combine and store the results
    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    vst2q_f32((float*)cVector, c_val);

    a_ptr += 4;
    b_ptr += 4;
    cVector += 4;
  }

  for(number = quarter_points * 4; number < num_points; number++){
    *cVector++ = (*a_ptr++) * (*b_ptr++);
  }
}
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
static inline void volk_32fc_x2_multiply_32fc_neon_opttests(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  lv_32fc_t* a_ptr = (lv_32fc_t*)aVector;
  lv_32fc_t* b_ptr = (lv_32fc_t*)bVector;
  unsigned int quarter_points = num_points / 4;
  float32x4x2_t a_val, b_val;
  float32x4x2_t tmp_imag;
  unsigned int number = 0;

  for(number = 0; number < quarter_points; ++number) {
    a_val = vld2q_f32((float*)a_ptr); // de-interleave: a0r..a3r | a0i..a3i
    b_val = vld2q_f32((float*)b_ptr); // de-interleave: b0r..b3r | b0i..b3i
    __builtin_prefetch(a_ptr + 4);
    __builtin_prefetch(b_ptr + 4);

    // first multiplies
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]); // ai*br
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]); // ar*br

    // finish with fused multiply-accumulate / multiply-subtract
    tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]); // + ar*bi
    tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]); // - ai*bi

    vst2q_f32((float*)cVector, tmp_imag);

    a_ptr += 4;
    b_ptr += 4;
    cVector += 4;
  }

  for(number = quarter_points * 4; number < num_points; number++){
    *cVector++ = (*a_ptr++) * (*b_ptr++);
  }
}
#endif /* LV_HAVE_NEON */
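
/*
 * Per lane, after the vld2q de-interleave both NEON kernels compute the
 * scalar identity (a = ar + j*ai, b = br + j*bi):
 *
 *   cr = ar*br - ai*bi;
 *   ci = ar*bi + ai*br;
 *
 * The first kernel forms all four partial products and combines them with
 * vsubq/vaddq; the _opttests variant folds the combine step into fused
 * multiply-accumulate/subtract (vmlaq_f32/vmlsq_f32).
 */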
#ifdef LV_HAVE_NEON
/* Hand-written NEON assembly kernel. The declarator is not shown in this
 * listing; the *_neonasm name below follows the usual VOLK convention and
 * should be treated as an assumption. */
extern void volk_32fc_x2_multiply_32fc_neonasm(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_ORC
extern void volk_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);

/* Unaligned entry point (conventional _u_orc name); it forwards directly to
 * the ORC implementation above. */
static inline void volk_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
  volk_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32fc_x2_multiply_32fc_a_H */