71 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
72 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_u_H
80 #include <immintrin.h>
/*
 * Unaligned AVX kernel: for each complex point, multiplies an element read
 * through `a` by the complex conjugate of the element read through `b` and
 * stores the product through `c`.
 *
 * cVector    - output buffer (non-const).
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces, the
 * declarations of a/b/c, their per-iteration advance and the tail-loop body
 * are not visible here. a/b/c are presumably cursors over aVector/bVector/
 * cVector - confirm against the full file.
 */
83 volk_32fc_x2_multiply_conjugate_32fc_u_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
84 const lv_32fc_t* bVector,
unsigned int num_points)
86 unsigned int number = 0;
/* One __m256 holds 8 floats, i.e. 4 interleaved (re, im) complex points. */
87 const unsigned int quarterPoints = num_points / 4;
89 __m256 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in every odd lane, so an XOR negates only the imaginary parts. */
94 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
96 for(;number < quarterPoints; number++){
/* Unaligned loads: x = ar,ai,...  y = br,bi,... */
98 x = _mm256_loadu_ps((
float*)a);
99 y = _mm256_loadu_ps((
float*)b);
/* Conjugate b by flipping the sign of each imaginary lane: y = br,-bi,... */
101 y = _mm256_xor_ps(y, conjugator);
/* yl duplicates the even (real) lanes, yh duplicates the odd (imaginary) lanes of y. */
103 yl = _mm256_moveldup_ps(y);
104 yh = _mm256_movehdup_ps(y);
/* tmp1 = ar*br, ai*br */
106 tmp1 = _mm256_mul_ps(x,yl);
/* 0xB1 swaps each adjacent float pair: x = ai,ar,... */
108 x = _mm256_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*(-bi), ar*(-bi) */
110 tmp2 = _mm256_mul_ps(x,yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * z = ar*br + ai*bi, ai*br - ar*bi  ==  a * conj(b) */
112 z = _mm256_addsub_ps(tmp1,tmp2);
/* Unaligned store of the 4 complex products. */
114 _mm256_storeu_ps((
float*)c,z);
/* Resume at the first point not covered by the vector loop. */
121 number = quarterPoints * 4;
/* Loop over the remaining (num_points % 4) points; its body is not visible in this listing. */
123 for(; number < num_points; number++) {
131 #include <pmmintrin.h>
/*
 * Unaligned SSE3 kernel: for each complex point, multiplies an element read
 * through `a` by the complex conjugate of the element read through `b` and
 * stores the product through `c`.
 *
 * cVector    - output buffer (non-const).
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces, the
 * declarations of a/b/c, their per-iteration advance and the body of the
 * odd-point branch are not visible here. a/b/c are presumably cursors over
 * aVector/bVector/cVector - confirm against the full file.
 */
134 volk_32fc_x2_multiply_conjugate_32fc_u_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
135 const lv_32fc_t* bVector,
unsigned int num_points)
137 unsigned int number = 0;
/* One __m128 holds 4 floats, i.e. 2 interleaved (re, im) complex points. */
138 const unsigned int halfPoints = num_points / 2;
140 __m128 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in every odd lane, so an XOR negates only the imaginary parts. */
145 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
147 for(;number < halfPoints; number++){
/* Unaligned loads: x = ar,ai,cr,ci  y = br,bi,dr,di */
149 x = _mm_loadu_ps((
float*)a);
150 y = _mm_loadu_ps((
float*)b);
/* Conjugate the second operand: y = br,-bi,dr,-di */
152 y = _mm_xor_ps(y, conjugator);
/* yl duplicates the even (real) lanes, yh duplicates the odd (imaginary) lanes of y. */
154 yl = _mm_moveldup_ps(y);
155 yh = _mm_movehdup_ps(y);
/* tmp1 = ar*br, ai*br, cr*dr, ci*dr */
157 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps each adjacent float pair: x = ai,ar,ci,cr */
159 x = _mm_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*(-bi), ar*(-bi), ci*(-di), cr*(-di) */
161 tmp2 = _mm_mul_ps(x,yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * z = ar*br + ai*bi, ai*br - ar*bi, ...  ==  a * conj(b) */
163 z = _mm_addsub_ps(tmp1,tmp2);
/* Unaligned store of the 2 complex products. */
165 _mm_storeu_ps((
float*)c,z);
/* An odd num_points leaves one point after the vector loop; the branch body is not visible here. */
172 if((num_points % 2) != 0) {
179 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel: for each of num_points complex points, stores
 * (element from aPtr) * lv_conj(element from bPtr) through cPtr.
 *
 * cVector    - output buffer (non-const).
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces and the
 * declarations of aPtr/bPtr/cPtr are not visible; they are presumably
 * cursors initialised from aVector/bVector/cVector - confirm.
 */
182 volk_32fc_x2_multiply_conjugate_32fc_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
183 const lv_32fc_t* bVector,
unsigned int num_points)
188 unsigned int number = 0;
/* One product per iteration; all three cursors are post-incremented by one element. */
190 for(number = 0; number < num_points; number++){
191 *cPtr++ = (*aPtr++) *
lv_conj(*bPtr++);
199 #ifndef INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
200 #define INCLUDED_volk_32fc_x2_multiply_conjugate_32fc_a_H
208 #include <immintrin.h>
/*
 * Aligned AVX kernel: for each complex point, multiplies an element read
 * through `a` by the complex conjugate of the element read through `b` and
 * stores the product through `c`. Uses _mm256_load_ps/_mm256_store_ps, which
 * require 32-byte-aligned addresses.
 *
 * cVector    - output buffer (non-const).
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces, the
 * declarations of a/b/c, their per-iteration advance and the tail-loop body
 * are not visible here. a/b/c are presumably cursors over aVector/bVector/
 * cVector - confirm against the full file.
 */
211 volk_32fc_x2_multiply_conjugate_32fc_a_avx(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
212 const lv_32fc_t* bVector,
unsigned int num_points)
214 unsigned int number = 0;
/* One __m256 holds 8 floats, i.e. 4 interleaved (re, im) complex points. */
215 const unsigned int quarterPoints = num_points / 4;
217 __m256 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in every odd lane, so an XOR negates only the imaginary parts. */
222 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
224 for(;number < quarterPoints; number++){
/* Aligned loads: x = ar,ai,...  y = br,bi,... */
226 x = _mm256_load_ps((
float*)a);
227 y = _mm256_load_ps((
float*)b);
/* Conjugate b by flipping the sign of each imaginary lane: y = br,-bi,... */
229 y = _mm256_xor_ps(y, conjugator);
/* yl duplicates the even (real) lanes, yh duplicates the odd (imaginary) lanes of y. */
231 yl = _mm256_moveldup_ps(y);
232 yh = _mm256_movehdup_ps(y);
/* tmp1 = ar*br, ai*br */
234 tmp1 = _mm256_mul_ps(x,yl);
/* 0xB1 swaps each adjacent float pair: x = ai,ar,... */
236 x = _mm256_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*(-bi), ar*(-bi) */
238 tmp2 = _mm256_mul_ps(x,yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * z = ar*br + ai*bi, ai*br - ar*bi  ==  a * conj(b) */
240 z = _mm256_addsub_ps(tmp1,tmp2);
/* Aligned store of the 4 complex products. */
242 _mm256_store_ps((
float*)c,z);
/* Resume at the first point not covered by the vector loop. */
249 number = quarterPoints * 4;
/* Loop over the remaining (num_points % 4) points; its body is not visible in this listing. */
251 for(; number < num_points; number++) {
259 #include <pmmintrin.h>
/*
 * Aligned SSE3 kernel: for each complex point, multiplies an element read
 * through `a` by the complex conjugate of the element read through `b` and
 * stores the product through `c`. Uses _mm_load_ps/_mm_store_ps, which
 * require 16-byte-aligned addresses.
 *
 * cVector    - output buffer (non-const).
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces, the
 * declarations of a/b/c, their per-iteration advance and the body of the
 * odd-point branch are not visible here. a/b/c are presumably cursors over
 * aVector/bVector/cVector - confirm against the full file.
 */
262 volk_32fc_x2_multiply_conjugate_32fc_a_sse3(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
263 const lv_32fc_t* bVector,
unsigned int num_points)
265 unsigned int number = 0;
/* One __m128 holds 4 floats, i.e. 2 interleaved (re, im) complex points. */
266 const unsigned int halfPoints = num_points / 2;
268 __m128 x, y, yl, yh, z, tmp1, tmp2;
/* Sign-bit mask: -0.f in every odd lane, so an XOR negates only the imaginary parts. */
273 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
275 for(;number < halfPoints; number++){
/* Aligned loads: x = ar,ai,cr,ci  y = br,bi,dr,di */
277 x = _mm_load_ps((
float*)a);
278 y = _mm_load_ps((
float*)b);
/* Conjugate the second operand: y = br,-bi,dr,-di */
280 y = _mm_xor_ps(y, conjugator);
/* yl duplicates the even (real) lanes, yh duplicates the odd (imaginary) lanes of y. */
282 yl = _mm_moveldup_ps(y);
283 yh = _mm_movehdup_ps(y);
/* tmp1 = ar*br, ai*br, cr*dr, ci*dr */
285 tmp1 = _mm_mul_ps(x,yl);
/* 0xB1 swaps each adjacent float pair: x = ai,ar,ci,cr */
287 x = _mm_shuffle_ps(x,x,0xB1);
/* tmp2 = ai*(-bi), ar*(-bi), ci*(-di), cr*(-di) */
289 tmp2 = _mm_mul_ps(x,yh);
/* addsub subtracts in even lanes and adds in odd lanes:
 * z = ar*br + ai*bi, ai*br - ar*bi, ...  ==  a * conj(b) */
291 z = _mm_addsub_ps(tmp1,tmp2);
/* Aligned store of the 2 complex products. */
293 _mm_store_ps((
float*)c,z);
/* An odd num_points leaves one point after the vector loop; the branch body is not visible here. */
300 if((num_points % 2) != 0) {
308 #include <arm_neon.h>
/*
 * NEON kernel: for each complex point, multiplies an element read through
 * a_ptr by the complex conjugate of the element read through b_ptr and
 * stores the product through cVector.
 *
 * cVector    - output buffer (non-const); used directly as the write cursor.
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces, the
 * declarations of a_ptr/b_ptr and the per-iteration pointer advance in the
 * vector loop are not visible here. a_ptr/b_ptr are presumably cursors over
 * aVector/bVector - confirm against the full file.
 */
311 volk_32fc_x2_multiply_conjugate_32fc_neon(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
312 const lv_32fc_t* bVector,
unsigned int num_points)
/* Each vld2q_f32 reads 8 floats, i.e. 4 complex points per iteration. */
316 unsigned int quarter_points = num_points / 4;
317 float32x4x2_t a_val, b_val, c_val;
318 float32x4x2_t tmp_real, tmp_imag;
319 unsigned int number = 0;
321 for(number = 0; number < quarter_points; ++number) {
/* De-interleaving loads: val[0] holds the 4 real parts, val[1] the 4 imaginary parts. */
322 a_val = vld2q_f32((
float*)a_ptr);
323 b_val = vld2q_f32((
float*)b_ptr);
/* Conjugate b by negating its imaginary parts. */
324 b_val.val[1] = vnegq_f32(b_val.val[1]);
/* Prefetch hints 4 elements ahead of each input cursor (presumably the next iteration's data). */
325 __builtin_prefetch(a_ptr+4);
326 __builtin_prefetch(b_ptr+4);
/* tmp_real.val[0] = ar*br */
330 tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
/* tmp_real.val[1] = ai*(-bi) */
332 tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);
/* tmp_imag.val[0] = ar*(-bi) */
336 tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
/* tmp_imag.val[1] = ai*br */
338 tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
/* Real part: ar*br - ai*(-bi) = ar*br + ai*bi */
341 c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
/* Imaginary part: ar*(-bi) + ai*br = ai*br - ar*bi */
342 c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
/* Re-interleaving store of the 4 complex products. */
343 vst2q_f32((
float*)cVector, c_val);
/* Scalar tail for the remaining (num_points % 4) points.
 * NOTE(review): this line calls conj() whereas the other kernels in this
 * listing use lv_conj(); confirm whether the difference is intentional. */
350 for(number = quarter_points*4; number < num_points; number++){
351 *cVector++ = (*a_ptr++) * conj(*b_ptr++);
357 #ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel (the "_a_" named variant): for each of num_points
 * complex points, stores (element from aPtr) * lv_conj(element from bPtr)
 * through cPtr. The visible statements match the other generic kernel in
 * this listing.
 *
 * cVector    - output buffer (non-const).
 * aVector    - first input buffer (read-only).
 * bVector    - second input buffer (read-only); its elements are conjugated.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is fragmentary. The return type, braces and the
 * declarations of aPtr/bPtr/cPtr are not visible; they are presumably
 * cursors initialised from aVector/bVector/cVector - confirm.
 */
360 volk_32fc_x2_multiply_conjugate_32fc_a_generic(
lv_32fc_t* cVector,
const lv_32fc_t* aVector,
361 const lv_32fc_t* bVector,
unsigned int num_points)
366 unsigned int number = 0;
/* One product per iteration; all three cursors are post-incremented by one element. */
368 for(number = 0; number < num_points; number++){
369 *cPtr++ = (*aPtr++) *
lv_conj(*bPtr++);
#define lv_conj(x)
Definition: volk_complex.h:80
float complex lv_32fc_t
Definition: volk_complex.h:56