GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32fc_x2_conjugate_dot_prod_32fc.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2012, 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32fc_x2_conjugate_dot_prod_32fc
25  *
26  * \b Overview
27  *
28  * This block computes the conjugate dot product (or inner product)
29  * between two vectors, the \p input and \p taps vectors. Given a set
30  * of \p num_points taps, the result is the sum of products between
31  * the input vector and the conjugate of the taps. The result is a
32  * single value stored in the \p result address and is returned as a
33  * complex float.
34  *
35  * <b>Dispatcher Prototype</b>
36  * \code
37  * void volk_32fc_x2_conjugate_dot_prod_32fc(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points)
38  * \endcode
39  *
40  * \b Inputs
41  * \li input: vector of complex floats.
42  * \li taps: complex float taps.
43  * \li num_points: number of samples in both \p input and \p taps.
44  *
45  * \b Outputs
46  * \li result: pointer to a complex float value to hold the dot product result.
47  *
48  * \b Example
49  * \code
50  * int N = 10000;
51  *
52  * <FIXME>
53  *
54  * volk_32fc_x2_conjugate_dot_prod_32fc();
55  *
56  * \endcode
57  */
58 
59 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
60 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
61 
62 
63 #include<volk/volk_complex.h>
64 
65 
66 #ifdef LV_HAVE_GENERIC
67 
68 
69 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
70 
71  const unsigned int num_bytes = num_points*8;
72 
73  float * res = (float*) result;
74  float * in = (float*) input;
75  float * tp = (float*) taps;
76  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
77  unsigned int isodd = (num_bytes >> 3) &1;
78 
79 
80 
81  float sum0[2] = {0,0};
82  float sum1[2] = {0,0};
83  unsigned int i = 0;
84 
85 
86  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
87 
88  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
89  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
90  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
91  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
92 
93 
94  in += 4;
95  tp += 4;
96 
97  }
98 
99 
100  res[0] = sum0[0] + sum1[0];
101  res[1] = sum0[1] + sum1[1];
102 
103 
104 
105  for(i = 0; i < isodd; ++i) {
106 
107 
108  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
109 
110  }
111  /*
112  for(i = 0; i < num_bytes >> 3; ++i) {
113  *result += input[i] * conjf(taps[i]);
114  }
115  */
116 }
117 
118 #endif /*LV_HAVE_GENERIC*/
119 
120 #ifdef LV_HAVE_SSE3
121 
122 #include <xmmintrin.h>
123 #include <pmmintrin.h>
124 #include <mmintrin.h>
125 
126 
127 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
128 
129  unsigned int num_bytes = num_points*8;
130 
131  // Variable never used?
132  //__VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
133 
134  union HalfMask {
135  uint32_t intRep[4];
136  __m128 vec;
137  } halfMask;
138 
139  union NegMask {
140  int intRep[4];
141  __m128 vec;
142  } negMask;
143 
144  unsigned int offset = 0;
145  float Rsum=0, Isum=0;
146  float Im,Re;
147 
148  __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
149  __m128 zv = {0,0,0,0};
150 
151  halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
152  halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
153 
154  negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
155  negMask.intRep[1] = negMask.intRep[3] = 0;
156 
157  // main loop
158  while(num_bytes >= 4*sizeof(float)){
159 
160  in1 = _mm_loadu_ps( (float*) (input+offset) );
161  in2 = _mm_loadu_ps( (float*) (taps+offset) );
162  Rv = _mm_mul_ps(in1, in2);
163  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
164  Iv = _mm_mul_ps(in1, fehg);
165  Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
166  Ivm = _mm_xor_ps( negMask.vec, Iv );
167  Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
168  _mm_store_ss( &Im, Is );
169  _mm_store_ss( &Re, Rs );
170  num_bytes -= 4*sizeof(float);
171  offset += 2;
172  Rsum += Re;
173  Isum += Im;
174  }
175 
176  // handle the last complex case ...
177  if(num_bytes > 0){
178 
179  if(num_bytes != 4){
180  // bad things are happening
181  }
182 
183  in1 = _mm_loadu_ps( (float*) (input+offset) );
184  in2 = _mm_loadu_ps( (float*) (taps+offset) );
185  Rv = _mm_and_ps(_mm_mul_ps(in1, in2), halfMask.vec);
186  fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
187  Iv = _mm_and_ps(_mm_mul_ps(in1, fehg), halfMask.vec);
188  Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
189  Ivm = _mm_xor_ps( negMask.vec, Iv );
190  Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
191  _mm_store_ss( &Im, Is );
192  _mm_store_ss( &Re, Rs );
193  Rsum += Re;
194  Isum += Im;
195  }
196 
197  result[0] = lv_cmake(Rsum,Isum);
198  return;
199 }
200 
201 #endif /*LV_HAVE_SSE3*/
202 
203 #ifdef LV_HAVE_NEON
204 #include <arm_neon.h>
205 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_neon(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
206 
207  unsigned int quarter_points = num_points / 4;
208  unsigned int number;
209 
210  lv_32fc_t* a_ptr = (lv_32fc_t*) taps;
211  lv_32fc_t* b_ptr = (lv_32fc_t*) input;
212  // for 2-lane vectors, 1st lane holds the real part,
213  // 2nd lane holds the imaginary part
214  float32x4x2_t a_val, b_val, accumulator;
215  float32x4x2_t tmp_imag;
216  accumulator.val[0] = vdupq_n_f32(0);
217  accumulator.val[1] = vdupq_n_f32(0);
218 
219  for(number = 0; number < quarter_points; ++number) {
220  a_val = vld2q_f32((float*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
221  b_val = vld2q_f32((float*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
222  __builtin_prefetch(a_ptr+8);
223  __builtin_prefetch(b_ptr+8);
224 
225  // do the first multiply
226  tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);
227  tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
228 
229  // use multiply accumulate/subtract to get result
230  tmp_imag.val[1] = vmlsq_f32(tmp_imag.val[1], a_val.val[0], b_val.val[1]);
231  tmp_imag.val[0] = vmlaq_f32(tmp_imag.val[0], a_val.val[1], b_val.val[1]);
232 
233  accumulator.val[0] = vaddq_f32(accumulator.val[0], tmp_imag.val[0]);
234  accumulator.val[1] = vaddq_f32(accumulator.val[1], tmp_imag.val[1]);
235 
236  // increment pointers
237  a_ptr += 4;
238  b_ptr += 4;
239  }
240  lv_32fc_t accum_result[4];
241  vst2q_f32((float*)accum_result, accumulator);
242  *result = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
243 
244  // tail case
245  for(number = quarter_points*4; number < num_points; ++number) {
246  *result += (*a_ptr++) * lv_conj(*b_ptr++);
247  }
248  *result = lv_conj(*result);
249 
250 }
251 #endif /*LV_HAVE_NEON*/
252 
253 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
254 
255 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
256 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H
257 
258 #include <volk/volk_common.h>
259 #include<volk/volk_complex.h>
260 #include<stdio.h>
261 
262 
263 #ifdef LV_HAVE_GENERIC
264 
265 
266 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
267 
268  const unsigned int num_bytes = num_points*8;
269 
270  float * res = (float*) result;
271  float * in = (float*) input;
272  float * tp = (float*) taps;
273  unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
274  unsigned int isodd = (num_bytes >> 3) &1;
275 
276 
277 
278  float sum0[2] = {0,0};
279  float sum1[2] = {0,0};
280  unsigned int i = 0;
281 
282 
283  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
284 
285 
286  sum0[0] += in[0] * tp[0] + in[1] * tp[1];
287  sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
288  sum1[0] += in[2] * tp[2] + in[3] * tp[3];
289  sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
290 
291 
292  in += 4;
293  tp += 4;
294 
295  }
296 
297 
298  res[0] = sum0[0] + sum1[0];
299  res[1] = sum0[1] + sum1[1];
300 
301 
302 
303  for(i = 0; i < isodd; ++i) {
304 
305 
306  *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
307 
308  }
309  /*
310  for(i = 0; i < num_bytes >> 3; ++i) {
311  *result += input[i] * conjf(taps[i]);
312  }
313  */
314 }
315 
316 #endif /*LV_HAVE_GENERIC*/
317 
318 
319 #if LV_HAVE_SSE && LV_HAVE_64
320 
321 
322 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
323 
324  const unsigned int num_bytes = num_points*8;
325 
326  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
327 
328 
329 
330 
331  asm volatile
332  (
333  "# ccomplex_conjugate_dotprod_generic (float* result, const float *input,\n\t"
334  "# const float *taps, unsigned num_bytes)\n\t"
335  "# float sum0 = 0;\n\t"
336  "# float sum1 = 0;\n\t"
337  "# float sum2 = 0;\n\t"
338  "# float sum3 = 0;\n\t"
339  "# do {\n\t"
340  "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
341  "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
342  "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
343  "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
344  "# input += 4;\n\t"
345  "# taps += 4; \n\t"
346  "# } while (--n_2_ccomplex_blocks != 0);\n\t"
347  "# result[0] = sum0 + sum2;\n\t"
348  "# result[1] = sum1 + sum3;\n\t"
349  "# TODO: prefetch and better scheduling\n\t"
350  " xor %%r9, %%r9\n\t"
351  " xor %%r10, %%r10\n\t"
352  " movq %[conjugator], %%r9\n\t"
353  " movq %%rcx, %%rax\n\t"
354  " movaps 0(%%r9), %%xmm8\n\t"
355  " movq %%rcx, %%r8\n\t"
356  " movq %[rsi], %%r9\n\t"
357  " movq %[rdx], %%r10\n\t"
358  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
359  " movaps 0(%%r9), %%xmm0\n\t"
360  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
361  " movups 0(%%r10), %%xmm2\n\t"
362  " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
363  " shr $4, %%r8\n\t"
364  " xorps %%xmm8, %%xmm2\n\t"
365  " jmp .%=L1_test\n\t"
366  " # 4 taps / loop\n\t"
367  " # something like ?? cycles / loop\n\t"
368  ".%=Loop1: \n\t"
369  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
370  "# movaps (%%r9), %%xmmA\n\t"
371  "# movaps (%%r10), %%xmmB\n\t"
372  "# movaps %%xmmA, %%xmmZ\n\t"
373  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
374  "# mulps %%xmmB, %%xmmA\n\t"
375  "# mulps %%xmmZ, %%xmmB\n\t"
376  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
377  "# xorps %%xmmPN, %%xmmA\n\t"
378  "# movaps %%xmmA, %%xmmZ\n\t"
379  "# unpcklps %%xmmB, %%xmmA\n\t"
380  "# unpckhps %%xmmB, %%xmmZ\n\t"
381  "# movaps %%xmmZ, %%xmmY\n\t"
382  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
383  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
384  "# addps %%xmmZ, %%xmmA\n\t"
385  "# addps %%xmmA, %%xmmC\n\t"
386  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
387  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
388  " movaps 16(%%r9), %%xmm1\n\t"
389  " movaps %%xmm0, %%xmm4\n\t"
390  " mulps %%xmm2, %%xmm0\n\t"
391  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
392  " movaps 16(%%r10), %%xmm3\n\t"
393  " movaps %%xmm1, %%xmm5\n\t"
394  " xorps %%xmm8, %%xmm3\n\t"
395  " addps %%xmm0, %%xmm6\n\t"
396  " mulps %%xmm3, %%xmm1\n\t"
397  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
398  " addps %%xmm1, %%xmm6\n\t"
399  " mulps %%xmm4, %%xmm2\n\t"
400  " movaps 32(%%r9), %%xmm0\n\t"
401  " addps %%xmm2, %%xmm7\n\t"
402  " mulps %%xmm5, %%xmm3\n\t"
403  " add $32, %%r9\n\t"
404  " movaps 32(%%r10), %%xmm2\n\t"
405  " addps %%xmm3, %%xmm7\n\t"
406  " add $32, %%r10\n\t"
407  " xorps %%xmm8, %%xmm2\n\t"
408  ".%=L1_test:\n\t"
409  " dec %%rax\n\t"
410  " jge .%=Loop1\n\t"
411  " # We've handled the bulk of multiplies up to here.\n\t"
412  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
413  " # If so, we've got 2 more taps to do.\n\t"
414  " and $1, %%r8\n\t"
415  " je .%=Leven\n\t"
416  " # The count was odd, do 2 more taps.\n\t"
417  " # Note that we've already got mm0/mm2 preloaded\n\t"
418  " # from the main loop.\n\t"
419  " movaps %%xmm0, %%xmm4\n\t"
420  " mulps %%xmm2, %%xmm0\n\t"
421  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
422  " addps %%xmm0, %%xmm6\n\t"
423  " mulps %%xmm4, %%xmm2\n\t"
424  " addps %%xmm2, %%xmm7\n\t"
425  ".%=Leven:\n\t"
426  " # neg inversor\n\t"
427  " xorps %%xmm1, %%xmm1\n\t"
428  " mov $0x80000000, %%r9\n\t"
429  " movd %%r9, %%xmm1\n\t"
430  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
431  " # pfpnacc\n\t"
432  " xorps %%xmm1, %%xmm6\n\t"
433  " movaps %%xmm6, %%xmm2\n\t"
434  " unpcklps %%xmm7, %%xmm6\n\t"
435  " unpckhps %%xmm7, %%xmm2\n\t"
436  " movaps %%xmm2, %%xmm3\n\t"
437  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
438  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
439  " addps %%xmm2, %%xmm6\n\t"
440  " # xmm6 = r1 i2 r3 i4\n\t"
441  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
442  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
443  " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
444  :
445  :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result), [conjugator] "r" (conjugator)
446  :"rax", "r8", "r9", "r10"
447  );
448 
449 
450  int getem = num_bytes % 16;
451 
452 
453  for(; getem > 0; getem -= 8) {
454 
455 
456  *result += (input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]));
457 
458  }
459 
460  return;
461 }
462 #endif
463 
464 #if LV_HAVE_SSE && LV_HAVE_32
465 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
466 
467  const unsigned int num_bytes = num_points*8;
468 
469  __VOLK_ATTR_ALIGNED(16) static const uint32_t conjugator[4]= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
470 
471  int bound = num_bytes >> 4;
472  int leftovers = num_bytes % 16;
473 
474 
475  asm volatile
476  (
477  " #pushl %%ebp\n\t"
478  " #movl %%esp, %%ebp\n\t"
479  " #movl 12(%%ebp), %%eax # input\n\t"
480  " #movl 16(%%ebp), %%edx # taps\n\t"
481  " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
482  " movaps 0(%[conjugator]), %%xmm1\n\t"
483  " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
484  " movaps 0(%[eax]), %%xmm0\n\t"
485  " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
486  " movaps 0(%[edx]), %%xmm2\n\t"
487  " movl %[ecx], (%[out])\n\t"
488  " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"
489 
490  " xorps %%xmm1, %%xmm2\n\t"
491  " jmp .%=L1_test\n\t"
492  " # 4 taps / loop\n\t"
493  " # something like ?? cycles / loop\n\t"
494  ".%=Loop1: \n\t"
495  "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
496  "# movaps (%[eax]), %%xmmA\n\t"
497  "# movaps (%[edx]), %%xmmB\n\t"
498  "# movaps %%xmmA, %%xmmZ\n\t"
499  "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
500  "# mulps %%xmmB, %%xmmA\n\t"
501  "# mulps %%xmmZ, %%xmmB\n\t"
502  "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
503  "# xorps %%xmmPN, %%xmmA\n\t"
504  "# movaps %%xmmA, %%xmmZ\n\t"
505  "# unpcklps %%xmmB, %%xmmA\n\t"
506  "# unpckhps %%xmmB, %%xmmZ\n\t"
507  "# movaps %%xmmZ, %%xmmY\n\t"
508  "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
509  "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
510  "# addps %%xmmZ, %%xmmA\n\t"
511  "# addps %%xmmA, %%xmmC\n\t"
512  "# A=xmm0, B=xmm2, Z=xmm4\n\t"
513  "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
514  " movaps 16(%[edx]), %%xmm3\n\t"
515  " movaps %%xmm0, %%xmm4\n\t"
516  " xorps %%xmm1, %%xmm3\n\t"
517  " mulps %%xmm2, %%xmm0\n\t"
518  " movaps 16(%[eax]), %%xmm1\n\t"
519  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
520  " movaps %%xmm1, %%xmm5\n\t"
521  " addps %%xmm0, %%xmm6\n\t"
522  " mulps %%xmm3, %%xmm1\n\t"
523  " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
524  " addps %%xmm1, %%xmm6\n\t"
525  " movaps 0(%[conjugator]), %%xmm1\n\t"
526  " mulps %%xmm4, %%xmm2\n\t"
527  " movaps 32(%[eax]), %%xmm0\n\t"
528  " addps %%xmm2, %%xmm7\n\t"
529  " mulps %%xmm5, %%xmm3\n\t"
530  " addl $32, %[eax]\n\t"
531  " movaps 32(%[edx]), %%xmm2\n\t"
532  " addps %%xmm3, %%xmm7\n\t"
533  " xorps %%xmm1, %%xmm2\n\t"
534  " addl $32, %[edx]\n\t"
535  ".%=L1_test:\n\t"
536  " decl %[ecx]\n\t"
537  " jge .%=Loop1\n\t"
538  " # We've handled the bulk of multiplies up to here.\n\t"
539  " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
540  " # If so, we've got 2 more taps to do.\n\t"
541  " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
542  " shrl $4, %[ecx]\n\t"
543  " andl $1, %[ecx]\n\t"
544  " je .%=Leven\n\t"
545  " # The count was odd, do 2 more taps.\n\t"
546  " # Note that we've already got mm0/mm2 preloaded\n\t"
547  " # from the main loop.\n\t"
548  " movaps %%xmm0, %%xmm4\n\t"
549  " mulps %%xmm2, %%xmm0\n\t"
550  " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
551  " addps %%xmm0, %%xmm6\n\t"
552  " mulps %%xmm4, %%xmm2\n\t"
553  " addps %%xmm2, %%xmm7\n\t"
554  ".%=Leven:\n\t"
555  " # neg inversor\n\t"
556  " #movl 8(%%ebp), %[eax] \n\t"
557  " xorps %%xmm1, %%xmm1\n\t"
558  " movl $0x80000000, (%[out])\n\t"
559  " movss (%[out]), %%xmm1\n\t"
560  " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
561  " # pfpnacc\n\t"
562  " xorps %%xmm1, %%xmm6\n\t"
563  " movaps %%xmm6, %%xmm2\n\t"
564  " unpcklps %%xmm7, %%xmm6\n\t"
565  " unpckhps %%xmm7, %%xmm2\n\t"
566  " movaps %%xmm2, %%xmm3\n\t"
567  " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
568  " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
569  " addps %%xmm2, %%xmm6\n\t"
570  " # xmm6 = r1 i2 r3 i4\n\t"
571  " #movl 8(%%ebp), %[eax] # @result\n\t"
572  " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
573  " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
574  " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) to memory\n\t"
575  " #popl %%ebp\n\t"
576  :
577  : [eax] "r" (input), [edx] "r" (taps), [ecx] "r" (num_bytes), [out] "r" (result), [conjugator] "r" (conjugator)
578  );
579 
580 
581 
582 
583  printf("%d, %d\n", leftovers, bound);
584 
585  for(; leftovers > 0; leftovers -= 8) {
586 
587 
588  *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
589 
590  }
591 
592  return;
593 
594 
595 
596 
597 
598 
599 }
600 
601 #endif /*LV_HAVE_SSE*/
602 
603 
604 
605 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
#define lv_conj(x)
Definition: volk_complex.h:80
unsigned int uint32_t
Definition: stdint.h:80
#define lv_cmake(r, i)
Definition: volk_complex.h:59
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:27
static const float taps[NSTEPS+1][NTAPS]
Definition: interpolator_taps.h:9
float complex lv_32fc_t
Definition: volk_complex.h:56
uint32_t i[4]
Definition: volk_common.h:80