79 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
80 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
/*
 * SSE3 kernel: for each complex point, writes
 *   scalar * (re(src0[0] - point)^2 + im(src0[0] - point)^2)
 * to target, i.e. the squared distance from the single reference symbol
 * src0[0], scaled by `scalar`.
 *
 * target     - output floats; written with _mm_store_ps, so it must be
 *              16-byte aligned.
 * src0       - only the first complex value (two floats) is read.
 * points     - input complex points; read with _mm_load_ps, so it must be
 *              16-byte aligned.
 * scalar     - factor applied to every squared distance.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is incomplete - the embedded line numbers skip.
 * The return type, braces, the declarations of `diff` and `i`, and any
 * `points` / `target` pointer advances are not visible here; comments below
 * that depend on them are marked as assumptions.
 */
92 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(
float* target,
lv_32fc_t* src0,
lv_32fc_t* points,
93 float scalar,
unsigned int num_points)
/* Each complex point occupies 8 bytes (two floats). */
95 const unsigned int num_bytes = num_points*8;
97 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
/* Zero both float components of the scalar-path temporary `diff`. */
100 memset(&diff, 0x0, 2*
sizeof(
float));
/* bound: number of whole 32-byte groups (4 points each). */
103 int bound = num_bytes >> 5;
/* leftovers0: 1 if a 16-byte pair of points remains after the groups of 4. */
104 int leftovers0 = (num_bytes >> 4) & 1;
/* leftovers1: 1 if a single 8-byte point remains after that. */
105 int leftovers1 = (num_bytes >> 3) & 1;
/* Build xmm1 = [re, im, re, im] of src0[0]: load one complex value into the
 * low half, then (below) copy the low half into the high half. */
108 xmm1 = _mm_setzero_ps();
109 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
/* Pre-load the first four points (two per register) before entering the loop.
 * NOTE(review): these loads read points[0..3] regardless of num_points -
 * confirm callers always supply at least four points or padded buffers. */
110 xmm2 = _mm_load_ps((
float*)&points[0]);
/* Broadcast the scale factor to all four lanes. */
111 xmm8 = _mm_load1_ps(&scalar);
112 xmm1 = _mm_movelh_ps(xmm1, xmm1);
113 xmm3 = _mm_load_ps((
float*)&points[2]);
/* Main loop runs bound - 1 times: the data for the current iteration was
 * loaded ahead of time, and the final group of four is finished after the
 * loop. */
115 for(; i < bound - 1; ++i) {
/* Component-wise differences for four points. */
116 xmm4 = _mm_sub_ps(xmm1, xmm2);
117 xmm5 = _mm_sub_ps(xmm1, xmm3);
/* Square each real / imaginary difference. */
119 xmm6 = _mm_mul_ps(xmm4, xmm4);
120 xmm7 = _mm_mul_ps(xmm5, xmm5);
/* Load the next two points; presumably `points` was advanced on a line not
 * shown in this listing - TODO confirm. */
122 xmm2 = _mm_load_ps((
float*)&points[0]);
/* Horizontal add pairs re^2 + im^2, giving four squared distances. */
124 xmm4 = _mm_hadd_ps(xmm6, xmm7);
126 xmm3 = _mm_load_ps((
float*)&points[2]);
/* Apply the scale factor and store four results. */
128 xmm4 = _mm_mul_ps(xmm4, xmm8);
130 _mm_store_ps(target, xmm4);
/* Finish the last pre-loaded group of four points (same arithmetic as the
 * loop body, without further loads).
 * NOTE(review): if this section is unconditional in the full source, it
 * stores four floats to target even when num_points < 4 - confirm. */
135 xmm4 = _mm_sub_ps(xmm1, xmm2);
136 xmm5 = _mm_sub_ps(xmm1, xmm3);
139 xmm6 = _mm_mul_ps(xmm4, xmm4);
140 xmm7 = _mm_mul_ps(xmm5, xmm5);
142 xmm4 = _mm_hadd_ps(xmm6, xmm7);
144 xmm4 = _mm_mul_ps(xmm4, xmm8);
146 _mm_store_ps(target, xmm4);
/* Remaining pair of points (runs at most once, since leftovers0 is 0 or 1). */
150 for(i = 0; i < leftovers0; ++i) {
151 xmm2 = _mm_load_ps((
float*)&points[0]);
153 xmm4 = _mm_sub_ps(xmm1, xmm2);
157 xmm6 = _mm_mul_ps(xmm4, xmm4);
/* hadd of a register with itself yields [d0, d1, d0, d1]. */
159 xmm4 = _mm_hadd_ps(xmm6, xmm6);
161 xmm4 = _mm_mul_ps(xmm4, xmm8);
/* Store only two floats; the upper half holds the same two results as the
 * lower half. */
163 _mm_storeh_pi((__m64*)target, xmm4);
/* Remaining single point, handled with scalar complex arithmetic (runs at
 * most once). The rest of this loop body is not visible in this listing. */
168 for(i = 0; i < leftovers1; ++i) {
170 diff = src0[0] - points[0];
181 #ifdef LV_HAVE_GENERIC
/*
 * Portable (non-SIMD) variant: walks every complex point and forms its
 * difference from the single reference symbol src0[0].
 *
 * target     - output array (its writes are not visible in this listing).
 * src0       - only src0[0] is read.
 * points     - input complex points, indexed 0 .. num_points - 1.
 * scalar     - scale factor; presumably applied to each squared distance as
 *              in the SSE3 variant - TODO confirm against the full source.
 * num_points - number of complex points to process.
 *
 * NOTE(review): this listing is incomplete - the return type, braces, local
 * declarations and the remainder of the loop body are not shown.
 */
183 volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(
float* target,
lv_32fc_t* src0,
lv_32fc_t* points,
184 float scalar,
unsigned int num_points)
/* Each complex point occupies 8 bytes (two floats). */
186 const unsigned int num_bytes = num_points*8;
/* num_bytes >> 3 converts the byte count back to a point count, so this loop
 * visits each point once. */
192 for(; i < num_bytes >> 3; ++i) {
/* Complex difference between the reference symbol and the i-th point. */
193 diff = src0[0] - points[i];
float complex lv_32fc_t
Definition: volk_complex.h:56
#define lv_creal(x)
Definition: volk_complex.h:76
#define lv_cimag(x)
Definition: volk_complex.h:78