71 #ifndef INCLUDED_volk_32f_x2_multiply_32f_u_H
72 #define INCLUDED_volk_32f_x2_multiply_32f_u_H
78 #include <xmmintrin.h>
81 volk_32f_x2_multiply_32f_u_sse(
float* cVector,
const float* aVector,
82 const float* bVector,
unsigned int num_points)
84 unsigned int number = 0;
85 const unsigned int quarterPoints = num_points / 4;
87 float* cPtr = cVector;
88 const float* aPtr = aVector;
89 const float* bPtr= bVector;
91 __m128 aVal, bVal, cVal;
92 for(;number < quarterPoints; number++){
94 aVal = _mm_loadu_ps(aPtr);
95 bVal = _mm_loadu_ps(bPtr);
97 cVal = _mm_mul_ps(aVal, bVal);
99 _mm_storeu_ps(cPtr,cVal);
106 number = quarterPoints * 4;
107 for(;number < num_points; number++){
108 *cPtr++ = (*aPtr++) * (*bPtr++);
115 #include <immintrin.h>
118 volk_32f_x2_multiply_32f_u_avx(
float* cVector,
const float* aVector,
119 const float* bVector,
unsigned int num_points)
121 unsigned int number = 0;
122 const unsigned int eighthPoints = num_points / 8;
124 float* cPtr = cVector;
125 const float* aPtr = aVector;
126 const float* bPtr= bVector;
128 __m256 aVal, bVal, cVal;
129 for(;number < eighthPoints; number++){
131 aVal = _mm256_loadu_ps(aPtr);
132 bVal = _mm256_loadu_ps(bPtr);
134 cVal = _mm256_mul_ps(aVal, bVal);
136 _mm256_storeu_ps(cPtr,cVal);
143 number = eighthPoints * 8;
144 for(;number < num_points; number++){
145 *cPtr++ = (*aPtr++) * (*bPtr++);
151 #ifdef LV_HAVE_GENERIC
154 volk_32f_x2_multiply_32f_generic(
float* cVector,
const float* aVector,
155 const float* bVector,
unsigned int num_points)
157 float* cPtr = cVector;
158 const float* aPtr = aVector;
159 const float* bPtr= bVector;
160 unsigned int number = 0;
162 for(number = 0; number < num_points; number++){
163 *cPtr++ = (*aPtr++) * (*bPtr++);
172 #ifndef INCLUDED_volk_32f_x2_multiply_32f_a_H
173 #define INCLUDED_volk_32f_x2_multiply_32f_a_H
179 #include <xmmintrin.h>
182 volk_32f_x2_multiply_32f_a_sse(
float* cVector,
const float* aVector,
183 const float* bVector,
unsigned int num_points)
185 unsigned int number = 0;
186 const unsigned int quarterPoints = num_points / 4;
188 float* cPtr = cVector;
189 const float* aPtr = aVector;
190 const float* bPtr= bVector;
192 __m128 aVal, bVal, cVal;
193 for(;number < quarterPoints; number++){
195 aVal = _mm_load_ps(aPtr);
196 bVal = _mm_load_ps(bPtr);
198 cVal = _mm_mul_ps(aVal, bVal);
200 _mm_store_ps(cPtr,cVal);
207 number = quarterPoints * 4;
208 for(;number < num_points; number++){
209 *cPtr++ = (*aPtr++) * (*bPtr++);
216 #include <immintrin.h>
219 volk_32f_x2_multiply_32f_a_avx(
float* cVector,
const float* aVector,
220 const float* bVector,
unsigned int num_points)
222 unsigned int number = 0;
223 const unsigned int eighthPoints = num_points / 8;
225 float* cPtr = cVector;
226 const float* aPtr = aVector;
227 const float* bPtr= bVector;
229 __m256 aVal, bVal, cVal;
230 for(;number < eighthPoints; number++){
232 aVal = _mm256_load_ps(aPtr);
233 bVal = _mm256_load_ps(bPtr);
235 cVal = _mm256_mul_ps(aVal, bVal);
237 _mm256_store_ps(cPtr,cVal);
244 number = eighthPoints * 8;
245 for(;number < num_points; number++){
246 *cPtr++ = (*aPtr++) * (*bPtr++);
253 #include <arm_neon.h>
256 volk_32f_x2_multiply_32f_neon(
float* cVector,
const float* aVector,
257 const float* bVector,
unsigned int num_points)
259 const unsigned int quarter_points = num_points / 4;
261 float32x4_t avec, bvec, cvec;
262 for(number=0; number < quarter_points; ++number) {
263 avec = vld1q_f32(aVector);
264 bvec = vld1q_f32(bVector);
265 cvec = vmulq_f32(avec, bvec);
266 vst1q_f32(cVector, cvec);
271 for(number=quarter_points*4; number < num_points; ++number) {
272 *cVector++ = *aVector++ * *bVector++;
278 #ifdef LV_HAVE_GENERIC
281 volk_32f_x2_multiply_32f_a_generic(
float* cVector,
const float* aVector,
282 const float* bVector,
unsigned int num_points)
284 float* cPtr = cVector;
285 const float* aPtr = aVector;
286 const float* bPtr= bVector;
287 unsigned int number = 0;
289 for(number = 0; number < num_points; number++){
290 *cPtr++ = (*aPtr++) * (*bPtr++);
298 volk_32f_x2_multiply_32f_a_orc_impl(
float* cVector,
const float* aVector,
299 const float* bVector,
unsigned int num_points);
302 volk_32f_x2_multiply_32f_u_orc(
float* cVector,
const float* aVector,
303 const float* bVector,
unsigned int num_points)
305 volk_32f_x2_multiply_32f_a_orc_impl(cVector, aVector, bVector, num_points);