89 #ifndef INCLUDED_volk_32f_log2_32f_a_H
90 #define INCLUDED_volk_32f_log2_32f_a_H
/*
 * Polynomial helpers for the SSE kernels.
 *
 * POLYn(x, c0, ..., cn) evaluates c0 + c1*x + ... + cn*x^n on a packed
 * __m128 using Horner's scheme. Every coefficient is a scalar float that is
 * broadcast with _mm_set1_ps. POLY0 ignores x; it only terminates the chain.
 *
 * x is expanded once per term, so pass a side-effect-free expression.
 */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) \
  _mm_add_ps(_mm_mul_ps(POLY0(x, c1), (x)), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) \
  _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), (x)), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
  _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), (x)), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
  _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), (x)), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
  _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), (x)), _mm_set1_ps(c0))

/*
 * Selects the mantissa polynomial used by the SSE kernels. A value of N picks
 * POLY(N-1), i.e. a polynomial with N coefficients. The kernels provide
 * coefficient sets for 3, 4, 5 and 6.
 */
#define LOG_POLY_DEGREE 6
106 #ifdef LV_HAVE_GENERIC
109 volk_32f_log2_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points)
111 float* bPtr = bVector;
112 const float* aPtr = aVector;
113 unsigned int number = 0;
115 for(number = 0; number < num_points; number++) {
116 *bPtr++ = log2(*aPtr++);
123 #ifdef LV_HAVE_SSE4_1
124 #include <smmintrin.h>
127 volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
129 float* bPtr = bVector;
130 const float* aPtr = aVector;
132 unsigned int number = 0;
133 const unsigned int quarterPoints = num_points / 4;
135 __m128 aVal, bVal, mantissa, frac, leadingOne;
138 for(;number < quarterPoints; number++){
140 aVal = _mm_load_ps(aPtr);
141 bias = _mm_set1_epi32(127);
142 leadingOne = _mm_set1_ps(1.0f);
143 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
144 bVal = _mm_cvtepi32_ps(exp);
147 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
149 #if LOG_POLY_DEGREE == 6
150 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
151 #elif LOG_POLY_DEGREE == 5
152 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
153 #elif LOG_POLY_DEGREE == 4
154 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
155 #elif LOG_POLY_DEGREE == 3
156 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
161 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
162 _mm_store_ps(bPtr, bVal);
168 number = quarterPoints * 4;
169 for(;number < num_points; number++){
170 *bPtr++ = log2(*aPtr++);
178 #include <arm_neon.h>
/*
 * Declares, as locals of the enclosing scope, the constants that
 * VLOG2Q_NEON_F32 refers to by name. Expand it once before the first use of
 * VLOG2Q_NEON_F32; the expansion already ends in ';'.
 */
#define VLOG2Q_NEON_PREAMBLE() \
  /* bit 23: the implicit leading one of a single-precision significand */ \
  int32x4_t one = vdupq_n_s32(0x00800000); \
  /* polynomial coefficients; p<k> multiplies significand^k */ \
  float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
  float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
  float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
  float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
  float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
  float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
  float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
  /* IEEE-754 single-precision field masks and exponent bias */ \
  int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
  int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
  int32x4_t exp_bias = vdupq_n_s32(127);
/*
 * Approximates log2 of four floats whose raw bits are held in `aval`
 * (an int32x4_t) and stores the result in `log2_approx` (a float32x4_t).
 *
 * Requires the names declared by VLOG2Q_NEON_PREAMBLE to be in scope.
 * The expansion declares its own temporaries (exponent_i, significand_i,
 * significand_f, exponent_f, tmp1, sig_2 .. sig_6) in the enclosing scope,
 * so it can be used at most once per block.
 *
 * Result: exponent + p0 + p1*s + p2*s^2 + ... + p6*s^6, with s the
 * significand 1.m. The sign bit of the input is not used.
 */
#define VLOG2Q_NEON_F32(log2_approx, aval) \
  /* split the raw bits into exponent and mantissa fields */ \
  int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
  int32x4_t significand_i = vandq_s32(aval, sig_mask); \
  exponent_i = vshrq_n_s32(exponent_i, 23); \
  /* restore the implicit leading one, then read the 24-bit integer as */ \
  /* fixed point with 23 fractional bits: s = 1.m */ \
  significand_i = vorrq_s32(one, significand_i); \
  float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
  /* unbiased exponent as float */ \
  exponent_i = vsubq_s32(exponent_i, exp_bias); \
  float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
  /* accumulate the polynomial term by term */ \
  log2_approx = vaddq_f32(exponent_f, p0); \
  float32x4_t tmp1 = vmulq_f32(significand_f, p1); \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
  tmp1 = vmulq_f32(sig_2, p2); \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
  tmp1 = vmulq_f32(sig_3, p3); \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
  tmp1 = vmulq_f32(sig_4, p4); \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
  tmp1 = vmulq_f32(sig_5, p5); \
  log2_approx = vaddq_f32(log2_approx, tmp1); \
  float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
  tmp1 = vmulq_f32(sig_6, p6); \
  log2_approx = vaddq_f32(log2_approx, tmp1);
233 volk_32f_log2_32f_neon(
float* bVector,
const float* aVector,
unsigned int num_points)
235 float* bPtr = bVector;
236 const float* aPtr = aVector;
238 const unsigned int quarterPoints = num_points / 4;
241 float32x4_t log2_approx;
243 VLOG2Q_NEON_PREAMBLE()
252 for(number = 0; number < quarterPoints; ++number){
254 aval = vld1q_s32((
int*)aPtr);
256 VLOG2Q_NEON_F32(log2_approx, aval)
258 vst1q_f32(bPtr, log2_approx);
264 for(number = quarterPoints * 4; number < num_points; number++){
265 *bPtr++ = log2(*aPtr++);
274 #ifndef INCLUDED_volk_32f_log2_32f_u_H
275 #define INCLUDED_volk_32f_log2_32f_u_H
278 #ifdef LV_HAVE_GENERIC
281 volk_32f_log2_32f_u_generic(
float* bVector,
const float* aVector,
unsigned int num_points)
283 float* bPtr = bVector;
284 const float* aPtr = aVector;
285 unsigned int number = 0;
287 for(number = 0; number < num_points; number++){
288 *bPtr++ = log2(*aPtr++);
295 #ifdef LV_HAVE_SSE4_1
296 #include <smmintrin.h>
299 volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
301 float* bPtr = bVector;
302 const float* aPtr = aVector;
304 unsigned int number = 0;
305 const unsigned int quarterPoints = num_points / 4;
307 __m128 aVal, bVal, mantissa, frac, leadingOne;
310 for(;number < quarterPoints; number++){
312 aVal = _mm_loadu_ps(aPtr);
313 bias = _mm_set1_epi32(127);
314 leadingOne = _mm_set1_ps(1.0f);
315 exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23), bias);
316 bVal = _mm_cvtepi32_ps(exp);
319 frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
321 #if LOG_POLY_DEGREE == 6
322 mantissa =
POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
323 #elif LOG_POLY_DEGREE == 5
324 mantissa =
POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
325 #elif LOG_POLY_DEGREE == 4
326 mantissa =
POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
327 #elif LOG_POLY_DEGREE == 3
328 mantissa =
POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
333 bVal = _mm_add_ps(bVal, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
334 _mm_storeu_ps(bPtr, bVal);
340 number = quarterPoints * 4;
341 for(;number < num_points; number++){
342 *bPtr++ = log2(*aPtr++);
/*
 * Doxygen cross-references (documentation residue, not code):
 *   POLY3(x, c0, c1, c2, c3)          Definition: volk_32f_log2_32f.h:100
 *   POLY4(x, c0, c1, c2, c3, c4)      Definition: volk_32f_log2_32f.h:101
 *   POLY2(x, c0, c1, c2)              Definition: volk_32f_log2_32f.h:99
 *   POLY5(x, c0, c1, c2, c3, c4, c5)  Definition: volk_32f_log2_32f.h:102
 */