77 #ifndef INCLUDED_volk_32f_atan_32f_a_H
78 #define INCLUDED_volk_32f_atan_32f_a_H
81 #include <smmintrin.h>
84 volk_32f_atan_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
89 unsigned int number = 0;
90 unsigned int quarterPoints = num_points / 4;
93 __m128 aVal, pio2, x, y, z, arctangent;
94 __m128 fzeroes, fones, ftwos, ffours, condition;
96 pio2 = _mm_set1_ps(3.14159265358979323846/2);
97 fzeroes = _mm_setzero_ps();
98 fones = _mm_set1_ps(1.0);
99 ftwos = _mm_set1_ps(2.0);
100 ffours = _mm_set1_ps(4.0);
102 for(;number < quarterPoints; number++){
103 aVal = _mm_load_ps(aPtr);
105 condition = _mm_cmplt_ps(z, fzeroes);
106 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
108 condition = _mm_cmplt_ps(z, fones);
109 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
111 for(i = 0; i < 2; i++){
112 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
114 x = _mm_div_ps(fones, x);
116 for(j =
TERMS - 1; j >=0 ; j--){
117 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
120 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
121 condition = _mm_cmpgt_ps(z, fones);
123 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
125 condition = _mm_cmplt_ps(aVal, fzeroes);
126 arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
128 _mm_store_ps(bPtr, arctangent);
133 number = quarterPoints * 4;
134 for(;number < num_points; number++){
135 *bPtr++ = atan(*aPtr++);
143 #ifndef INCLUDED_volk_32f_atan_32f_u_H
144 #define INCLUDED_volk_32f_atan_32f_u_H
146 #ifdef LV_HAVE_SSE4_1
147 #include <smmintrin.h>
150 volk_32f_atan_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
152 float* bPtr = bVector;
153 const float* aPtr = aVector;
155 unsigned int number = 0;
156 unsigned int quarterPoints = num_points / 4;
159 __m128 aVal, pio2, x, y, z, arctangent;
160 __m128 fzeroes, fones, ftwos, ffours, condition;
162 pio2 = _mm_set1_ps(3.14159265358979323846/2);
163 fzeroes = _mm_setzero_ps();
164 fones = _mm_set1_ps(1.0);
165 ftwos = _mm_set1_ps(2.0);
166 ffours = _mm_set1_ps(4.0);
168 for(;number < quarterPoints; number++){
169 aVal = _mm_loadu_ps(aPtr);
171 condition = _mm_cmplt_ps(z, fzeroes);
172 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
174 condition = _mm_cmplt_ps(z, fones);
175 x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
177 for(i = 0; i < 2; i++)
178 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
179 x = _mm_div_ps(fones, x);
181 for(j =
TERMS - 1; j >= 0; j--)
182 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
184 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
185 condition = _mm_cmpgt_ps(z, fones);
187 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
189 condition = _mm_cmplt_ps(aVal, fzeroes);
190 arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
192 _mm_storeu_ps(bPtr, arctangent);
197 number = quarterPoints * 4;
198 for(;number < num_points; number++){
199 *bPtr++ = atan(*aPtr++);
205 #ifdef LV_HAVE_GENERIC
208 volk_32f_atan_32f_generic(
float* bVector,
const float* aVector,
unsigned int num_points)
210 float* bPtr = bVector;
211 const float* aPtr = aVector;
212 unsigned int number = 0;
214 for(number = 0; number < num_points; number++){
215 *bPtr++ = atan(*aPtr++);
/* Cross-reference: the TERMS macro used by the SSE4.1 kernels is #defined at
 * volk_32f_atan_32f.h:75. */