GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_atan_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_atan_32f
25  *
26  * \b Overview
27  *
28  * Computes arcsine of input vector and stores results in output vector.
29  *
30  * <b>Dispatcher Prototype</b>
31  * \code
32  * void volk_32f_atan_32f(float* bVector, const float* aVector, unsigned int num_points)
33  * \endcode
34  *
35  * \b Inputs
36  * \li aVector: The input vector of floats.
37  * \li num_points: The number of data points.
38  *
39  * \b Outputs
40  * \li bVector: The vector where results will be stored.
41  *
42  * \b Example
43  * Calculate common angles around the top half of the unit circle.
44  * \code
45  * int N = 10;
46  * unsigned int alignment = volk_get_alignment();
47  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
48  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
49  *
50  * in[0] = 0.f;
51  * in[1] = 1.f/std::sqrt(3.f);
52  * in[2] = 1.f;
53  * in[3] = std::sqrt(3.f);
54  * in[4] = in[5] = 1e99;
55  * for(unsigned int ii = 6; ii < N; ++ii){
56  * in[ii] = - in[N-ii-1];
57  * }
58  *
59  * volk_32f_atan_32f(out, in, N);
60  *
61  * for(unsigned int ii = 0; ii < N; ++ii){
62  * printf("atan(%1.3f) = %1.3f\n", in[ii], out[ii]);
63  * }
64  *
65  * volk_free(in);
66  * volk_free(out);
67  * \endcode
68  */
69 
70 #include <stdio.h>
71 #include <math.h>
72 #include <inttypes.h>
73 
74 /* This is the number of terms of Taylor series to evaluate, increase this for more accuracy*/
75 #define TERMS 2
76 
77 #ifndef INCLUDED_volk_32f_atan_32f_a_H
78 #define INCLUDED_volk_32f_atan_32f_a_H
79 
80 #ifdef LV_HAVE_SSE4_1
81 #include <smmintrin.h>
82 
83 static inline void
84 volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
85 {
86  float* bPtr = bVector;
87  const float* aPtr = aVector;
88 
89  unsigned int number = 0;
90  unsigned int quarterPoints = num_points / 4;
91  int i, j;
92 
93  __m128 aVal, pio2, x, y, z, arctangent;
94  __m128 fzeroes, fones, ftwos, ffours, condition;
95 
96  pio2 = _mm_set1_ps(3.14159265358979323846/2);
97  fzeroes = _mm_setzero_ps();
98  fones = _mm_set1_ps(1.0);
99  ftwos = _mm_set1_ps(2.0);
100  ffours = _mm_set1_ps(4.0);
101 
102  for(;number < quarterPoints; number++){
103  aVal = _mm_load_ps(aPtr);
104  z = aVal;
105  condition = _mm_cmplt_ps(z, fzeroes);
106  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
107  x = z;
108  condition = _mm_cmplt_ps(z, fones);
109  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
110 
111  for(i = 0; i < 2; i++){
112  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
113  }
114  x = _mm_div_ps(fones, x);
115  y = fzeroes;
116  for(j = TERMS - 1; j >=0 ; j--){
117  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
118  }
119 
120  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
121  condition = _mm_cmpgt_ps(z, fones);
122 
123  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
124  arctangent = y;
125  condition = _mm_cmplt_ps(aVal, fzeroes);
126  arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
127 
128  _mm_store_ps(bPtr, arctangent);
129  aPtr += 4;
130  bPtr += 4;
131  }
132 
133  number = quarterPoints * 4;
134  for(;number < num_points; number++){
135  *bPtr++ = atan(*aPtr++);
136  }
137 }
138 
139 #endif /* LV_HAVE_SSE4_1 for aligned */
140 
141 #endif /* INCLUDED_volk_32f_atan_32f_a_H */
142 
143 #ifndef INCLUDED_volk_32f_atan_32f_u_H
144 #define INCLUDED_volk_32f_atan_32f_u_H
145 
146 #ifdef LV_HAVE_SSE4_1
147 #include <smmintrin.h>
148 
149 static inline void
150 volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
151 {
152  float* bPtr = bVector;
153  const float* aPtr = aVector;
154 
155  unsigned int number = 0;
156  unsigned int quarterPoints = num_points / 4;
157  int i, j;
158 
159  __m128 aVal, pio2, x, y, z, arctangent;
160  __m128 fzeroes, fones, ftwos, ffours, condition;
161 
162  pio2 = _mm_set1_ps(3.14159265358979323846/2);
163  fzeroes = _mm_setzero_ps();
164  fones = _mm_set1_ps(1.0);
165  ftwos = _mm_set1_ps(2.0);
166  ffours = _mm_set1_ps(4.0);
167 
168  for(;number < quarterPoints; number++){
169  aVal = _mm_loadu_ps(aPtr);
170  z = aVal;
171  condition = _mm_cmplt_ps(z, fzeroes);
172  z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
173  x = z;
174  condition = _mm_cmplt_ps(z, fones);
175  x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
176 
177  for(i = 0; i < 2; i++)
178  x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
179  x = _mm_div_ps(fones, x);
180  y = fzeroes;
181  for(j = TERMS - 1; j >= 0; j--)
182  y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
183 
184  y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
185  condition = _mm_cmpgt_ps(z, fones);
186 
187  y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
188  arctangent = y;
189  condition = _mm_cmplt_ps(aVal, fzeroes);
190  arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
191 
192  _mm_storeu_ps(bPtr, arctangent);
193  aPtr += 4;
194  bPtr += 4;
195  }
196 
197  number = quarterPoints * 4;
198  for(;number < num_points; number++){
199  *bPtr++ = atan(*aPtr++);
200  }
201 }
202 
203 #endif /* LV_HAVE_SSE4_1 for unaligned */
204 
205 #ifdef LV_HAVE_GENERIC
206 
207 static inline void
208 volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
209 {
210  float* bPtr = bVector;
211  const float* aPtr = aVector;
212  unsigned int number = 0;
213 
214  for(number = 0; number < num_points; number++){
215  *bPtr++ = atan(*aPtr++);
216  }
217 }
218 #endif /* LV_HAVE_GENERIC */
219 
220 #endif /* INCLUDED_volk_32f_atan_32f_u_H */
#define TERMS
Definition: volk_32f_atan_32f.h:75