GNU Radio Manual and C++ API Reference  3.7.7
The Free & Open Software Radio Ecosystem
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
volk_32f_sin_32f.h
Go to the documentation of this file.
1 /* -*- c++ -*- */
2 /*
3  * Copyright 2014 Free Software Foundation, Inc.
4  *
5  * This file is part of GNU Radio
6  *
7  * GNU Radio is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3, or (at your option)
10  * any later version.
11  *
12  * GNU Radio is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with GNU Radio; see the file COPYING. If not, write to
19  * the Free Software Foundation, Inc., 51 Franklin Street,
20  * Boston, MA 02110-1301, USA.
21  */
22 
23 /*!
24  * \page volk_32f_sin_32f
25  *
26  * \b Overview
27  *
28  * Computes the sine of the input vector and stores the results in the output vector.
29  *
30  * <b>Dispatcher Prototype</b>
31  * \code
32  * void volk_32f_sin_32f(float* bVector, const float* aVector, unsigned int num_points)
33  * \endcode
34  *
35  * \b Inputs
36  * \li aVector: The input vector of floats.
37  * \li num_points: The number of data points.
38  *
39  * \b Outputs
40  * \li bVector: The output vector.
41  *
42  * \b Example
43  * Calculate sin(theta) for several common angles.
44  * \code
45  * int N = 10;
46  * unsigned int alignment = volk_get_alignment();
47  * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
48  * float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
49  *
50  * in[0] = 0.000;
51  * in[1] = 0.524;
52  * in[2] = 0.786;
53  * in[3] = 1.047;
54  * in[4] = 1.571;
55  * in[5] = 1.571;
56  * in[6] = 2.094;
57  * in[7] = 2.356;
58  * in[8] = 2.618;
59  * in[9] = 3.142;
60  *
61  * volk_32f_sin_32f(out, in, N);
62  *
63  * for(unsigned int ii = 0; ii < N; ++ii){
64  * printf("sin(%1.3f) = %1.3f\n", in[ii], out[ii]);
65  * }
66  *
67  * volk_free(in);
68  * volk_free(out);
69  * \endcode
70  */
71 
72 #include <stdio.h>
73 #include <math.h>
74 #include <inttypes.h>
75 
76 #ifndef INCLUDED_volk_32f_sin_32f_a_H
77 #define INCLUDED_volk_32f_sin_32f_a_H
78 
79 #ifdef LV_HAVE_SSE4_1
80 #include <smmintrin.h>
81 
82 static inline void
83 volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
84 {
85  float* bPtr = bVector;
86  const float* aPtr = aVector;
87 
88  unsigned int number = 0;
89  unsigned int quarterPoints = num_points / 4;
90  unsigned int i = 0;
91 
92  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
93  __m128 sine, cosine, condition1, condition2;
94  __m128i q, r, ones, twos, fours;
95 
96  m4pi = _mm_set1_ps(1.273239545);
97  pio4A = _mm_set1_ps(0.78515625);
98  pio4B = _mm_set1_ps(0.241876e-3);
99  ffours = _mm_set1_ps(4.0);
100  ftwos = _mm_set1_ps(2.0);
101  fones = _mm_set1_ps(1.0);
102  fzeroes = _mm_setzero_ps();
103  ones = _mm_set1_epi32(1);
104  twos = _mm_set1_epi32(2);
105  fours = _mm_set1_epi32(4);
106 
107  cp1 = _mm_set1_ps(1.0);
108  cp2 = _mm_set1_ps(0.83333333e-1);
109  cp3 = _mm_set1_ps(0.2777778e-2);
110  cp4 = _mm_set1_ps(0.49603e-4);
111  cp5 = _mm_set1_ps(0.551e-6);
112 
113  for(;number < quarterPoints; number++) {
114  aVal = _mm_load_ps(aPtr);
115  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
116  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
117  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
118 
119  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
120  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
121 
122  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
123  s = _mm_mul_ps(s, s);
124  // Evaluate Taylor series
125  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
126 
127  for(i = 0; i < 3; i++) {
128  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
129  }
130  s = _mm_div_ps(s, ftwos);
131 
132  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
133  cosine = _mm_sub_ps(fones, s);
134 
135  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
136  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
137  // Need this condition only for cos
138  //condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
139 
140  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
141  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
142  _mm_store_ps(bPtr, sine);
143  aPtr += 4;
144  bPtr += 4;
145  }
146 
147  number = quarterPoints * 4;
148  for(;number < num_points; number++) {
149  *bPtr++ = sin(*aPtr++);
150  }
151 }
152 
153 #endif /* LV_HAVE_SSE4_1 for aligned */
154 
155 
156 #endif /* INCLUDED_volk_32f_sin_32f_a_H */
157 
158 #ifndef INCLUDED_volk_32f_sin_32f_u_H
159 #define INCLUDED_volk_32f_sin_32f_u_H
160 
161 #ifdef LV_HAVE_SSE4_1
162 #include <smmintrin.h>
163 
164 static inline void
165 volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
166 {
167  float* bPtr = bVector;
168  const float* aPtr = aVector;
169 
170  unsigned int number = 0;
171  unsigned int quarterPoints = num_points / 4;
172  unsigned int i = 0;
173 
174  __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones, fzeroes;
175  __m128 sine, cosine, condition1, condition2;
176  __m128i q, r, ones, twos, fours;
177 
178  m4pi = _mm_set1_ps(1.273239545);
179  pio4A = _mm_set1_ps(0.78515625);
180  pio4B = _mm_set1_ps(0.241876e-3);
181  ffours = _mm_set1_ps(4.0);
182  ftwos = _mm_set1_ps(2.0);
183  fones = _mm_set1_ps(1.0);
184  fzeroes = _mm_setzero_ps();
185  ones = _mm_set1_epi32(1);
186  twos = _mm_set1_epi32(2);
187  fours = _mm_set1_epi32(4);
188 
189  cp1 = _mm_set1_ps(1.0);
190  cp2 = _mm_set1_ps(0.83333333e-1);
191  cp3 = _mm_set1_ps(0.2777778e-2);
192  cp4 = _mm_set1_ps(0.49603e-4);
193  cp5 = _mm_set1_ps(0.551e-6);
194 
195  for(;number < quarterPoints; number++) {
196  aVal = _mm_loadu_ps(aPtr);
197  s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
198  q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
199  r = _mm_add_epi32(q, _mm_and_si128(q, ones));
200 
201  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
202  s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
203 
204  s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
205  s = _mm_mul_ps(s, s);
206  // Evaluate Taylor series
207  s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
208 
209  for(i = 0; i < 3; i++) {
210  s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
211  }
212  s = _mm_div_ps(s, ftwos);
213 
214  sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
215  cosine = _mm_sub_ps(fones, s);
216 
217  condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
218  condition2 = _mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes), _mm_cmplt_ps(aVal, fzeroes));
219 
220  sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
221  sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
222  _mm_storeu_ps(bPtr, sine);
223  aPtr += 4;
224  bPtr += 4;
225  }
226 
227  number = quarterPoints * 4;
228  for(;number < num_points; number++){
229  *bPtr++ = sin(*aPtr++);
230  }
231 }
232 
233 #endif /* LV_HAVE_SSE4_1 for unaligned */
234 
235 
236 #ifdef LV_HAVE_GENERIC
237 
238 static inline void
239 volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
240 {
241  float* bPtr = bVector;
242  const float* aPtr = aVector;
243  unsigned int number = 0;
244 
245  for(number = 0; number < num_points; number++) {
246  *bPtr++ = sin(*aPtr++);
247  }
248 
249 }
250 
251 #endif /* LV_HAVE_GENERIC */
252 
253 #endif /* INCLUDED_volk_32f_sin_32f_u_H */