66 #ifndef INCLUDED_volk_64u_byteswap_u_H
67 #define INCLUDED_volk_64u_byteswap_u_H
73 #include <emmintrin.h>
/*!
  \brief Reverses the byte order of each 64-bit value in place (SSE2, unaligned loads and stores)
  \param intsToSwap The buffer of 64-bit values to byteswap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_u_sse2(uint64_t* intsToSwap, unsigned int num_points){
    uint64_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    unsigned int number = 0;
    /* One 128-bit register holds two 64-bit points. */
    const unsigned int halfPoints = num_points / 2;

    for(; number < halfPoints; number++){
        /* Load two points; the pointer is advanced after the in-place store. */
        input = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Byteswap every 32-bit lane: move each of its four bytes into place. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* Merge the four shifted copies, masking out the bytes that do not belong. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        /* Exchange the two 32-bit lanes of each 64-bit point to finish the 64-bit swap. */
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_storeu_si128((__m128i*)inputPtr, output);
        inputPtr += 2;
    }

    /* Byteswap the remaining point, if num_points is odd. */
    number = halfPoints*2;
    for(; number < num_points; number++){
        /* Work on the value rather than through a uint32_t* view of the buffer,
           so the result is independent of host endianness and free of aliasing issues. */
        uint32_t output1 = (uint32_t)(*inputPtr & 0xffffffffu);
        uint32_t output2 = (uint32_t)(*inputPtr >> 32);

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        /* The swapped low half becomes the high half and vice versa. */
        *inputPtr++ = ((uint64_t)output1 << 32) | (uint64_t)output2;
    }
}
124 #ifdef LV_HAVE_GENERIC
/*!
  \brief Reverses the byte order of each 64-bit value in place (portable C)
  \param intsToSwap The buffer of 64-bit values to byteswap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_generic(uint64_t* intsToSwap, unsigned int num_points){
    uint64_t* inputPtr = intsToSwap;
    unsigned int point;

    for(point = 0; point < num_points; point++){
        /* Split the value into its two 32-bit halves arithmetically rather than
           through a uint32_t* view of the buffer, so the result is independent of
           host endianness and free of aliasing issues. */
        uint32_t output1 = (uint32_t)(*inputPtr & 0xffffffffu);
        uint32_t output2 = (uint32_t)(*inputPtr >> 32);

        /* Byteswap each half on its own. */
        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        /* The swapped low half becomes the high half and vice versa. */
        *inputPtr++ = ((uint64_t)output1 << 32) | (uint64_t)output2;
    }
}
146 #include <arm_neon.h>
148 static inline void volk_64u_byteswap_neon(
uint64_t* intsToSwap,
unsigned int num_points){
150 unsigned int number = 0;
151 unsigned int n8points = num_points / 4;
153 uint8x8x4_t input_table;
154 uint8x8_t int_lookup01, int_lookup23, int_lookup45, int_lookup67;
155 uint8x8_t swapped_int01, swapped_int23, swapped_int45, swapped_int67;
165 int_lookup01 = vcreate_u8(2269495096316185);
166 int_lookup23 = vcreate_u8(146949840772469531);
167 int_lookup45 = vcreate_u8(291630186448622877);
168 int_lookup67 = vcreate_u8(436310532124776223);
170 for(number = 0; number < n8points; ++number){
171 input_table = vld4_u8((
uint8_t*) inputPtr);
172 swapped_int01 = vtbl4_u8(input_table, int_lookup01);
173 swapped_int23 = vtbl4_u8(input_table, int_lookup23);
174 swapped_int45 = vtbl4_u8(input_table, int_lookup45);
175 swapped_int67 = vtbl4_u8(input_table, int_lookup67);
176 vst1_u8((
uint8_t*) inputPtr, swapped_int01);
177 vst1_u8((
uint8_t*) (inputPtr+2), swapped_int23);
178 vst1_u8((
uint8_t*) (inputPtr+4), swapped_int45);
179 vst1_u8((
uint8_t*) (inputPtr+6), swapped_int67);
184 for(number = n8points * 4; number < num_points; ++number){
188 output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
189 output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));
191 *inputPtr++ = output2;
192 *inputPtr++ = output1;
200 #ifndef INCLUDED_volk_64u_byteswap_a_H
201 #define INCLUDED_volk_64u_byteswap_a_H
208 #include <emmintrin.h>
/*!
  \brief Reverses the byte order of each 64-bit value in place (SSE2, aligned loads and stores)
  \param intsToSwap The buffer of 64-bit values to byteswap; overwritten with the result.
         It is accessed with _mm_load_si128/_mm_store_si128, so it must be 16-byte aligned.
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_a_sse2(uint64_t* intsToSwap, unsigned int num_points){
    uint64_t* inputPtr = intsToSwap;
    __m128i input, byte1, byte2, byte3, byte4, output;
    const __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
    const __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
    unsigned int number = 0;
    /* One 128-bit register holds two 64-bit points. */
    const unsigned int halfPoints = num_points / 2;

    for(; number < halfPoints; number++){
        /* Load two points; the pointer is advanced after the in-place store. */
        input = _mm_load_si128((const __m128i*)inputPtr);

        /* Byteswap every 32-bit lane: move each of its four bytes into place. */
        byte1 = _mm_slli_epi32(input, 24);
        byte2 = _mm_slli_epi32(input, 8);
        byte3 = _mm_srli_epi32(input, 8);
        byte4 = _mm_srli_epi32(input, 24);

        /* Merge the four shifted copies, masking out the bytes that do not belong. */
        output = _mm_or_si128(byte1, byte4);
        byte2 = _mm_and_si128(byte2, byte2mask);
        output = _mm_or_si128(output, byte2);
        byte3 = _mm_and_si128(byte3, byte3mask);
        output = _mm_or_si128(output, byte3);

        /* Exchange the two 32-bit lanes of each 64-bit point to finish the 64-bit swap. */
        output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));

        _mm_store_si128((__m128i*)inputPtr, output);
        inputPtr += 2;
    }

    /* Byteswap the remaining point, if num_points is odd. */
    number = halfPoints*2;
    for(; number < num_points; number++){
        /* Work on the value rather than through a uint32_t* view of the buffer,
           so the result is independent of host endianness and free of aliasing issues. */
        uint32_t output1 = (uint32_t)(*inputPtr & 0xffffffffu);
        uint32_t output2 = (uint32_t)(*inputPtr >> 32);

        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        /* The swapped low half becomes the high half and vice versa. */
        *inputPtr++ = ((uint64_t)output1 << 32) | (uint64_t)output2;
    }
}
258 #ifdef LV_HAVE_GENERIC
/*!
  \brief Reverses the byte order of each 64-bit value in place (portable C)
  \param intsToSwap The buffer of 64-bit values to byteswap; overwritten with the result
  \param num_points The number of 64-bit values in intsToSwap
*/
static inline void volk_64u_byteswap_a_generic(uint64_t* intsToSwap, unsigned int num_points){
    uint64_t* inputPtr = intsToSwap;
    unsigned int point;

    for(point = 0; point < num_points; point++){
        /* Split the value into its two 32-bit halves arithmetically rather than
           through a uint32_t* view of the buffer, so the result is independent of
           host endianness and free of aliasing issues. */
        uint32_t output1 = (uint32_t)(*inputPtr & 0xffffffffu);
        uint32_t output2 = (uint32_t)(*inputPtr >> 32);

        /* Byteswap each half on its own. */
        output1 = (((output1 >> 24) & 0xff) | ((output1 >> 8) & 0x0000ff00) | ((output1 << 8) & 0x00ff0000) | ((output1 << 24) & 0xff000000));
        output2 = (((output2 >> 24) & 0xff) | ((output2 >> 8) & 0x0000ff00) | ((output2 << 8) & 0x00ff0000) | ((output2 << 24) & 0xff000000));

        /* The swapped low half becomes the high half and vice versa. */
        *inputPtr++ = ((uint64_t)output1 << 32) | (uint64_t)output2;
    }
}
unsigned char uint8_t
Definition: stdint.h:78
unsigned int uint32_t
Definition: stdint.h:80
unsigned __int64 uint64_t
Definition: stdint.h:90