1 #ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
2 #define INCLUDED_volk_16i_max_star_horizontal_16i_a_H
15 static inline void volk_16i_max_star_horizontal_16i_a_ssse3(
int16_t* target,
int16_t* src0,
unsigned int num_bytes) {
17 const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
18 const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
19 const static uint8_t andmask0[16] = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
20 const static uint8_t andmask1[16] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};
24 volatile __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
25 __m128i xmm5, xmm6, xmm7, xmm8;
27 xmm4 = _mm_load_si128((__m128i*)shufmask0);
28 xmm5 = _mm_load_si128((__m128i*)shufmask1);
29 xmm6 = _mm_load_si128((__m128i*)andmask0);
30 xmm7 = _mm_load_si128((__m128i*)andmask1);
32 __m128i *p_target, *p_src0;
34 p_target = (__m128i*)target;
35 p_src0 = (__m128i*)src0;
37 int bound = num_bytes >> 5;
38 int intermediate = (num_bytes >> 4) & 1;
39 int leftovers = (num_bytes >> 1) & 7;
44 for(i = 0; i < bound; ++i) {
46 xmm0 = _mm_load_si128(p_src0);
47 xmm1 = _mm_load_si128(&p_src0[1]);
51 xmm2 = _mm_xor_si128(xmm2, xmm2);
54 xmm3 = _mm_hsub_epi16(xmm0, xmm1);
56 xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
58 xmm8 = _mm_and_si128(xmm2, xmm6);
59 xmm3 = _mm_and_si128(xmm2, xmm7);
62 xmm8 = _mm_add_epi8(xmm8, xmm4);
63 xmm3 = _mm_add_epi8(xmm3, xmm5);
65 xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
66 xmm1 = _mm_shuffle_epi8(xmm1, xmm3);
69 xmm3 = _mm_add_epi16(xmm0, xmm1);
72 _mm_store_si128(p_target, xmm3);
78 for(i = 0; i < intermediate; ++i) {
80 xmm0 = _mm_load_si128(p_src0);
83 xmm2 = _mm_xor_si128(xmm2, xmm2);
86 xmm3 = _mm_hsub_epi16(xmm0, xmm1);
87 xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);
89 xmm8 = _mm_and_si128(xmm2, xmm6);
91 xmm3 = _mm_add_epi8(xmm8, xmm4);
93 xmm0 = _mm_shuffle_epi8(xmm0, xmm3);
96 _mm_storel_pd((
double*)p_target, (__m128d)xmm0);
98 p_target = (__m128i*)((
int8_t*)p_target + 8);
102 for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers ; i += 2) {
103 target[i>>1] = ((
int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
112 #ifdef LV_HAVE_GENERIC
/*!
 * Portable fallback: for each adjacent pair of 16-bit values in src0, write
 * the "max*" winner into target: src0[2i] when
 * (int16_t)(src0[2i] - src0[2i+1]) > 0, otherwise src0[2i+1].
 *
 * \param target    output buffer, num_bytes/4 int16_t results
 * \param src0      input buffer, num_bytes/2 int16_t values
 * \param num_bytes size of src0 in bytes
 */
static inline void volk_16i_max_star_horizontal_16i_a_generic(int16_t* target,
                                                              int16_t* src0,
                                                              unsigned int num_bytes)
{
    const unsigned int bound = num_bytes >> 1; /* number of int16_t in src0 */
    unsigned int i;

    for (i = 0; i < bound; i += 2) {
        /* the (int16_t) cast forces 16-bit wrap-around on the difference,
           matching the SIMD hsub semantics of the SSSE3 kernel above */
        target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
    }
}