#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
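/* Arithmetic. The pd forms operate on both double-precision lanes; the sd
   forms operate on the low lane and pass the high lane of __a through. */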
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b) { __a[0] += __b[0]; return __a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b) { return __a + __b; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b) { __a[0] -= __b[0]; return __a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b) { return __a - __b; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b) { __a[0] *= __b[0]; return __a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b) { return __a * __b; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b) { __a[0] /= __b[0]; return __a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b) { return __a / __b; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_sqrtsd(__b);
  return (__m128d){ __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a) { return __builtin_ia32_sqrtpd(__a); }
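/* Lane-wise minimum and maximum. Note that MINPD/MAXPD are not commutative
   in the presence of NaNs or zeros of differing sign. */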
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b) { return __builtin_ia32_minsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b) { return __builtin_ia32_minpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b) { return __builtin_ia32_maxsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b) { return __builtin_ia32_maxpd(__a, __b); }
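/* Bitwise logic on double vectors, done through integer vector casts
   (the __v4si type comes from <xmmintrin.h>). */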
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b)
{ return (__m128d)((__v4si)__a & (__v4si)__b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b)
{ return (__m128d)(~(__v4si)__a & (__v4si)__b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b)
{ return (__m128d)((__v4si)__a | (__v4si)__b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b)
{ return (__m128d)((__v4si)__a ^ (__v4si)__b); }
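/* Comparisons. Packed (pd) forms set a lane to all-ones when the predicate
   holds and all-zeros otherwise; scalar (sd) forms compare the low lanes and
   pass the high lane of __a through. The cmpn* forms are the negated
   predicates, which are true (rather than false) on unordered inputs. */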
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpeqpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpltpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmplepd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpltpd(__b, __a); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmplepd(__b, __a); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpordpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpunordpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpneqpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpnltpd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpnlepd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpnltpd(__b, __a); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpnlepd(__b, __a); }
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpeqsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpltsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmplesd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
  return (__m128d){ __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
  return (__m128d){ __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpordsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpunordsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpneqsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpnltsd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{ return (__m128d)__builtin_ia32_cmpnlesd(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
  return (__m128d){ __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
  return (__m128d){ __c[0], __a[1] };
}
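/* COMISD/UCOMISD compares on the low lanes, returning 0 or 1. The ucomi*
   forms differ only in not raising an invalid-operation exception on QNaN. */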
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdeq(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdlt(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdle(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdgt(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdge(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_comisdneq(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdeq(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdlt(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdle(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdgt(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdge(__a, __b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b) { return __builtin_ia32_ucomisdneq(__a, __b); }
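/* Conversions between double, float, and integer elements. The cvt* forms
   round according to MXCSR; the cvtt* forms truncate toward zero. */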
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a) { return __builtin_ia32_cvtpd2ps(__a); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a) { return __builtin_ia32_cvtps2pd(__a); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a) { return __builtin_ia32_cvtdq2pd((__v4si)__a); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a) { return __builtin_ia32_cvtpd2dq(__a); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a) { return __builtin_ia32_cvtsd2si(__a); }

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a, __m128d __b) { return __builtin_ia32_cvtsd2ss(__a, __b); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a, int __b) { __a[0] = __b; return __a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a, __m128 __b) { __a[0] = __b[0]; return __a; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a) { return (__m128i)__builtin_ia32_cvttpd2dq(__a); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a) { return __builtin_ia32_cvttsd2si(__a); }

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvtpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvtpd2pi(__a); }

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvttpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvttpd2pi(__a); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtpi32_pd(__m64 __a) { return __builtin_ia32_cvtpi2pd((__v2si)__a); }

static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a) { return __a[0]; }
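/* Loads. The unaligned forms read through __packed__, __may_alias__ structs
   so the compiler emits an unaligned access instead of assuming alignment. */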
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp) { return *(__m128d*)__dp; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp)
{
  struct __mm_load1_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __u };
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp)
{
  __m128d __u = *(__m128d*)__dp;
  return __builtin_shufflevector(__u, __u, 1, 0);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__dp)->__v;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp)
{
  struct __mm_load_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
  return (__m128d){ __u, 0 };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
  return (__m128d){ __a[0], __u };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __a[1] };
}
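/* Initializers. _mm_set_pd takes lanes from most to least significant;
   _mm_setr_pd takes them in memory order. */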
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w) { return (__m128d){ __w, 0 }; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w) { return (__m128d){ __w, __w }; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w, double __x) { return (__m128d){ __x, __w }; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w, double __x) { return (__m128d){ __w, __x }; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void) { return (__m128d){ 0, 0 }; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a, __m128d __b) { return (__m128d){ __b[0], __a[1] }; }
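/* Stores. As with the loads, sub-vector stores go through __may_alias__
   structs to keep them alignment- and aliasing-safe. */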
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp, __m128d __a)
{
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
  struct __mm_store1_pd_struct {
    double __u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd(double *__dp, __m128d __a) { *(__m128d*)__dp = __a; }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp, __m128d __a) { __builtin_ia32_storeupd(__dp, __a); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp, __m128d __a)
{
  __a = __builtin_shufflevector(__a, __a, 1, 0);
  *(__m128d*)__dp = __a;
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
}
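/* Integer arithmetic on 8-, 16-, 32-, and 64-bit elements. The adds/subs
   forms saturate instead of wrapping. */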
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a + (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a + (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a + (__v4si)__b); }

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_add_si64(__m64 __a, __m64 __b) { return __a + __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a, __m128i __b) { return __a + __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a, __m128i __b)
{ return (__m128i)((__v8hi)__a * (__v8hi)__b); }

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mul_su32(__m64 __a, __m64 __b)
{ return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epu32(__m128i __a, __m128i __b)
{ return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sad_epu8(__m128i __a, __m128i __b)
{ return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a - (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a - (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a - (__v4si)__b); }

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sub_si64(__m64 __a, __m64 __b) { return __a - __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi64(__m128i __a, __m128i __b) { return __a - __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); }
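/* Whole-register bitwise logic. */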
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_and_si128(__m128i __a, __m128i __b) { return __a & __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_andnot_si128(__m128i __a, __m128i __b) { return ~__a & __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_or_si128(__m128i __a, __m128i __b) { return __a | __b; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_xor_si128(__m128i __a, __m128i __b) { return __a ^ __b; }
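/* _mm_slli_si128 shifts the full register left by (imm) bytes. imm must be
   an immediate; byte counts of 16 or more (caught by (imm)&0xF0) yield zero,
   which the shuffle implements by selecting from a zero vector. */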
#define _mm_slli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \
                                   (__v16qi)(__m128i)(a), \
                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })

#define _mm_bslli_si128(a, imm) \
  _mm_slli_si128((a), (imm))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a, int __count)
{ return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a, __m128i __count)
{ return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a, int __count)
{ return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a, __m128i __count)
{ return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a, int __count)
{ return __builtin_ia32_psllqi128(__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a, __m128i __count)
{ return __builtin_ia32_psllq128(__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a, int __count)
{ return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a, __m128i __count)
{ return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a, int __count)
{ return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a, __m128i __count)
{ return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); }
#define _mm_srli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \
                                   (__v16qi)_mm_setzero_si128(), \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })

#define _mm_bsrli_si128(a, imm) \
  _mm_srli_si128((a), (imm))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a, int __count)
{ return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a, __m128i __count)
{ return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a, int __count)
{ return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a, __m128i __count)
{ return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a, int __count)
{ return __builtin_ia32_psrlqi128(__a, __count); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a, __m128i __count)
{ return __builtin_ia32_psrlq128(__a, __count); }
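/* Element-wise comparisons; each true element becomes all-ones. */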
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi8(__m128i __a, __m128i __b) { return (__m128i)((__v16qi)__a == (__v16qi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a == (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a == (__v4si)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  /* Always a signed comparison; plain char may be unsigned, so compare
     through an explicitly signed vector type. */
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hi)__a > (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a, __m128i __b) { return (__m128i)((__v4si)__a > (__v4si)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a, __m128i __b) { return _mm_cmpgt_epi8(__b, __a); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a, __m128i __b) { return _mm_cmpgt_epi16(__b, __a); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a, __m128i __b) { return _mm_cmpgt_epi32(__b, __a); }
#ifdef __x86_64__
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a, long long __b) { __a[0] = __b; return __a; }

static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a) { return __builtin_ia32_cvtsd2si64(__a); }

static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a) { return __builtin_ia32_cvttsd2si64(__a); }
#endif

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a) { return __builtin_ia32_cvtdq2ps((__v4si)__a); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a) { return (__m128i)__builtin_ia32_cvtps2dq(__a); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a) { return (__m128i)__builtin_ia32_cvttps2dq(__a); }
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a) { return (__m128i)(__v4si){ __a, 0, 0, 0 }; }

#ifdef __x86_64__
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a) { return (__m128i){ __a, 0 }; }
#endif

static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a)
{
  __v4si __b = (__v4si)__a;
  return __b[0];
}

#ifdef __x86_64__
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a) { return __a[0]; }
#endif

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const *__p) { return *__p; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i const *__p)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si128*)__p)->__v;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i const *__p)
{
  struct __mm_loadl_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i){ ((struct __mm_loadl_epi64_struct*)__p)->__u, 0 };
}
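/* Integer initializers. As with the pd forms, _mm_set_* take arguments from
   most to least significant element; _mm_setr_* take them in memory order. */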
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long q1, long long q0) { return (__m128i){ q0, q1 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 q1, __m64 q0)
{ return (__m128i){ (long long)q0, (long long)q1 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int i3, int i2, int i1, int i0)
{ return (__m128i)(__v4si){ i0, i1, i2, i3 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short w7, short w6, short w5, short w4,
              short w3, short w2, short w1, short w0)
{ return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char b15, char b14, char b13, char b12,
             char b11, char b10, char b9, char b8,
             char b7, char b6, char b5, char b4,
             char b3, char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7,
                             b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q) { return (__m128i){ __q, __q }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q)
{ return (__m128i){ (long long)__q, (long long)__q }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i) { return (__m128i)(__v4si){ __i, __i, __i, __i }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w)
{ return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b)
{
  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b,
                             __b, __b, __b, __b, __b, __b, __b, __b };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 q0, __m64 q1)
{ return (__m128i){ (long long)q0, (long long)q1 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{ return (__m128i)(__v4si){ i0, i1, i2, i3 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short w0, short w1, short w2, short w3,
               short w4, short w5, short w6, short w7)
{ return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char b0, char b1, char b2, char b3,
              char b4, char b5, char b6, char b7,
              char b8, char b9, char b10, char b11,
              char b12, char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7,
                             b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void) { return (__m128i){ 0LL, 0LL }; }
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b) { *__p = __b; }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
{ __builtin_ia32_storedqu((char *)__p, (__v16qi)__b); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{ __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  struct __mm_storel_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}
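/* Non-temporal (cache-bypassing) stores, cache-line flush, and fences. */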
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a) { __builtin_ia32_movntpd(__p, __a); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a) { __builtin_ia32_movntdq(__p, __a); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a) { __builtin_ia32_movnti(__p, __a); }

#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a) { __builtin_ia32_movnti64(__p, __a); }
#endif

static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflush(void const *__p) { __builtin_ia32_clflush(__p); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_lfence(void) { __builtin_ia32_lfence(); }

static __inline__ void __DEFAULT_FN_ATTRS
_mm_mfence(void) { __builtin_ia32_mfence(); }
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); }

static __inline__ int __DEFAULT_FN_ATTRS
_mm_extract_epi16(__m128i __a, int __imm)
{
  __v8hi __b = (__v8hi)__a;
  return (unsigned short)__b[__imm & 7];
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
  __v8hi __c = (__v8hi)__a;
  __c[__imm & 7] = __b;
  return (__m128i)__c;
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)
{ return __builtin_ia32_pmovmskb128((__v16qi)__a); }
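/* Shuffles encode a 2-bit source selector per destination element. For
   example, _mm_shuffle_epi32(a, 0x1B) reverses the four 32-bit elements,
   since 0x1B == (0<<6)|(1<<4)|(2<<2)|(3<<0). */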
#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })

#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })

#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b,
                                          8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11,
                                          12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b,
                                          4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b,
                                          0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3,
                                          4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b,
                                          0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{ return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0); }

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a) { return (__m64)__a[0]; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a) { return (__m128i){ (long long)__a, 0 }; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{ return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{ return __builtin_shufflevector(__a, __b, 1, 2+1); }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{ return __builtin_shufflevector(__a, __b, 0, 2+0); }
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a) { return __builtin_ia32_movmskpd(__a); }

#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                          (i) & 1, (((i) & 2) >> 1) + 2); })
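/* Casts only reinterpret the bits; they compile to no instructions. */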
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a) { return (__m128)__a; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a) { return (__m128i)__a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a) { return (__m128d)__a; }

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a) { return (__m128i)__a; }

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a) { return (__m128)__a; }

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a) { return (__m128d)__a; }
static __inline__ void __DEFAULT_FN_ATTRS
_mm_pause(void)
{
  __asm__ volatile ("pause");
}

#undef __DEFAULT_FN_ATTRS

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __SSE2__ */
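/* Illustrative use of the intrinsics above (a sketch, not part of the
   header itself); assumes n is a multiple of 2:

     void add_arrays(double *dst, const double *x, const double *y, int n)
     {
       for (int i = 0; i < n; i += 2)
         _mm_storeu_pd(dst + i, _mm_add_pd(_mm_loadu_pd(x + i),
                                           _mm_loadu_pd(y + i)));
     }
*/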