#if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
#define _MM_FUNCTIONALITY

#if !defined _VCRT_BUILD && !defined _INC_MALLOC
#if defined(_MSC_VER) && !defined(__clang__)
    unsigned __int8 m128_u8[16];
#define __ATTRIBUTE_SSE__
typedef signed int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
#define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__

#define _MM_ALIGN16 _VCRT_ALIGN(16)
#define _MM_HINT_ENTA 4

#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
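/*
 * Usage sketch (illustrative, not part of the header; __example_reverse_lanes
 * is a hypothetical helper): _MM_SHUFFLE packs four 2-bit lane selectors into
 * the 8-bit immediate expected by shufps/pshufw; fp3 selects the highest
 * result lane, fp0 the lowest.
 */
static __inline __m128 __example_reverse_lanes(__m128 v)
{
    /* Pick source lanes 0,1,2,3 into result lanes 3,2,1,0: imm8 == 0x1B. */
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
}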
#define _MM_EXCEPT_MASK 0x003f
#define _MM_EXCEPT_INVALID 0x0001
#define _MM_EXCEPT_DENORM 0x0002
#define _MM_EXCEPT_DIV_ZERO 0x0004
#define _MM_EXCEPT_OVERFLOW 0x0008
#define _MM_EXCEPT_UNDERFLOW 0x0010
#define _MM_EXCEPT_INEXACT 0x0020

#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_INVALID 0x0080
#define _MM_MASK_DENORM 0x0100
#define _MM_MASK_DIV_ZERO 0x0200
#define _MM_MASK_OVERFLOW 0x0400
#define _MM_MASK_UNDERFLOW 0x0800
#define _MM_MASK_INEXACT 0x1000

#define _MM_ROUND_MASK 0x6000
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
__m64 _mm_cvt_ps2pi(__m128 a);
__m64 _mm_cvtt_ps2pi(__m128 a);
__m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
__m128 _mm_set_ps(float e3, float e2, float e1, float e0);
__m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
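/*
 * Usage sketch (illustrative, not part of the header; __example_set_order is
 * a hypothetical helper): _mm_set_ps places e0 in the lowest lane (the one
 * _mm_store_ss reads), while _mm_setr_ps stores its arguments in the
 * opposite order.
 */
static __inline float __example_set_order(void)
{
    __m128 v = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); /* lanes 0..3 = 0,1,2,3 */
    float lo;
    _mm_store_ss(&lo, v);                          /* lo == 0.0f */
    return lo;
}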
int _m_pextrw(__m64 a, int imm8);
__m64 _m_pinsrw(__m64 a, int i, int imm8);
__m64 _m_pmaxsw(__m64 a, __m64 b);
__m64 _m_pmaxub(__m64 a, __m64 b);
__m64 _m_pminsw(__m64 a, __m64 b);
__m64 _m_pminub(__m64 a, __m64 b);
int _m_pmovmskb(__m64 a);
__m64 _m_pmulhuw(__m64 a, __m64 b);
__m64 _m_pshufw(__m64 a, int imm8);
void _m_maskmovq(__m64 a, __m64 b, char*);
__m64 _m_pavgb(__m64 a, __m64 b);
__m64 _m_pavgw(__m64 a, __m64 b);
__m64 _m_psadbw(__m64 a, __m64 b);
__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
#define _mm_cvtss_si32 _mm_cvt_ss2si
#define _mm_cvttss_si32 _mm_cvtt_ss2si
#define _mm_cvtsi32_ss _mm_cvt_si2ss
#define _mm_set1_ps _mm_set_ps1
#define _mm_load1_ps _mm_load_ps1
#define _mm_store1_ps _mm_store_ps1
#define _mm_cvtps_pi32 _mm_cvt_ps2pi
#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
#define _mm_cvtpi32_ps _mm_cvt_pi2ps
#define _mm_extract_pi16 _m_pextrw
#define _mm_insert_pi16 _m_pinsrw
#define _mm_max_pi16 _m_pmaxsw
#define _mm_max_pu8 _m_pmaxub
#define _mm_min_pi16 _m_pminsw
#define _mm_min_pu8 _m_pminub
#define _mm_movemask_pi8 _m_pmovmskb
#define _mm_mulhi_pu16 _m_pmulhuw
#define _mm_shuffle_pi16 _m_pshufw
#define _mm_maskmove_si64 _m_maskmovq
#define _mm_avg_pu8 _m_pavgb
#define _mm_avg_pu16 _m_pavgw
#define _mm_sad_pu8 _m_psadbw
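/*
 * Usage sketch (illustrative, not part of the header; __example_alias is a
 * hypothetical helper): the _mm_* aliases above are the portable spellings
 * of the older _m_* MMX/SSE integer names.
 */
static __inline int __example_alias(__m64 a)
{
    return _mm_extract_pi16(a, 3); /* same as _m_pextrw(a, 3) */
}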
static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi16(__b, __a);
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, (__m64)__c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, (__m64)__c);

    return __r;
}

static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, (__m64)__c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, (__m64)__c);

    return __r;
}

static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi8(__b, __a);
    __b = _mm_unpacklo_pi8(__a, __b);

    return _mm_cvtpi16_ps(__b);
}

static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_unpacklo_pi8(__a, __b);

    return _mm_cvtpi16_ps(__b);
}

static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
    __m128 __c;

    __c = _mm_setzero_ps();
    __c = _mm_cvtpi32_ps(__c, __b);
    __c = _mm_movelh_ps(__c, __c);

    return _mm_cvtpi32_ps(__c, __a);
}

static __inline __m64 _mm_cvtps_pi16(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi32(__a);
    __a = _mm_movehl_ps(__a, __a);
    __c = _mm_cvtps_pi32(__a);

    return _mm_packs_pi32(__b, __c);
}

static __inline __m64 _mm_cvtps_pi8(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi16(__a);
    __c = _mm_setzero_si64();

    return _mm_packs_pi16(__b, __c);
}
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
    __m128 t0 = _mm_unpacklo_ps(row0, row1); \
    __m128 t1 = _mm_unpacklo_ps(row2, row3); \
    __m128 t2 = _mm_unpackhi_ps(row0, row1); \
    __m128 t3 = _mm_unpackhi_ps(row2, row3); \
    (row0) = _mm_movelh_ps(t0, t1); \
    (row1) = _mm_movehl_ps(t1, t0); \
    (row2) = _mm_movelh_ps(t2, t3); \
    (row3) = _mm_movehl_ps(t3, t2); \
} while (0)
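/*
 * Usage sketch (illustrative, not part of the header; __example_transpose is
 * a hypothetical helper): transpose a row-major 4x4 matrix held in four row
 * registers, in place.
 */
static __inline void __example_transpose(float m[16])
{
    __m128 r0 = _mm_loadu_ps(m + 0);
    __m128 r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8);
    __m128 r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3); /* rows become columns */
    _mm_storeu_ps(m + 0, r0);
    _mm_storeu_ps(m + 4, r1);
    _mm_storeu_ps(m + 8, r2);
    _mm_storeu_ps(m + 12, r3);
}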
#define _MM_GET_EXCEPTION_STATE() \
    (_mm_getcsr() & _MM_EXCEPT_MASK)

#define _MM_GET_EXCEPTION_MASK() \
    (_mm_getcsr() & _MM_MASK_MASK)

#define _MM_GET_ROUNDING_MODE() \
    (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_GET_FLUSH_ZERO_MODE() \
    (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)

#define _MM_SET_EXCEPTION_STATE(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))

#define _MM_SET_EXCEPTION_MASK(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))

#define _MM_SET_ROUNDING_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))

#define _MM_SET_FLUSH_ZERO_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))
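/*
 * Usage sketch (illustrative, not part of the header; __example_truncate is
 * a hypothetical helper): temporarily switch MXCSR to truncation, then
 * restore the caller's rounding mode. Note these macros affect the whole
 * thread's SSE state, not just one instruction.
 */
static __inline int __example_truncate(__m128 v)
{
    unsigned int old = _MM_GET_ROUNDING_MODE();
    int r;
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    r = _mm_cvt_ss2si(v); /* converts according to the current MXCSR mode */
    _MM_SET_ROUNDING_MODE(old);
    return r;
}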
#if defined(_MSC_VER) && !defined(__clang__)

#pragma intrinsic(_mm_prefetch)
#pragma intrinsic(_mm_setzero_ps)
#pragma intrinsic(_mm_add_ss)
#pragma intrinsic(_mm_sub_ss)
#pragma intrinsic(_mm_mul_ss)
#pragma intrinsic(_mm_div_ss)
#pragma intrinsic(_mm_sqrt_ss)
#pragma intrinsic(_mm_rcp_ss)
#pragma intrinsic(_mm_rsqrt_ss)
#pragma intrinsic(_mm_min_ss)
#pragma intrinsic(_mm_max_ss)
#pragma intrinsic(_mm_add_ps)
#pragma intrinsic(_mm_sub_ps)
#pragma intrinsic(_mm_mul_ps)
#pragma intrinsic(_mm_div_ps)
#pragma intrinsic(_mm_sqrt_ps)
#pragma intrinsic(_mm_rcp_ps)
#pragma intrinsic(_mm_rsqrt_ps)
#pragma intrinsic(_mm_min_ps)
#pragma intrinsic(_mm_max_ps)
#pragma intrinsic(_mm_and_ps)
#pragma intrinsic(_mm_andnot_ps)
#pragma intrinsic(_mm_or_ps)
#pragma intrinsic(_mm_xor_ps)
#pragma intrinsic(_mm_cmpeq_ss)
#pragma intrinsic(_mm_cmplt_ss)
#pragma intrinsic(_mm_cmple_ss)
#pragma intrinsic(_mm_cmpgt_ss)
#pragma intrinsic(_mm_cmpge_ss)
#pragma intrinsic(_mm_cmpneq_ss)
#pragma intrinsic(_mm_cmpnlt_ss)
#pragma intrinsic(_mm_cmpnle_ss)
#pragma intrinsic(_mm_cmpngt_ss)
#pragma intrinsic(_mm_cmpnge_ss)
#pragma intrinsic(_mm_cmpord_ss)
#pragma intrinsic(_mm_cmpunord_ss)
#pragma intrinsic(_mm_cmpeq_ps)
#pragma intrinsic(_mm_cmplt_ps)
#pragma intrinsic(_mm_cmple_ps)
#pragma intrinsic(_mm_cmpgt_ps)
#pragma intrinsic(_mm_cmpge_ps)
#pragma intrinsic(_mm_cmpneq_ps)
#pragma intrinsic(_mm_cmpnlt_ps)
#pragma intrinsic(_mm_cmpnle_ps)
#pragma intrinsic(_mm_cmpngt_ps)
#pragma intrinsic(_mm_cmpnge_ps)
#pragma intrinsic(_mm_cmpord_ps)
#pragma intrinsic(_mm_cmpunord_ps)
#pragma intrinsic(_mm_comieq_ss)
#pragma intrinsic(_mm_comilt_ss)
#pragma intrinsic(_mm_comile_ss)
#pragma intrinsic(_mm_comigt_ss)
#pragma intrinsic(_mm_comige_ss)
#pragma intrinsic(_mm_comineq_ss)
#pragma intrinsic(_mm_ucomieq_ss)
#pragma intrinsic(_mm_ucomilt_ss)
#pragma intrinsic(_mm_ucomile_ss)
#pragma intrinsic(_mm_ucomigt_ss)
#pragma intrinsic(_mm_ucomige_ss)
#pragma intrinsic(_mm_ucomineq_ss)
#pragma intrinsic(_mm_cvt_ss2si)
#pragma intrinsic(_mm_cvtt_ss2si)
#pragma intrinsic(_mm_cvt_si2ss)

#pragma intrinsic(_mm_cvt_ps2pi)
#pragma intrinsic(_mm_cvtt_ps2pi)
#pragma intrinsic(_mm_cvt_pi2ps)

#pragma intrinsic(_mm_shuffle_ps)
#pragma intrinsic(_mm_unpackhi_ps)
#pragma intrinsic(_mm_unpacklo_ps)
#pragma intrinsic(_mm_loadh_pi)
#pragma intrinsic(_mm_storeh_pi)
#pragma intrinsic(_mm_movehl_ps)
#pragma intrinsic(_mm_movelh_ps)
#pragma intrinsic(_mm_loadl_pi)
#pragma intrinsic(_mm_storel_pi)
#pragma intrinsic(_mm_movemask_ps)
#pragma intrinsic(_mm_getcsr)
#pragma intrinsic(_mm_setcsr)
#pragma intrinsic(_mm_set_ss)
#pragma intrinsic(_mm_set_ps1)
#pragma intrinsic(_mm_load_ss)
#pragma intrinsic(_mm_load_ps1)
#pragma intrinsic(_mm_load_ps)
#pragma intrinsic(_mm_loadu_ps)
#pragma intrinsic(_mm_loadr_ps)
#pragma intrinsic(_mm_set_ps)
#pragma intrinsic(_mm_setr_ps)
#pragma intrinsic(_mm_store_ss)
#pragma intrinsic(_mm_cvtss_f32)
#pragma intrinsic(_mm_store_ps)
#pragma intrinsic(_mm_storeu_ps)
#pragma intrinsic(_mm_store_ps1)
#pragma intrinsic(_mm_storer_ps)
#pragma intrinsic(_mm_move_ss)

#pragma intrinsic(_m_pextrw)
#pragma intrinsic(_m_pinsrw)
#pragma intrinsic(_m_pmaxsw)
#pragma intrinsic(_m_pmaxub)
#pragma intrinsic(_m_pminsw)
#pragma intrinsic(_m_pminub)
#pragma intrinsic(_m_pmovmskb)
#pragma intrinsic(_m_pmulhuw)
#pragma intrinsic(_m_pshufw)
#pragma intrinsic(_m_maskmovq)
#pragma intrinsic(_m_pavgb)
#pragma intrinsic(_m_pavgw)
#pragma intrinsic(_m_psadbw)
#pragma intrinsic(_mm_stream_pi)

#pragma intrinsic(_mm_stream_ps)
#pragma intrinsic(_mm_sfence)

#pragma intrinsic(_mm_cvtss_si64)
#pragma intrinsic(_mm_cvttss_si64)
#pragma intrinsic(_mm_cvtsi64_ss)
#if !HAS_BUILTIN(_mm_getcsr) && !defined(_MSC_VER)
__INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
{
    return __builtin_ia32_stmxcsr();
}
#endif

#if !HAS_BUILTIN(_mm_setcsr) && !defined(_MSC_VER)
__INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
{
    __builtin_ia32_ldmxcsr(a);
}
#endif
    return (__m128)((__v4sf)__a + (__v4sf)__b);
    return (__m128)((__v4sf)__a - (__v4sf)__b);
    return (__m128)((__v4sf)__a * (__v4sf)__b);
    return (__m128)((__v4sf)__a / (__v4sf)__b);
    return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
    return __builtin_ia32_sqrtps((__v4sf)__a);
    return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
    return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
    return __builtin_ia32_rsqrtss((__v4sf)__a);
    return __builtin_ia32_rsqrtps((__v4sf)__a);
    return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
    return (__m128)((__v4su)__a & (__v4su)__b);
    return (__m128)(~(__v4su)__a & (__v4su)__b);
    return (__m128)((__v4su)__a | (__v4su)__b);
    return (__m128)((__v4su)__a ^ (__v4su)__b);
    return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
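/*
 * Note (added commentary): the scalar compare-greater family has no
 * dedicated builtin, so the bodies below evaluate the swapped compare
 * (b OP a) and splice that scalar result back into lane 0 of a. With
 * __builtin_shufflevector, selector 4 names lane 0 of the second operand;
 * the movss builtin performs the same lane-0 merge.
 */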
    __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);

    return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);

    __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);

    return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
    return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);

    __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);

    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);

    __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);

    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
    return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
    return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
    return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);

    return __builtin_ia32_cvtss2si((__v4sf)__a);
    return __builtin_ia32_cvtss2si64((__v4sf)__a);
    return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
    return __builtin_ia32_cvttss2si((__v4sf)__a);
    return __builtin_ia32_cvttss2si64((__v4sf)__a);
    return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
    return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);

    typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadh_pi_struct {
        __mm_loadh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
    return (__m128)__builtin_ia32_loadhps(__a, __p);

    typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadl_pi_struct {
        __mm_loadl_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
    return (__m128)__builtin_ia32_loadlps(__a, __p);

    return *(const __m128*)__p;

    struct __loadu_ps {
        __m128_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_ps*)__p)->__v;

    return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
    return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));

    return (__m128)__builtin_ia32_undef128();
    __m128 undef = undef;
    return undef;

    return __extension__ (__m128){ __w, 0, 0, 0 };
    return __extension__ (__m128){ __w, __w, __w, __w };
    return __extension__ (__m128){ __w, __x, __y, __z };
    return __extension__ (__m128){ __z, __y, __x, __w };
    return __extension__ (__m128){ 0, 0, 0, 0 };

    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
    __builtin_ia32_storehps(__p, __a);

    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
    __builtin_ia32_storelps(__p, __a);

    *__p = ((__v4sf)__a)[0];
    *(__m128_u *)__p = __a;
    *(__m128*)__p = __a;
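/*
 * Usage sketch (illustrative, not part of the header; __example_store is a
 * hypothetical helper): _mm_store_ps dereferences through __m128 and so
 * requires a 16-byte-aligned pointer, while _mm_storeu_ps goes through the
 * 1-byte-aligned __m128_u type and accepts any address.
 */
static __inline float __example_store(float *unaligned_p, __m128 v)
{
    _MM_ALIGN16 float buf[4];      /* 16-byte-aligned scratch buffer */
    _mm_store_ps(buf, v);          /* OK: buf is 16-byte aligned */
    _mm_storeu_ps(unaligned_p, v); /* OK for any alignment */
    return buf[0];
}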
    __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
    __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));

    __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
    __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));

#define _MM_HINT_NTA_ALT 0
#define _MM_HINT_T0_ALT 3
#define _MM_HINT_T1_ALT 2
#define _MM_HINT_T2_ALT 1
#define _MM_HINT_ENTA_ALT 4

#define _MM_HINT_MS_TO_ALT(sel) \
    (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
     ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
     ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
     ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
     ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)
#pragma intrinsic(_mm_prefetch)
#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))

#define _mm_prefetch(p, sel) \
    __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)
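/*
 * Usage sketch (illustrative, not part of the header; __example_prefetch is
 * a hypothetical helper): because _mm_prefetch is a macro here, the hint
 * must be a compile-time constant.
 */
static __inline void __example_prefetch(const char *p)
{
    _mm_prefetch(p + 64, _MM_HINT_T0);   /* prefetch into all cache levels */
    _mm_prefetch(p + 128, _MM_HINT_NTA); /* non-temporal prefetch */
}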
    __builtin_ia32_movntq((__v1di*)__p, __a);
    __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);

    __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
    __builtin_ia32_movntps(__p, (__v4sf)__a);

#if !HAS_BUILTIN(_mm_sfence) && !defined(_MSC_VER)
    __builtin_ia32_sfence();

#define _m_pextrw(a, n) \
    ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))

#define _m_pinsrw(a, d, n) \
    ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))

    return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
    return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__a, __d, __n);
    return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
    return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
    return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
    return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
    return __builtin_ia32_pmovmskb((__v8qi)__a);
    return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);

#define _m_pshufw(a, n) \
    ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))

    return (__m64)__builtin_ia32_pshufw((__v4hi)__a, __n);
    __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
    return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
    return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
    return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
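/*
 * Usage sketch (illustrative, not part of the header; __example_sad is a
 * hypothetical helper, and _mm_cvtsi64_si32 comes from mmintrin.h):
 * _mm_sad_pu8 sums the absolute differences of eight unsigned bytes into
 * the low word of the result. This only applies on targets where MMX is
 * available, and MMX code should still issue _mm_empty() before returning
 * to x87 floating-point code.
 */
static __inline int __example_sad(__m64 a, __m64 b)
{
    __m64 s = _mm_sad_pu8(a, b);
    return _mm_cvtsi64_si32(s) & 0xffff; /* low 16 bits hold the sum */
}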