ReactOS 0.4.15-dev-7788-g1ad9096
xmmintrin.h
1/*
2 * xmmintrin.h
3 *
4 * This file is part of the ReactOS CRT package.
5 *
6 * Contributors:
7 * Timo Kreuzer (timo.kreuzer@reactos.org)
8 *
9 * THIS SOFTWARE IS NOT COPYRIGHTED
10 *
11 * This source code is offered for use in the public domain. You may
12 * use, modify or distribute it freely.
13 *
14 * This code is distributed in the hope that it will be useful but
15 * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
16 * DISCLAIMED. This includes but is not limited to warranties of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18 *
19 */
20
21#pragma once
22#ifndef _INCLUDED_MM2
23#define _INCLUDED_MM2
24
25#include <mmintrin.h>
26
27#if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
28#define _MM_FUNCTIONALITY
29#endif
30
31#if !defined _VCRT_BUILD && !defined _INC_MALLOC
32//#include <malloc.h> // FIXME: This breaks build
33#endif
34
35#ifdef __cplusplus
36extern "C" {
37#endif
38
39#if defined(_MSC_VER) && !defined(__clang__)
40
41typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128
42{
43 float m128_f32[4];
44 unsigned __int64 m128_u64[2];
45 __int8 m128_i8[16];
46 __int16 m128_i16[8];
47 __int32 m128_i32[4];
48 __int64 m128_i64[2];
49 unsigned __int8 m128_u8[16];
50 unsigned __int16 m128_u16[8];
51 unsigned __int32 m128_u32[4];
52} __m128;
53
54#define __ATTRIBUTE_SSE__
55
56#else /* _MSC_VER */
57
58 typedef float __v4sf __attribute__((__vector_size__(16)));
59 typedef signed int __v4si __attribute__((__vector_size__(16)));
60 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
61 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
62
63 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
64
65#ifdef __clang__
66#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
67#else
68#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
69#endif
70#define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__
71
72#endif /* _MSC_VER */
73
74#define _MM_ALIGN16 _VCRT_ALIGN(16)
75
76/* Constants for use with _mm_prefetch. */
77#define _MM_HINT_NTA 0
78#define _MM_HINT_T0 1
79#define _MM_HINT_T1 2
80#define _MM_HINT_T2 3
81#define _MM_HINT_ENTA 4
82#if 0 // Not supported yet
83#define _MM_HINT_ET0 5
84#define _MM_HINT_ET1 6
85#define _MM_HINT_ET2 7
86#endif
87
88/* Create a selector for use with the SHUFPS instruction. */
89#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
90 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
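/*
 * Example (illustrative): the selector packs four 2-bit source-lane indices,
 * fp0 in the low bits. With _mm_shuffle_ps the two low result lanes are taken
 * from the first operand and the two high lanes from the second, so for any
 * __m128 values a and b:
 *
 *   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)); // selector 0xE4
 *   // r = { a[0], a[1], b[2], b[3] }
 */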
91
92/* Bits in the MXCSR. */
93#define _MM_EXCEPT_MASK 0x003f
94#define _MM_EXCEPT_INVALID 0x0001
95#define _MM_EXCEPT_DENORM 0x0002
96#define _MM_EXCEPT_DIV_ZERO 0x0004
97#define _MM_EXCEPT_OVERFLOW 0x0008
98#define _MM_EXCEPT_UNDERFLOW 0x0010
99#define _MM_EXCEPT_INEXACT 0x0020
100
101#define _MM_MASK_MASK 0x1f80
102#define _MM_MASK_INVALID 0x0080
103#define _MM_MASK_DENORM 0x0100
104#define _MM_MASK_DIV_ZERO 0x0200
105#define _MM_MASK_OVERFLOW 0x0400
106#define _MM_MASK_UNDERFLOW 0x0800
107#define _MM_MASK_INEXACT 0x1000
108
109#define _MM_ROUND_MASK 0x6000
110#define _MM_ROUND_NEAREST 0x0000
111#define _MM_ROUND_DOWN 0x2000
112#define _MM_ROUND_UP 0x4000
113#define _MM_ROUND_TOWARD_ZERO 0x6000
114
115#define _MM_FLUSH_ZERO_MASK 0x8000
116#define _MM_FLUSH_ZERO_ON 0x8000
117#define _MM_FLUSH_ZERO_OFF 0x0000
118
119#ifdef __ICL
120void* __cdecl _mm_malloc(size_t Size, size_t Al);
121void __cdecl _mm_free(void* P);
122#endif
123
124void _mm_prefetch(_In_ char const* p, _In_ int i);
125__m128 _mm_setzero_ps(void);
126__m128 _mm_add_ss(__m128 a, __m128 b);
127__m128 _mm_sub_ss(__m128 a, __m128 b);
128__m128 _mm_mul_ss(__m128 a, __m128 b);
129__m128 _mm_div_ss(__m128 a, __m128 b);
130__m128 _mm_sqrt_ss(__m128 a);
131__m128 _mm_rcp_ss(__m128 a);
132__m128 _mm_rsqrt_ss(__m128 a);
133__m128 _mm_min_ss(__m128 a, __m128 b);
134__m128 _mm_max_ss(__m128 a, __m128 b);
135__m128 _mm_add_ps(__m128 a, __m128 b);
136__m128 _mm_sub_ps(__m128 a, __m128 b);
137__m128 _mm_mul_ps(__m128 a, __m128 b);
138__m128 _mm_div_ps(__m128 a, __m128 b);
139__m128 _mm_sqrt_ps(__m128 a);
140__m128 _mm_rcp_ps(__m128 a);
141__m128 _mm_rsqrt_ps(__m128 a);
142__m128 _mm_min_ps(__m128 a, __m128 b);
143__m128 _mm_max_ps(__m128 a, __m128 b);
144__m128 _mm_and_ps(__m128 a, __m128 b);
145__m128 _mm_andnot_ps(__m128 a, __m128 b);
146__m128 _mm_or_ps(__m128 a, __m128 b);
147__m128 _mm_xor_ps(__m128 a, __m128 b);
148__m128 _mm_cmpeq_ss(__m128 a, __m128 b);
149__m128 _mm_cmplt_ss(__m128 a, __m128 b);
150__m128 _mm_cmple_ss(__m128 a, __m128 b);
151__m128 _mm_cmpgt_ss(__m128 a, __m128 b);
152__m128 _mm_cmpge_ss(__m128 a, __m128 b);
153__m128 _mm_cmpneq_ss(__m128 a, __m128 b);
154__m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
155__m128 _mm_cmpnle_ss(__m128 a, __m128 b);
156__m128 _mm_cmpngt_ss(__m128 a, __m128 b);
157__m128 _mm_cmpnge_ss(__m128 a, __m128 b);
158__m128 _mm_cmpord_ss(__m128 a, __m128 b);
159__m128 _mm_cmpunord_ss(__m128 a, __m128 b);
160__m128 _mm_cmpeq_ps(__m128 a, __m128 b);
161__m128 _mm_cmplt_ps(__m128 a, __m128 b);
162__m128 _mm_cmple_ps(__m128 a, __m128 b);
163__m128 _mm_cmpgt_ps(__m128 a, __m128 b);
164__m128 _mm_cmpge_ps(__m128 a, __m128 b);
165__m128 _mm_cmpneq_ps(__m128 a, __m128 b);
166__m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
167__m128 _mm_cmpnle_ps(__m128 a, __m128 b);
168__m128 _mm_cmpngt_ps(__m128 a, __m128 b);
169__m128 _mm_cmpnge_ps(__m128 a, __m128 b);
170__m128 _mm_cmpord_ps(__m128 a, __m128 b);
171__m128 _mm_cmpunord_ps(__m128 a, __m128 b);
172int _mm_comieq_ss(__m128 a, __m128 b);
173int _mm_comilt_ss(__m128 a, __m128 b);
174int _mm_comile_ss(__m128 a, __m128 b);
175int _mm_comigt_ss(__m128 a, __m128 b);
176int _mm_comige_ss(__m128 a, __m128 b);
177int _mm_comineq_ss(__m128 a, __m128 b);
178int _mm_ucomieq_ss(__m128 a, __m128 b);
179int _mm_ucomilt_ss(__m128 a, __m128 b);
180int _mm_ucomile_ss(__m128 a, __m128 b);
181int _mm_ucomigt_ss(__m128 a, __m128 b);
182int _mm_ucomige_ss(__m128 a, __m128 b);
183int _mm_ucomineq_ss(__m128 a, __m128 b);
184int _mm_cvt_ss2si(__m128 a);
185int _mm_cvtt_ss2si(__m128 a);
186__m128 _mm_cvt_si2ss(__m128 a, int b);
187#ifdef _M_IX86
188__m64 _mm_cvt_ps2pi(__m128 a);
189__m64 _mm_cvtt_ps2pi(__m128 a);
190__m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
191#endif
192__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
193__m128 _mm_unpackhi_ps(__m128 a, __m128 b);
194__m128 _mm_unpacklo_ps(__m128 a, __m128 b);
195__m128 _mm_loadh_pi(__m128 a, __m64 const* p);
196void _mm_storeh_pi(__m64* p, __m128 a);
197__m128 _mm_movehl_ps(__m128 a, __m128 b);
198__m128 _mm_movelh_ps(__m128 a, __m128 b);
199__m128 _mm_loadl_pi(__m128 a, __m64 const* p);
200void _mm_storel_pi(__m64* p, __m128 a);
201int _mm_movemask_ps(__m128 a);
202unsigned int _mm_getcsr(void);
203void _mm_setcsr(unsigned int a);
204__m128 _mm_set_ss(float a);
205__m128 _mm_set_ps1(float a);
206__m128 _mm_load_ss(float const* p);
207__m128 _mm_load_ps1(float const* p);
208__m128 _mm_load_ps(float const* p);
209__m128 _mm_loadu_ps(float const* p);
210__m128 _mm_loadr_ps(float const* p);
211__m128 _mm_set_ps(float e3, float e2, float e1, float e0);
212__m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
213void _mm_store_ss(float* p, __m128 a);
214float _mm_cvtss_f32(__m128 a);
215void _mm_store_ps(float* p, __m128 a);
216void _mm_storeu_ps(float* p, __m128 a);
217void _mm_store_ps1(float* p, __m128 a);
218void _mm_storer_ps(float* p, __m128 a);
219__m128 _mm_move_ss(__m128 a, __m128 b);
220#ifdef _M_IX86
221int _m_pextrw(__m64 a, int imm8);
222__m64 _m_pinsrw(__m64 a, int i, int imm8);
223__m64 _m_pmaxsw(__m64 a, __m64 b);
224__m64 _m_pmaxub(__m64 a, __m64 b);
225__m64 _m_pminsw(__m64 a, __m64 b);
226__m64 _m_pminub(__m64 a, __m64 b);
227int _m_pmovmskb(__m64 a);
228__m64 _m_pmulhuw(__m64 a, __m64 b);
229__m64 _m_pshufw(__m64 a, int imm8);
230void _m_maskmovq(__m64 a, __m64 b, char*);
231__m64 _m_pavgb(__m64 a, __m64 b);
232__m64 _m_pavgw(__m64 a, __m64 b);
233__m64 _m_psadbw(__m64 a, __m64 b);
234void _mm_stream_pi(__m64* p, __m64 a);
235#endif
236void _mm_stream_ps(float* p, __m128 a);
237void _mm_sfence(void);
238#ifdef _M_AMD64
239__int64 _mm_cvtss_si64(__m128 a);
240__int64 _mm_cvttss_si64(__m128 a);
241__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
242#endif
243
244/* Alternate names */
245#define _mm_cvtss_si32 _mm_cvt_ss2si
246#define _mm_cvttss_si32 _mm_cvtt_ss2si
247#define _mm_cvtsi32_ss _mm_cvt_si2ss
248#define _mm_set1_ps _mm_set_ps1
249#define _mm_load1_ps _mm_load_ps1
250#define _mm_store1_ps _mm_store_ps1
251#define _mm_cvtps_pi32 _mm_cvt_ps2pi
252#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
253#define _mm_cvtpi32_ps _mm_cvt_pi2ps
254#define _mm_extract_pi16 _m_pextrw
255#define _mm_insert_pi16 _m_pinsrw
256#define _mm_max_pi16 _m_pmaxsw
257#define _mm_max_pu8 _m_pmaxub
258#define _mm_min_pi16 _m_pminsw
259#define _mm_min_pu8 _m_pminub
260#define _mm_movemask_pi8 _m_pmovmskb
261#define _mm_mulhi_pu16 _m_pmulhuw
262#define _mm_shuffle_pi16 _m_pshufw
263#define _mm_maskmove_si64 _m_maskmovq
264#define _mm_avg_pu8 _m_pavgb
265#define _mm_avg_pu16 _m_pavgw
266#define _mm_sad_pu8 _m_psadbw
267
268#ifdef _M_IX86
269/* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */
270
272static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
273{
274 __m64 __b, __c;
275 __m128 __r;
276
277 __b = _mm_setzero_si64();
278 __b = _mm_cmpgt_pi16(__b, __a);
279 __c = _mm_unpackhi_pi16(__a, __b);
280 __r = _mm_setzero_ps();
281 __r = _mm_cvtpi32_ps(__r, __c);
282 __r = _mm_movelh_ps(__r, __r);
283 __c = _mm_unpacklo_pi16(__a, __b);
284 __r = _mm_cvtpi32_ps(__r, __c);
285
286 return __r;
287}
288
290static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
291{
292 __m64 __b, __c;
293 __m128 __r;
294
295 __b = _mm_setzero_si64();
296 __c = _mm_unpackhi_pi16(__a, __b);
297 __r = _mm_setzero_ps();
298 __r = _mm_cvtpi32_ps(__r, __c);
299 __r = _mm_movelh_ps(__r, __r);
300 __c = _mm_unpacklo_pi16(__a, __b);
301 __r = _mm_cvtpi32_ps(__r, __c);
302
303 return __r;
304}
305
307static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
308{
309 __m64 __b;
310
311 __b = _mm_setzero_si64();
312 __b = _mm_cmpgt_pi8(__b, __a);
313 __b = _mm_unpacklo_pi8(__a, __b);
314
315 return _mm_cvtpi16_ps(__b);
316}
317
319static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
320{
321 __m64 __b;
322
323 __b = _mm_setzero_si64();
324 __b = _mm_unpacklo_pi8(__a, __b);
325
326 return _mm_cvtpi16_ps(__b);
327}
328
330static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
331{
332 __m128 __c;
333
334 __c = _mm_setzero_ps();
335 __c = _mm_cvtpi32_ps(__c, __b);
336 __c = _mm_movelh_ps(__c, __c);
337
338 return _mm_cvtpi32_ps(__c, __a);
339}
340
342static __inline __m64 _mm_cvtps_pi16(__m128 __a)
343{
344 __m64 __b, __c;
345
346 __b = _mm_cvtps_pi32(__a);
347 __a = _mm_movehl_ps(__a, __a);
348 __c = _mm_cvtps_pi32(__a);
349
350 return _mm_packs_pi32(__b, __c);
351}
352
354static __inline __m64 _mm_cvtps_pi8(__m128 __a)
355{
356 __m64 __b, __c;
357
358 __b = _mm_cvtps_pi16(__a);
359 __c = _mm_setzero_si64();
360
361 return _mm_packs_pi16(__b, __c);
362}
363
364#endif /* _M_IX86 */
365
366/* Transpose the 4x4 matrix composed of row[0-3]. */
367#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
368do { \
369 __m128 t0 = _mm_unpacklo_ps(row0, row1); \
370 __m128 t1 = _mm_unpacklo_ps(row2, row3); \
371 __m128 t2 = _mm_unpackhi_ps(row0, row1); \
372 __m128 t3 = _mm_unpackhi_ps(row2, row3); \
373 (row0) = _mm_movelh_ps(t0, t1); \
374 (row1) = _mm_movehl_ps(t1, t0); \
375 (row2) = _mm_movelh_ps(t2, t3); \
376 (row3) = _mm_movehl_ps(t3, t2); \
377} while (0)
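/*
 * Example (illustrative): transposing a row-major float m[4][4] held in
 * registers, where m is any suitably initialized matrix:
 *
 *   __m128 r0 = _mm_loadu_ps(&m[0][0]);
 *   __m128 r1 = _mm_loadu_ps(&m[1][0]);
 *   __m128 r2 = _mm_loadu_ps(&m[2][0]);
 *   __m128 r3 = _mm_loadu_ps(&m[3][0]);
 *   _MM_TRANSPOSE4_PS(r0, r1, r2, r3); // r0 now holds column 0 of m, etc.
 */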
378
379#define _MM_GET_EXCEPTION_STATE() \
380 (_mm_getcsr() & _MM_EXCEPT_MASK)
381
382#define _MM_GET_EXCEPTION_MASK() \
383 (_mm_getcsr() & _MM_MASK_MASK)
384
385#define _MM_GET_ROUNDING_MODE() \
386 (_mm_getcsr() & _MM_ROUND_MASK)
387
388#define _MM_GET_FLUSH_ZERO_MODE() \
389 (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
390
391#define _MM_SET_EXCEPTION_STATE(__mask) \
392 _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))
393
394#define _MM_SET_EXCEPTION_MASK(__mask) \
395 _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))
396
397#define _MM_SET_ROUNDING_MODE(__mode) \
398 _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))
399
400#define _MM_SET_FLUSH_ZERO_MODE(__mode) \
401 _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))
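/*
 * Example (illustrative): the macros above read-modify-write MXCSR, so one
 * field can be changed while the others are preserved:
 *
 *   unsigned int saved_csr = _mm_getcsr();
 *   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
 *   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 *   // ... SSE computations ...
 *   _mm_setcsr(saved_csr); // restore the previous control/status state
 */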
402
403/* Use intrinsics on MSVC */
404#if defined(_MSC_VER) && !defined(__clang__)
405#pragma intrinsic(_mm_prefetch)
406#pragma intrinsic(_mm_setzero_ps)
407#pragma intrinsic(_mm_add_ss)
408#pragma intrinsic(_mm_sub_ss)
409#pragma intrinsic(_mm_mul_ss)
410#pragma intrinsic(_mm_div_ss)
411#pragma intrinsic(_mm_sqrt_ss)
412#pragma intrinsic(_mm_rcp_ss)
413#pragma intrinsic(_mm_rsqrt_ss)
414#pragma intrinsic(_mm_min_ss)
415#pragma intrinsic(_mm_max_ss)
416#pragma intrinsic(_mm_add_ps)
417#pragma intrinsic(_mm_sub_ps)
418#pragma intrinsic(_mm_mul_ps)
419#pragma intrinsic(_mm_div_ps)
420#pragma intrinsic(_mm_sqrt_ps)
421#pragma intrinsic(_mm_rcp_ps)
422#pragma intrinsic(_mm_rsqrt_ps)
423#pragma intrinsic(_mm_min_ps)
424#pragma intrinsic(_mm_max_ps)
425#pragma intrinsic(_mm_and_ps)
426#pragma intrinsic(_mm_andnot_ps)
427#pragma intrinsic(_mm_or_ps)
428#pragma intrinsic(_mm_xor_ps)
429#pragma intrinsic(_mm_cmpeq_ss)
430#pragma intrinsic(_mm_cmplt_ss)
431#pragma intrinsic(_mm_cmple_ss)
432#pragma intrinsic(_mm_cmpgt_ss)
433#pragma intrinsic(_mm_cmpge_ss)
434#pragma intrinsic(_mm_cmpneq_ss)
435#pragma intrinsic(_mm_cmpnlt_ss)
436#pragma intrinsic(_mm_cmpnle_ss)
437#pragma intrinsic(_mm_cmpngt_ss)
438#pragma intrinsic(_mm_cmpnge_ss)
439#pragma intrinsic(_mm_cmpord_ss)
440#pragma intrinsic(_mm_cmpunord_ss)
441#pragma intrinsic(_mm_cmpeq_ps)
442#pragma intrinsic(_mm_cmplt_ps)
443#pragma intrinsic(_mm_cmple_ps)
444#pragma intrinsic(_mm_cmpgt_ps)
445#pragma intrinsic(_mm_cmpge_ps)
446#pragma intrinsic(_mm_cmpneq_ps)
447#pragma intrinsic(_mm_cmpnlt_ps)
448#pragma intrinsic(_mm_cmpnle_ps)
449#pragma intrinsic(_mm_cmpngt_ps)
450#pragma intrinsic(_mm_cmpnge_ps)
451#pragma intrinsic(_mm_cmpord_ps)
452#pragma intrinsic(_mm_cmpunord_ps)
453#pragma intrinsic(_mm_comieq_ss)
454#pragma intrinsic(_mm_comilt_ss)
455#pragma intrinsic(_mm_comile_ss)
456#pragma intrinsic(_mm_comigt_ss)
457#pragma intrinsic(_mm_comige_ss)
458#pragma intrinsic(_mm_comineq_ss)
459#pragma intrinsic(_mm_ucomieq_ss)
460#pragma intrinsic(_mm_ucomilt_ss)
461#pragma intrinsic(_mm_ucomile_ss)
462#pragma intrinsic(_mm_ucomigt_ss)
463#pragma intrinsic(_mm_ucomige_ss)
464#pragma intrinsic(_mm_ucomineq_ss)
465#pragma intrinsic(_mm_cvt_ss2si)
466#pragma intrinsic(_mm_cvtt_ss2si)
467#pragma intrinsic(_mm_cvt_si2ss)
468#ifdef _M_IX86
469#pragma intrinsic(_mm_cvt_ps2pi)
470#pragma intrinsic(_mm_cvtt_ps2pi)
471#pragma intrinsic(_mm_cvt_pi2ps)
472#endif // _M_IX86
473#pragma intrinsic(_mm_shuffle_ps)
474#pragma intrinsic(_mm_unpackhi_ps)
475#pragma intrinsic(_mm_unpacklo_ps)
476#pragma intrinsic(_mm_loadh_pi)
477#pragma intrinsic(_mm_storeh_pi)
478#pragma intrinsic(_mm_movehl_ps)
479#pragma intrinsic(_mm_movelh_ps)
480#pragma intrinsic(_mm_loadl_pi)
481#pragma intrinsic(_mm_storel_pi)
482#pragma intrinsic(_mm_movemask_ps)
483#pragma intrinsic(_mm_getcsr)
484#pragma intrinsic(_mm_setcsr)
485#pragma intrinsic(_mm_set_ss)
486#pragma intrinsic(_mm_set_ps1)
487#pragma intrinsic(_mm_load_ss)
488#pragma intrinsic(_mm_load_ps1)
489#pragma intrinsic(_mm_load_ps)
490#pragma intrinsic(_mm_loadu_ps)
491#pragma intrinsic(_mm_loadr_ps)
492#pragma intrinsic(_mm_set_ps)
493#pragma intrinsic(_mm_setr_ps)
494#pragma intrinsic(_mm_store_ss)
495#pragma intrinsic(_mm_cvtss_f32)
496#pragma intrinsic(_mm_store_ps)
497#pragma intrinsic(_mm_storeu_ps)
498#pragma intrinsic(_mm_store_ps1)
499#pragma intrinsic(_mm_storer_ps)
500#pragma intrinsic(_mm_move_ss)
501#ifdef _M_IX86
502#pragma intrinsic(_m_pextrw)
503#pragma intrinsic(_m_pinsrw)
504#pragma intrinsic(_m_pmaxsw)
505#pragma intrinsic(_m_pmaxub)
506#pragma intrinsic(_m_pminsw)
507#pragma intrinsic(_m_pminub)
508#pragma intrinsic(_m_pmovmskb)
509#pragma intrinsic(_m_pmulhuw)
510#pragma intrinsic(_m_pshufw)
511#pragma intrinsic(_m_maskmovq)
512#pragma intrinsic(_m_pavgb)
513#pragma intrinsic(_m_pavgw)
514#pragma intrinsic(_m_psadbw)
515#pragma intrinsic(_mm_stream_pi)
516#endif // _M_IX86
517#pragma intrinsic(_mm_stream_ps)
518#pragma intrinsic(_mm_sfence)
519#ifdef _M_AMD64
520#pragma intrinsic(_mm_cvtss_si64)
521#pragma intrinsic(_mm_cvttss_si64)
522#pragma intrinsic(_mm_cvtsi64_ss)
523#endif // _M_AMD64
524
525#else /* _MSC_VER */
526
527/*
528 GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h
529 Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h
530*/
531
532/* Use inline functions on GCC/Clang */
533
534#if !HAS_BUILTIN(_mm_getcsr)
535__INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
536{
537 return __builtin_ia32_stmxcsr();
538}
539#endif
540
541#if !HAS_BUILTIN(_mm_setcsr)
542__INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
543{
544 __builtin_ia32_ldmxcsr(a);
545}
546#endif
547
548__INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b)
549{
550 __a[0] += __b[0];
551 return __a;
552}
553
554__INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b)
555{
556 return (__m128)((__v4sf)__a + (__v4sf)__b);
557}
558
559__INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b)
560{
561 __a[0] -= __b[0];
562 return __a;
563}
564
565__INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b)
566{
567 return (__m128)((__v4sf)__a - (__v4sf)__b);
568}
569
570__INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b)
571{
572 __a[0] *= __b[0];
573 return __a;
574}
575
576__INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b)
577{
578 return (__m128)((__v4sf)__a * (__v4sf)__b);
579}
580
581__INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b)
582{
583 __a[0] /= __b[0];
584 return __a;
585}
586
587__INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b)
588{
589 return (__m128)((__v4sf)__a / (__v4sf)__b);
590}
591
592__INTRIN_INLINE_SSE __m128 _mm_sqrt_ss(__m128 __a)
593{
594 return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
595}
596
597__INTRIN_INLINE_SSE __m128 _mm_sqrt_ps(__m128 __a)
598{
599 return __builtin_ia32_sqrtps((__v4sf)__a);
600}
601
602__INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a)
603{
604 return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
605}
606
607__INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a)
608{
609 return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
610}
611
612__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ss(__m128 __a)
613{
614 return __builtin_ia32_rsqrtss((__v4sf)__a);
615}
616
617__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ps(__m128 __a)
618{
619 return __builtin_ia32_rsqrtps((__v4sf)__a);
620}
621
622__INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b)
623{
624 return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
625}
626
627__INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b)
628{
629 return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
630}
631
632__INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b)
633{
634 return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
635}
636
637__INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b)
638{
639 return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
640}
641
642__INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b)
643{
644 return (__m128)((__v4su)__a & (__v4su)__b);
645}
646
647__INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b)
648{
649 return (__m128)(~(__v4su)__a & (__v4su)__b);
650}
651
652__INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b)
653{
654 return (__m128)((__v4su)__a | (__v4su)__b);
655}
656
657__INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b)
658{
659 return (__m128)((__v4su)__a ^ (__v4su)__b);
660}
661
662__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b)
663{
664 return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
665}
666
667__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b)
668{
669 return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
670}
671
672__INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b)
673{
674 return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
675}
676
677__INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b)
678{
679 return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
680}
681
682__INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b)
683{
684 return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
685}
686
687__INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b)
688{
689 return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
690}
691
692__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b)
693{
694 __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
695#ifdef __clang__
696 return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
697#else
698 return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
699#endif
700}
701
702__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b)
703{
704 return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
705}
706
707__INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b)
708{
709 __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
710#ifdef __clang__
711 return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
712#else
713 return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
714#endif
715}
716
717__INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b)
718{
719 return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
720}
721
722__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b)
723{
724 return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
725}
726
727__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b)
728{
729 return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
730}
731
732__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
733{
734 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
735}
736
737__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
738{
739 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
740}
741
742__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b)
743{
744 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
745}
746
747__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b)
748{
749 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
750}
751
752__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b)
753{
754 __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
755#ifdef __clang__
756 return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
757#else
758 return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
759#endif
760}
761
762__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b)
763{
764 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
765}
766
767__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b)
768{
769 __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
770#ifdef __clang__
771 return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
772#else
773 return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
774#endif
775}
776
777__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b)
778{
779 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
780}
781
782__INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b)
783{
784 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
785}
786
787__INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b)
788{
789 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
790}
791
792__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b)
793{
794 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
795}
796
797__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b)
798{
799 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
800}
801
802__INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b)
803{
804 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
805}
806
807__INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b)
808{
809 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
810}
811
812__INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b)
813{
814 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
815}
816
817__INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b)
818{
819 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
820}
821
822__INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b)
823{
824 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
825}
826
827__INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b)
828{
829 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
830}
831
832__INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b)
833{
834 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
835}
836
837__INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b)
838{
839 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
840}
841
842__INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b)
843{
844 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
845}
846
847__INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b)
848{
849 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
850}
851
852__INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b)
853{
854 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
855}
856
857__INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b)
858{
859 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
860}
861
862// _mm_cvt_ss2si
863__INTRIN_INLINE_SSE int _mm_cvtss_si32(__m128 __a)
864{
865 return __builtin_ia32_cvtss2si((__v4sf)__a);
866}
867
868#ifdef _M_AMD64
869__INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a)
870{
871 return __builtin_ia32_cvtss2si64((__v4sf)__a);
872}
873#endif
874
875// _mm_cvt_ps2pi
876__INTRIN_INLINE_SSE __m64 _mm_cvtps_pi32(__m128 __a)
877{
878 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
879}
880
881// _mm_cvtt_ss2si
882__INTRIN_INLINE_SSE int _mm_cvttss_si32(__m128 __a)
883{
884 return __builtin_ia32_cvttss2si((__v4sf)__a);
885}
886
887#ifdef _M_AMD64
888__INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a)
889{
890 return __builtin_ia32_cvttss2si64((__v4sf)__a);
891}
892#endif
893
894// _mm_cvtt_ps2pi
895__INTRIN_INLINE_SSE __m64 _mm_cvttps_pi32(__m128 __a)
896{
897 return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
898}
899
900// _mm_cvt_si2ss
901__INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b)
902{
903 __a[0] = __b;
904 return __a;
905}
906
907#ifdef _M_AMD64
908__INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b)
909{
910 __a[0] = __b;
911 return __a;
912}
913#endif
914
915// _mm_cvt_pi2ps
916__INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
917{
918 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
919}
920
921__INTRIN_INLINE_SSE float _mm_cvtss_f32(__m128 __a)
922{
923 return __a[0];
924}
925
926__INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p)
927{
928#ifdef __clang__
929 typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
930 struct __mm_loadh_pi_struct {
931 __mm_loadh_pi_v2f32 __u;
932 } __attribute__((__packed__, __may_alias__));
933 __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
934 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
935 return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
936#else
937 return (__m128)__builtin_ia32_loadhps(__a, __p);
938#endif
939}
940
941__INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p)
942{
943#ifdef __clang__
944 typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
945 struct __mm_loadl_pi_struct {
946 __mm_loadl_pi_v2f32 __u;
947 } __attribute__((__packed__, __may_alias__));
948 __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
949 __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
950 return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
951#else
952 return (__m128)__builtin_ia32_loadlps(__a, __p);
953#endif
954}
955
956__INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p)
957{
958 return _mm_set_ss(*__p);
959}
960
961// _mm_load_ps1
962__INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p)
963{
964 return _mm_set1_ps(*__p);
965}
966
967__INTRIN_INLINE_SSE __m128 _mm_load_ps(const float *__p)
968{
969 return *(const __m128*)__p;
970}
971
972__INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p)
973{
974 struct __loadu_ps {
975 __m128_u __v;
976 } __attribute__((__packed__, __may_alias__));
977 return ((const struct __loadu_ps*)__p)->__v;
978}
979
980__INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p)
981{
982 __m128 __a = _mm_load_ps(__p);
983#ifdef __clang__
984 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
985#else
986 return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
987#endif
988}
989
990__INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void)
991{
992#ifdef __clang__
993 return (__m128)__builtin_ia32_undef128();
994#else
995 __m128 undef = undef;
996 return undef;
997#endif
998}
999
1000__INTRIN_INLINE_SSE __m128 _mm_set_ss(float __w)
1001{
1002 return __extension__ (__m128){ __w, 0, 0, 0 };
1003}
1004
1005// _mm_set_ps1
1006__INTRIN_INLINE_SSE __m128 _mm_set1_ps(float __w)
1007{
1008 return __extension__ (__m128){ __w, __w, __w, __w };
1009}
1010
1011__INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w)
1012{
1013 return __extension__ (__m128){ __w, __x, __y, __z };
1014}
1015
1016__INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w)
1017{
1018 return __extension__ (__m128){ __z, __y, __x, __w };
1019}
1020
1021__INTRIN_INLINE_SSE __m128 _mm_setzero_ps(void)
1022{
1023 return __extension__ (__m128){ 0, 0, 0, 0 };
1024}
1025
1026__INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a)
1027{
1028#ifdef __clang__
1029 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1030 struct __mm_storeh_pi_struct {
1031 __mm_storeh_pi_v2f32 __u;
1032 } __attribute__((__packed__, __may_alias__));
1033 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1034#else
1035 __builtin_ia32_storehps(__p, __a);
1036#endif
1037}
1038
1039__INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a)
1040{
1041#ifdef __clang__
1042 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1043 struct __mm_storeh_pi_struct {
1044 __mm_storeh_pi_v2f32 __u;
1045 } __attribute__((__packed__, __may_alias__));
1046 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1047#else
1048 __builtin_ia32_storelps(__p, __a);
1049#endif
1050}
1051
1052__INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a)
1053{
1054 *__p = ((__v4sf)__a)[0];
1055}
1056
1057__INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a)
1058{
1059 *(__m128_u *)__p = __a;
1060}
1061
1062__INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a)
1063{
1064 *(__m128*)__p = __a;
1065}
1066
1067// _mm_store_ps1
1068__INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a)
1069{
1070 // FIXME: Should we use a temp instead?
1071#ifdef __clang__
1072 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
1073#else
1074 __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));
1075#endif
1076 _mm_store_ps(__p, __a);
1077}
1078
1079__INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a)
1080{
1081#ifdef __clang__
1082 __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1083#else
1084 __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
1085#endif
1086 _mm_store_ps(__p, __tmp);
1087}
1088
1089/* GCC / Clang specific constants */
1090#define _MM_HINT_NTA_ALT 0
1091#define _MM_HINT_T0_ALT 3
1092#define _MM_HINT_T1_ALT 2
1093#define _MM_HINT_T2_ALT 1
1094#define _MM_HINT_ENTA_ALT 4
1095
1096// These are not supported yet
1097//#define _MM_HINT_ET0_ALT 7
1098//#define _MM_HINT_ET1_ALT 6
1099//#define _MM_HINT_ET2_ALT 5
1100
1101#define _MM_HINT_MS_TO_ALT(sel) \
1102 (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
1103 ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
1104 ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
1105 ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
1106 ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)
1107
1108#ifdef _MSC_VER
1109
1110/* On clang-cl we have an intrinsic, but the constants are different */
1111#pragma intrinsic(_mm_prefetch)
1112#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))
1113
1114#else /* _MSC_VER */
1115
1116#define _mm_prefetch(p, sel) \
1117 __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)
1118
1119#endif /* _MSC_VER */
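/*
 * Example (illustrative): on GCC/Clang the MS-style hint constant is remapped
 * by _MM_HINT_MS_TO_ALT before reaching the builtin, e.g. for some pointer p:
 *
 *   _mm_prefetch((char const *)p, _MM_HINT_T0);
 *   // becomes __builtin_prefetch(p, 0, 3) with the macro defined above
 */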
1120
1121__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
1122{
1123#ifdef __clang__
1124 __builtin_ia32_movntq((__v1di*)__p, __a);
1125#else
1126 __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);
1127#endif
1128}
1129
1130__INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a)
1131{
1132#ifdef __clang__
1133 __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
1134#else
1135 __builtin_ia32_movntps(__p, (__v4sf)__a);
1136#endif
1137}
1138
1139#if !HAS_BUILTIN(_mm_sfence)
1140__INTRIN_INLINE_SSE void _mm_sfence(void)
1141{
1142 __builtin_ia32_sfence();
1143}
1144#endif
1145
1146#ifdef __clang__
1147#define _m_pextrw(a, n) \
1148 ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
1149
1150#define _m_pinsrw(a, d, n) \
1151 ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
1152#else
1153// _m_pextrw
1154__INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n)
1155{
1156 return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
1157}
1158
1159// _m_pinsrw
1160__INTRIN_INLINE_SSE __m64 _mm_insert_pi16 (__m64 const __a, int const __d, int const __n)
1161{
1162 return (__m64)__builtin_ia32_vec_set_v4hi ((__v4hi)__a, __d, __n);
1163}
1164
1165#endif
1166
1167// _m_pmaxsw
1168__INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b)
1169{
1170 return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
1171}
1172
1173// _m_pmaxub
1174__INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b)
1175{
1176 return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
1177}
1178
1179// _m_pminsw
1180__INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b)
1181{
1182 return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
1183}
1184
1185// _m_pminub
1186__INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b)
1187{
1188 return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
1189}
1190
1191// _m_pmovmskb
1192__INTRIN_INLINE_SSE int _mm_movemask_pi8(__m64 __a)
1193{
1194 return __builtin_ia32_pmovmskb((__v8qi)__a);
1195}
1196
1197// _m_pmulhuw
1198__INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b)
1199{
1200 return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
1201}
1202
1203#ifdef __clang__
1204#define _m_pshufw(a, n) \
1205 ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
1206#else
1207// _m_pshufw
1208__INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16 (__m64 __a, int const __n)
1209{
1210 return (__m64) __builtin_ia32_pshufw ((__v4hi)__a, __n);
1211}
1212#endif
1213
1214// _m_maskmovq
1215__INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
1216{
1217 __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
1218}
1219
1220// _m_pavgb
1221__INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b)
1222{
1223 return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
1224}
1225
1226// _m_pavgw
1227__INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b)
1228{
1229 return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
1230}
1231
1232// _m_psadbw
1233__INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b)
1234{
1235 return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
1236}
1237
1238#endif // __GNUC__
1239
1240#ifdef __cplusplus
1241}
1242#endif // __cplusplus
1243
1244#endif /* _INCLUDED_MM2 */