/* ReactOS 0.4.15-dev-5122-g72bdbdd — xmmintrin.h (SSE intrinsics) */
1 /*
2  * xmmintrin.h
3  *
4  * This file is part of the ReactOS CRT package.
5  *
6  * Contributors:
7  * Timo Kreuzer (timo.kreuzer@reactos.org)
8  *
9  * THIS SOFTWARE IS NOT COPYRIGHTED
10  *
11  * This source code is offered for use in the public domain. You may
12  * use, modify or distribute it freely.
13  *
14  * This code is distributed in the hope that it will be useful but
15  * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
16  * DISCLAIMED. This includes but is not limited to warranties of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18  *
19  */
20 
21 #pragma once
22 #ifndef _INCLUDED_MM2
23 #define _INCLUDED_MM2
24 
25 #include <mmintrin.h>
26 
27 #if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
28 #define _MM_FUNCTIONALITY
29 #endif
30 
31 #if !defined _VCRT_BUILD && !defined _INC_MALLOC
32 //#include <malloc.h> // FIXME: This breaks build
33 #endif
34 
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38 
39 #if defined(_MSC_VER) && !defined(__clang__)
40 
41 typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128
42 {
43  float m128_f32[4];
44  unsigned __int64 m128_u64[2];
45  __int8 m128_i8[16];
46  __int16 m128_i16[8];
47  __int32 m128_i32[4];
48  __int64 m128_i64[2];
49  unsigned __int8 m128_u8[16];
50  unsigned __int16 m128_u16[8];
51  unsigned __int32 m128_u32[4];
52 } __m128;
53 
54 #define __ATTRIBUTE_SSE__
55 
56 #else /* _MSC_VER */
57 
58  typedef float __v4sf __attribute__((__vector_size__(16)));
59  typedef signed int __v4si __attribute__((__vector_size__(16)));
60  typedef unsigned int __v4su __attribute__((__vector_size__(16)));
61  typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
62 
63  typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
64 
65 #ifdef __clang__
66 #define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
67 #else
68 #define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
69 #endif
70 #define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__
71 
72 #endif /* _MSC_VER */
73 
74 #define _MM_ALIGN16 _VCRT_ALIGN(16)
75 
76 /* Constants for use with _mm_prefetch. */
77 #define _MM_HINT_NTA 0
78 #define _MM_HINT_T0 1
79 #define _MM_HINT_T1 2
80 #define _MM_HINT_T2 3
81 #define _MM_HINT_ENTA 4
82 #if 0 // Not supported yet
83 #define _MM_HINT_ET0 5
84 #define _MM_HINT_ET1 6
85 #define _MM_HINT_ET2 7
86 #endif
87 
88 /* Create a selector for use with the SHUFPS instruction. */
89 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
90  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
91 
92 /* Bits in the MXCSR. */
93 #define _MM_EXCEPT_MASK 0x003f
94 #define _MM_EXCEPT_INVALID 0x0001
95 #define _MM_EXCEPT_DENORM 0x0002
96 #define _MM_EXCEPT_DIV_ZERO 0x0004
97 #define _MM_EXCEPT_OVERFLOW 0x0008
98 #define _MM_EXCEPT_UNDERFLOW 0x0010
99 #define _MM_EXCEPT_INEXACT 0x0020
100 
101 #define _MM_MASK_MASK 0x1f80
102 #define _MM_MASK_INVALID 0x0080
103 #define _MM_MASK_DENORM 0x0100
104 #define _MM_MASK_DIV_ZERO 0x0200
105 #define _MM_MASK_OVERFLOW 0x0400
106 #define _MM_MASK_UNDERFLOW 0x0800
107 #define _MM_MASK_INEXACT 0x1000
108 
109 #define _MM_ROUND_MASK 0x6000
110 #define _MM_ROUND_NEAREST 0x0000
111 #define _MM_ROUND_DOWN 0x2000
112 #define _MM_ROUND_UP 0x4000
113 #define _MM_ROUND_TOWARD_ZERO 0x6000
114 
115 #define _MM_FLUSH_ZERO_MASK 0x8000
116 #define _MM_FLUSH_ZERO_ON 0x8000
117 #define _MM_FLUSH_ZERO_OFF 0x0000
118 
119 #ifdef __ICL
120 void* __cdecl _mm_malloc(size_t Size, size_t Al);
121 void __cdecl _mm_free(void* P);
122 #endif
123 
124 void _mm_prefetch(_In_ char const* p, _In_ int i);
125 __m128 _mm_setzero_ps(void);
126 __m128 _mm_add_ss(__m128 a, __m128 b);
127 __m128 _mm_sub_ss(__m128 a, __m128 b);
128 __m128 _mm_mul_ss(__m128 a, __m128 b);
129 __m128 _mm_div_ss(__m128 a, __m128 b);
130 __m128 _mm_sqrt_ss(__m128 a);
131 __m128 _mm_rcp_ss(__m128 a);
132 __m128 _mm_rsqrt_ss(__m128 a);
133 __m128 _mm_min_ss(__m128 a, __m128 b);
134 __m128 _mm_max_ss(__m128 a, __m128 b);
135 __m128 _mm_add_ps(__m128 a, __m128 b);
136 __m128 _mm_sub_ps(__m128 a, __m128 b);
137 __m128 _mm_mul_ps(__m128 a, __m128 b);
138 __m128 _mm_div_ps(__m128 a, __m128 b);
139 __m128 _mm_sqrt_ps(__m128 a);
140 __m128 _mm_rcp_ps(__m128 a);
141 __m128 _mm_rsqrt_ps(__m128 a);
142 __m128 _mm_min_ps(__m128 a, __m128 b);
143 __m128 _mm_max_ps(__m128 a, __m128 b);
144 __m128 _mm_and_ps(__m128 a, __m128 b);
145 __m128 _mm_andnot_ps(__m128 a, __m128 b);
146 __m128 _mm_or_ps(__m128 a, __m128 b);
147 __m128 _mm_xor_ps(__m128 a, __m128 b);
148 __m128 _mm_cmpeq_ss(__m128 a, __m128 b);
149 __m128 _mm_cmplt_ss(__m128 a, __m128 b);
150 __m128 _mm_cmple_ss(__m128 a, __m128 b);
151 __m128 _mm_cmpgt_ss(__m128 a, __m128 b);
152 __m128 _mm_cmpge_ss(__m128 a, __m128 b);
153 __m128 _mm_cmpneq_ss(__m128 a, __m128 b);
154 __m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
155 __m128 _mm_cmpnle_ss(__m128 a, __m128 b);
156 __m128 _mm_cmpngt_ss(__m128 a, __m128 b);
157 __m128 _mm_cmpnge_ss(__m128 a, __m128 b);
158 __m128 _mm_cmpord_ss(__m128 a, __m128 b);
159 __m128 _mm_cmpunord_ss(__m128 a, __m128 b);
160 __m128 _mm_cmpeq_ps(__m128 a, __m128 b);
161 __m128 _mm_cmplt_ps(__m128 a, __m128 b);
162 __m128 _mm_cmple_ps(__m128 a, __m128 b);
163 __m128 _mm_cmpgt_ps(__m128 a, __m128 b);
164 __m128 _mm_cmpge_ps(__m128 a, __m128 b);
165 __m128 _mm_cmpneq_ps(__m128 a, __m128 b);
166 __m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
167 __m128 _mm_cmpnle_ps(__m128 a, __m128 b);
168 __m128 _mm_cmpngt_ps(__m128 a, __m128 b);
169 __m128 _mm_cmpnge_ps(__m128 a, __m128 b);
170 __m128 _mm_cmpord_ps(__m128 a, __m128 b);
171 __m128 _mm_cmpunord_ps(__m128 a, __m128 b);
172 int _mm_comieq_ss(__m128 a, __m128 b);
173 int _mm_comilt_ss(__m128 a, __m128 b);
174 int _mm_comile_ss(__m128 a, __m128 b);
175 int _mm_comigt_ss(__m128 a, __m128 b);
176 int _mm_comige_ss(__m128 a, __m128 b);
177 int _mm_comineq_ss(__m128 a, __m128 b);
178 int _mm_ucomieq_ss(__m128 a, __m128 b);
179 int _mm_ucomilt_ss(__m128 a, __m128 b);
180 int _mm_ucomile_ss(__m128 a, __m128 b);
181 int _mm_ucomigt_ss(__m128 a, __m128 b);
182 int _mm_ucomige_ss(__m128 a, __m128 b);
183 int _mm_ucomineq_ss(__m128 a, __m128 b);
184 int _mm_cvt_ss2si(__m128 a);
185 int _mm_cvtt_ss2si(__m128 a);
186 __m128 _mm_cvt_si2ss(__m128 a, int b);
187 #ifdef _M_IX86
188 __m64 _mm_cvt_ps2pi(__m128 a);
189 __m64 _mm_cvtt_ps2pi(__m128 a);
190 __m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
191 #endif
192 __m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
193 __m128 _mm_unpackhi_ps(__m128 a, __m128 b);
194 __m128 _mm_unpacklo_ps(__m128 a, __m128 b);
195 __m128 _mm_loadh_pi(__m128 a, __m64 const* p);
196 void _mm_storeh_pi(__m64* p, __m128 a);
197 __m128 _mm_movehl_ps(__m128 a, __m128 b);
198 __m128 _mm_movelh_ps(__m128 a, __m128 b);
199 __m128 _mm_loadl_pi(__m128 a, __m64 const* p);
200 void _mm_storel_pi(__m64* p, __m128 a);
201 int _mm_movemask_ps(__m128 a);
202 unsigned int _mm_getcsr(void);
203 void _mm_setcsr(unsigned int a);
204 __m128 _mm_set_ss(float a);
205 __m128 _mm_set_ps1(float a);
206 __m128 _mm_load_ss(float const* p);
207 __m128 _mm_load_ps1(float const* p);
208 __m128 _mm_load_ps(float const* p);
209 __m128 _mm_loadu_ps(float const* p);
210 __m128 _mm_loadr_ps(float const* p);
211 __m128 _mm_set_ps(float e3, float e2, float e1, float e0);
212 __m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
213 void _mm_store_ss(float* p, __m128 a);
214 float _mm_cvtss_f32(__m128 a);
215 void _mm_store_ps(float* p, __m128 a);
216 void _mm_storeu_ps(float* p, __m128 a);
217 void _mm_store_ps1(float* p, __m128 a);
218 void _mm_storer_ps(float* p, __m128 a);
219 __m128 _mm_move_ss(__m128 a, __m128 b);
220 #ifdef _M_IX86
221 int _m_pextrw(__m64 a, int imm8);
222 __m64 _m_pinsrw(__m64 a, int i, int imm8);
223 __m64 _m_pmaxsw(__m64 a, __m64 b);
224 __m64 _m_pmaxub(__m64 a, __m64 b);
225 __m64 _m_pminsw(__m64 a, __m64 b);
226 __m64 _m_pminub(__m64 a, __m64 b);
227 int _m_pmovmskb(__m64 a);
228 __m64 _m_pmulhuw(__m64 a, __m64 b);
229 __m64 _m_pshufw(__m64 a, int imm8);
230 void _m_maskmovq(__m64 a, __m64 b, char*);
231 __m64 _m_pavgb(__m64 a, __m64 b);
232 __m64 _m_pavgw(__m64 a, __m64 b);
233 __m64 _m_psadbw(__m64 a, __m64 b);
234 void _mm_stream_pi(__m64* p, __m64 a);
235 #endif
236 void _mm_stream_ps(float* p, __m128 a);
237 void _mm_sfence(void);
238 #ifdef _M_AMD64
239 __int64 _mm_cvtss_si64(__m128 a);
240 __int64 _mm_cvttss_si64(__m128 a);
241 __m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
242 #endif
243 
/* Alternate names */
#define _mm_cvtss_si32 _mm_cvt_ss2si
#define _mm_cvttss_si32 _mm_cvtt_ss2si
#define _mm_cvtsi32_ss _mm_cvt_si2ss
#define _mm_set1_ps _mm_set_ps1
/* Fixed: was "_mm_load_ps1f", which is not a declared intrinsic.
   _mm_load1_ps is the documented alias of _mm_load_ps1. */
#define _mm_load1_ps _mm_load_ps1
#define _mm_store1_ps _mm_store_ps1
#define _mm_cvtps_pi32 _mm_cvt_ps2pi
#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
#define _mm_cvtpi32_ps _mm_cvt_pi2ps
#define _mm_extract_pi16 _m_pextrw
#define _mm_insert_pi16 _m_pinsrw
#define _mm_max_pi16 _m_pmaxsw
#define _mm_max_pu8 _m_pmaxub
#define _mm_min_pi16 _m_pminsw
#define _mm_min_pu8 _m_pminub
#define _mm_movemask_pi8 _m_pmovmskb
#define _mm_mulhi_pu16 _m_pmulhuw
#define _mm_shuffle_pi16 _m_pshufw
#define _mm_maskmove_si64 _m_maskmovq
#define _mm_avg_pu8 _m_pavgb
#define _mm_avg_pu16 _m_pavgw
#define _mm_sad_pu8 _m_psadbw
267 
#ifdef _M_IX86
/* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */

/* Convert 4 packed signed 16-bit ints to 4 packed floats.
   Sign bits are materialized via cmpgt so unpack produces 32-bit values. */
static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi16(__b, __a);     /* 0xFFFF where __a element < 0 */
    __c = _mm_unpackhi_pi16(__a, __b);  /* sign-extend upper two to 32-bit */
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);      /* move converted pair to high half */
    __c = _mm_unpacklo_pi16(__a, __b);  /* sign-extend lower two */
    __r = _mm_cvtpi32_ps(__r, __c);

    return __r;
}

/* Convert 4 packed unsigned 16-bit ints to 4 packed floats
   (same as above but zero-extended). */
static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;

    __b = _mm_setzero_si64();
    __c = _mm_unpackhi_pi16(__a, __b);  /* zero-extend upper two */
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);  /* zero-extend lower two */
    __r = _mm_cvtpi32_ps(__r, __c);

    return __r;
}

/* Convert the low 4 packed signed 8-bit ints to 4 packed floats. */
static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi8(__b, __a);      /* sign mask */
    __b = _mm_unpacklo_pi8(__a, __b);   /* sign-extend to 16-bit */

    return _mm_cvtpi16_ps(__b);
}

/* Convert the low 4 packed unsigned 8-bit ints to 4 packed floats. */
static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
{
    __m64 __b;

    __b = _mm_setzero_si64();
    __b = _mm_unpacklo_pi8(__a, __b);   /* zero-extend to 16-bit */

    return _mm_cvtpi16_ps(__b);
}

/* Convert two __m64 vectors of 2 signed 32-bit ints into 4 packed floats:
   result = { __a[0], __a[1], __b[0], __b[1] }. */
static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
    __m128 __c;

    __c = _mm_setzero_ps();
    __c = _mm_cvtpi32_ps(__c, __b);
    __c = _mm_movelh_ps(__c, __c);      /* __b pair into high half */

    return _mm_cvtpi32_ps(__c, __a);    /* __a pair into low half */
}

/* Convert 4 packed floats to 4 packed signed 16-bit ints (with saturation). */
static __inline __m64 _mm_cvtps_pi16(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi32(__a);
    __a = _mm_movehl_ps(__a, __a);      /* bring upper pair down */
    __c = _mm_cvtps_pi32(__a);

    return _mm_packs_pi32(__b, __c);
}

/* Convert 4 packed floats to signed 8-bit ints in the low 32 bits
   (with saturation); upper 32 bits are zero. */
static __inline __m64 _mm_cvtps_pi8(__m128 __a)
{
    __m64 __b, __c;

    __b = _mm_cvtps_pi16(__a);
    __c = _mm_setzero_si64();

    return _mm_packs_pi16(__b, __c);
}

#endif /* _M_IX86 */
365 
366 /* Transpose the 4x4 matrix composed of row[0-3]. */
367 #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
368 do { \
369  __m128 t0 = _mm_unpacklo_ps(row0, row1); \
370  __m128 t1 = _mm_unpacklo_ps(row2, row3); \
371  __m128 t2 = _mm_unpackhi_ps(row0, row1); \
372  __m128 t3 = _mm_unpackhi_ps(row2, row3); \
373  (row0) = _mm_movelh_ps(t0, t1); \
374  (row1) = _mm_movehl_ps(t1, t0); \
375  (row2) = _mm_movelh_ps(t2, t3); \
376  (row3) = _mm_movehl_ps(t3, t2); \
377 } while (0)
378 
379 #define _MM_GET_EXCEPTION_STATE() \
380  (_mm_getcsr() & _MM_EXCEPT_MASK)
381 
382 #define _MM_GET_EXCEPTION_MASK() \
383  (_mm_getcsr() & _MM_MASK_MASK)
384 
385 #define _MM_GET_ROUNDING_MODE() \
386  (_mm_getcsr() & _MM_ROUND_MASK)
387 
388 #define _MM_GET_FLUSH_ZERO_MODE() \
389  (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
390 
391 #define _MM_SET_EXCEPTION_STATE(__mask) \
392  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))
393 
394 #define _MM_SET_EXCEPTION_MASK(__mask) \
395  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))
396 
397 #define _MM_SET_ROUNDING_MODE(__mode) \
398  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))
399 
400 #define _MM_SET_FLUSH_ZERO_MODE(__mode) \
401  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))
402 
403 /* Use intrinsics on MSVC */
404 #if defined(_MSC_VER) && !defined(__clang__)
405 #pragma intrinsic(_mm_prefetch)
406 #pragma intrinsic(_mm_setzero_ps)
407 #pragma intrinsic(_mm_add_ss)
408 #pragma intrinsic(_mm_sub_ss)
409 #pragma intrinsic(_mm_mul_ss)
410 #pragma intrinsic(_mm_div_ss)
411 #pragma intrinsic(_mm_sqrt_ss)
412 #pragma intrinsic(_mm_rcp_ss)
413 #pragma intrinsic(_mm_rsqrt_ss)
414 #pragma intrinsic(_mm_min_ss)
415 #pragma intrinsic(_mm_max_ss)
416 #pragma intrinsic(_mm_add_ps)
417 #pragma intrinsic(_mm_sub_ps)
418 #pragma intrinsic(_mm_mul_ps)
419 #pragma intrinsic(_mm_div_ps)
420 #pragma intrinsic(_mm_sqrt_ps)
421 #pragma intrinsic(_mm_rcp_ps)
422 #pragma intrinsic(_mm_rsqrt_ps)
423 #pragma intrinsic(_mm_min_ps)
424 #pragma intrinsic(_mm_max_ps)
425 #pragma intrinsic(_mm_and_ps)
426 #pragma intrinsic(_mm_andnot_ps)
427 #pragma intrinsic(_mm_or_ps)
428 #pragma intrinsic(_mm_xor_ps)
429 #pragma intrinsic(_mm_cmpeq_ss)
430 #pragma intrinsic(_mm_cmplt_ss)
431 #pragma intrinsic(_mm_cmple_ss)
432 #pragma intrinsic(_mm_cmpgt_ss)
433 #pragma intrinsic(_mm_cmpge_ss)
434 #pragma intrinsic(_mm_cmpneq_ss)
435 #pragma intrinsic(_mm_cmpnlt_ss)
436 #pragma intrinsic(_mm_cmpnle_ss)
437 #pragma intrinsic(_mm_cmpngt_ss)
438 #pragma intrinsic(_mm_cmpnge_ss)
439 #pragma intrinsic(_mm_cmpord_ss)
440 #pragma intrinsic(_mm_cmpunord_ss)
441 #pragma intrinsic(_mm_cmpeq_ps)
442 #pragma intrinsic(_mm_cmplt_ps)
443 #pragma intrinsic(_mm_cmple_ps)
444 #pragma intrinsic(_mm_cmpgt_ps)
445 #pragma intrinsic(_mm_cmpge_ps)
446 #pragma intrinsic(_mm_cmpneq_ps)
447 #pragma intrinsic(_mm_cmpnlt_ps)
448 #pragma intrinsic(_mm_cmpnle_ps)
449 #pragma intrinsic(_mm_cmpngt_ps)
450 #pragma intrinsic(_mm_cmpnge_ps)
451 #pragma intrinsic(_mm_cmpord_ps)
452 #pragma intrinsic(_mm_cmpunord_ps)
453 #pragma intrinsic(_mm_comieq_ss)
454 #pragma intrinsic(_mm_comilt_ss)
455 #pragma intrinsic(_mm_comile_ss)
456 #pragma intrinsic(_mm_comigt_ss)
457 #pragma intrinsic(_mm_comige_ss)
458 #pragma intrinsic(_mm_comineq_ss)
459 #pragma intrinsic(_mm_ucomieq_ss)
460 #pragma intrinsic(_mm_ucomilt_ss)
461 #pragma intrinsic(_mm_ucomile_ss)
462 #pragma intrinsic(_mm_ucomigt_ss)
463 #pragma intrinsic(_mm_ucomige_ss)
464 #pragma intrinsic(_mm_ucomineq_ss)
465 #pragma intrinsic(_mm_cvt_ss2si)
466 #pragma intrinsic(_mm_cvtt_ss2si)
467 #pragma intrinsic(_mm_cvt_si2ss)
468 #ifdef _M_IX86
469 #pragma intrinsic(_mm_cvt_ps2pi)
470 #pragma intrinsic(_mm_cvtt_ps2pi)
471 #pragma intrinsic(_mm_cvt_pi2ps)
472 #endif // _M_IX86
473 #pragma intrinsic(_mm_shuffle_ps)
474 #pragma intrinsic(_mm_unpackhi_ps)
475 #pragma intrinsic(_mm_unpacklo_ps)
476 #pragma intrinsic(_mm_loadh_pi)
477 #pragma intrinsic(_mm_storeh_pi)
478 #pragma intrinsic(_mm_movehl_ps)
479 #pragma intrinsic(_mm_movelh_ps)
480 #pragma intrinsic(_mm_loadl_pi)
481 #pragma intrinsic(_mm_storel_pi)
482 #pragma intrinsic(_mm_movemask_ps)
483 #pragma intrinsic(_mm_getcsr)
484 #pragma intrinsic(_mm_setcsr)
485 #pragma intrinsic(_mm_set_ss)
486 #pragma intrinsic(_mm_set_ps1)
487 #pragma intrinsic(_mm_load_ss)
488 #pragma intrinsic(_mm_load_ps1)
489 #pragma intrinsic(_mm_load_ps)
490 #pragma intrinsic(_mm_loadu_ps)
491 #pragma intrinsic(_mm_loadr_ps)
492 #pragma intrinsic(_mm_set_ps)
493 #pragma intrinsic(_mm_setr_ps)
494 #pragma intrinsic(_mm_store_ss)
495 #pragma intrinsic(_mm_cvtss_f32)
496 #pragma intrinsic(_mm_store_ps)
497 #pragma intrinsic(_mm_storeu_ps)
498 #pragma intrinsic(_mm_store_ps1)
499 #pragma intrinsic(_mm_storer_ps)
500 #pragma intrinsic(_mm_move_ss)
501 #ifdef _M_IX86
502 #pragma intrinsic(_m_pextrw)
503 #pragma intrinsic(_m_pinsrw)
504 #pragma intrinsic(_m_pmaxsw)
505 #pragma intrinsic(_m_pmaxub)
506 #pragma intrinsic(_m_pminsw)
507 #pragma intrinsic(_m_pminub)
508 #pragma intrinsic(_m_pmovmskb)
509 #pragma intrinsic(_m_pmulhuw)
510 #pragma intrinsic(_m_pshufw)
511 #pragma intrinsic(_m_maskmovq)
512 #pragma intrinsic(_m_pavgb)
513 #pragma intrinsic(_m_pavgw)
514 #pragma intrinsic(_m_psadbw)
515 #pragma intrinsic(_mm_stream_pi)
516 #endif // _M_IX86
517 #pragma intrinsic(_mm_stream_ps)
518 #pragma intrinsic(_mm_sfence)
519 #ifdef _M_AMD64
520 #pragma intrinsic(_mm_cvtss_si64)
521 #pragma intrinsic(_mm_cvttss_si64)
522 #pragma intrinsic(_mm_cvtsi64_ss)
523 #endif // _M_AMD64
524 
525 #else /* _MSC_VER */
526 
527 /*
528  GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h
529  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h
530 */
531 
532 /* Use inline functions on GCC/Clang */
533 
534 #if !HAS_BUILTIN(_mm_getcsr)
535 __INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
536 {
537  return __builtin_ia32_stmxcsr();
538 }
539 #endif
540 
541 #if !HAS_BUILTIN(_mm_setcsr)
542 __INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
543 {
544  __builtin_ia32_ldmxcsr(a);
545 }
546 #endif
547 
548 __INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b)
549 {
550  __a[0] += __b[0];
551  return __a;
552 }
553 
554 __INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b)
555 {
556  return (__m128)((__v4sf)__a + (__v4sf)__b);
557 }
558 
559 __INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b)
560 {
561  __a[0] -= __b[0];
562  return __a;
563 }
564 
565 __INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b)
566 {
567  return (__m128)((__v4sf)__a - (__v4sf)__b);
568 }
569 
570 __INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b)
571 {
572  __a[0] *= __b[0];
573  return __a;
574 }
575 
576 __INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b)
577 {
578  return (__m128)((__v4sf)__a * (__v4sf)__b);
579 }
580 
581 __INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b)
582 {
583  __a[0] /= __b[0];
584  return __a;
585 }
586 
587 __INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b)
588 {
589  return (__m128)((__v4sf)__a / (__v4sf)__b);
590 }
591 
593 {
594  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
595 }
596 
598 {
599  return __builtin_ia32_sqrtps((__v4sf)__a);
600 }
601 
602 __INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a)
603 {
604  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
605 }
606 
607 __INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a)
608 {
609  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
610 }
611 
613 {
614  return __builtin_ia32_rsqrtss((__v4sf)__a);
615 }
616 
618 {
619  return __builtin_ia32_rsqrtps((__v4sf)__a);
620 }
621 
622 __INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b)
623 {
624  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
625 }
626 
627 __INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b)
628 {
629  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
630 }
631 
632 __INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b)
633 {
634  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
635 }
636 
637 __INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b)
638 {
639  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
640 }
641 
642 __INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b)
643 {
644  return (__m128)((__v4su)__a & (__v4su)__b);
645 }
646 
647 __INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b)
648 {
649  return (__m128)(~(__v4su)__a & (__v4su)__b);
650 }
651 
652 __INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b)
653 {
654  return (__m128)((__v4su)__a | (__v4su)__b);
655 }
656 
657 __INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b)
658 {
659  return (__m128)((__v4su)__a ^ (__v4su)__b);
660 }
661 
662 __INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b)
663 {
664  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
665 }
666 
667 __INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b)
668 {
669  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
670 }
671 
672 __INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b)
673 {
674  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
675 }
676 
677 __INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b)
678 {
679  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
680 }
681 
682 __INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b)
683 {
684  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
685 }
686 
687 __INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b)
688 {
689  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
690 }
691 
692 __INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b)
693 {
694  __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
695 #ifdef __clang__
696  return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
697 #else
698  return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
699 #endif
700 }
701 
702 __INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b)
703 {
704  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
705 }
706 
707 __INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b)
708 {
709  __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
710 #ifdef __clang__
711  return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
712 #else
713  return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
714 #endif
715 }
716 
717 __INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b)
718 {
719  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
720 }
721 
722 __INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b)
723 {
724  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
725 }
726 
727 __INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b)
728 {
729  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
730 }
731 
732 __INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
733 {
734  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
735 }
736 
737 __INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
738 {
739  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
740 }
741 
742 __INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b)
743 {
744  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
745 }
746 
747 __INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b)
748 {
749  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
750 }
751 
752 __INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b)
753 {
754  __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
755 #ifdef __clang__
756  return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
757 #else
758  return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
759 #endif
760 }
761 
762 __INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b)
763 {
764  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
765 }
766 
767 __INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b)
768 {
769  __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
770 #ifdef __clang__
771  return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
772 #else
773  return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
774 #endif
775 }
776 
777 __INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b)
778 {
779  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
780 }
781 
782 __INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b)
783 {
784  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
785 }
786 
787 __INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b)
788 {
789  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
790 }
791 
792 __INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b)
793 {
794  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
795 }
796 
797 __INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b)
798 {
799  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
800 }
801 
802 __INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b)
803 {
804  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
805 }
806 
807 __INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b)
808 {
809  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
810 }
811 
812 __INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b)
813 {
814  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
815 }
816 
817 __INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b)
818 {
819  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
820 }
821 
822 __INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b)
823 {
824  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
825 }
826 
827 __INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b)
828 {
829  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
830 }
831 
832 __INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b)
833 {
834  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
835 }
836 
837 __INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b)
838 {
839  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
840 }
841 
842 __INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b)
843 {
844  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
845 }
846 
847 __INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b)
848 {
849  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
850 }
851 
852 __INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b)
853 {
854  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
855 }
856 
857 __INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b)
858 {
859  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
860 }
861 
862 // _mm_cvt_ss2si
864 {
865  return __builtin_ia32_cvtss2si((__v4sf)__a);
866 }
867 
868 #ifdef _M_AMD64
869 __INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a)
870 {
871  return __builtin_ia32_cvtss2si64((__v4sf)__a);
872 }
873 #endif
874 
875 // _mm_cvt_ps2pi
877 {
878  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
879 }
880 
881 // _mm_cvtt_ss2si
883 {
884  return __builtin_ia32_cvttss2si((__v4sf)__a);
885 }
886 
887 #ifdef _M_AMD64
888 __INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a)
889 {
890  return __builtin_ia32_cvttss2si64((__v4sf)__a);
891 }
892 #endif
893 
894 // _mm_cvtt_ps2pi
896 {
897  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
898 }
899 
900 // _mm_cvt_si2ss
901 __INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b)
902 {
903  __a[0] = __b;
904  return __a;
905 }
906 
907 #ifdef _M_AMD64
908 __INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b)
909 {
910  __a[0] = __b;
911  return __a;
912 }
913 #endif
914 
915 // _mm_cvt_pi2ps
916 __INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
917 {
918  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
919 }
920 
922 {
923  return __a[0];
924 }
925 
926 __INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p)
927 {
928 #ifdef __clang__
929  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
930  struct __mm_loadh_pi_struct {
931  __mm_loadh_pi_v2f32 __u;
932  } __attribute__((__packed__, __may_alias__));
933  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
934  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
935  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
936 #else
937  return (__m128)__builtin_ia32_loadhps(__a, __p);
938 #endif
939 }
940 
941 __INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p)
942 {
943 #ifdef __clang__
944  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
945  struct __mm_loadl_pi_struct {
946  __mm_loadl_pi_v2f32 __u;
947  } __attribute__((__packed__, __may_alias__));
948  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
949  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
950  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
951 #else
952  return (__m128)__builtin_ia32_loadlps(__a, __p);
953 #endif
954 }
955 
956 __INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p)
957 {
958  return _mm_set_ss(*__p);
959 }
960 
961 // _mm_load_ps1
962 __INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p)
963 {
964  return _mm_set1_ps(*__p);
965 }
966 
967 __INTRIN_INLINE_SSE __m128 _mm_load_ps(const float *__p)
968 {
969  return *(const __m128*)__p;
970 }
971 
972 __INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p)
973 {
974  struct __loadu_ps {
975  __m128_u __v;
976  } __attribute__((__packed__, __may_alias__));
977  return ((const struct __loadu_ps*)__p)->__v;
978 }
979 
980 __INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p)
981 {
982  __m128 __a = _mm_load_ps(__p);
983 #ifdef __clang__
984  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
985 #else
986  return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
987 #endif
988 }
989 
991 {
992 #ifdef __clang__
993  return (__m128)__builtin_ia32_undef128();
994 #else
995  __m128 undef = undef;
996  return undef;
997 #endif
998 }
999 
1001 {
1002  return __extension__ (__m128){ __w, 0, 0, 0 };
1003 }
1004 
1005 // _mm_set_ps1
1007 {
1008  return __extension__ (__m128){ __w, __w, __w, __w };
1009 }
1010 
1011 __INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w)
1012 {
1013  return __extension__ (__m128){ __w, __x, __y, __z };
1014 }
1015 
1016 __INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w)
1017 {
1018  return __extension__ (__m128){ __z, __y, __x, __w };
1019 }
1020 
1022 {
1023  return __extension__ (__m128){ 0, 0, 0, 0 };
1024 }
1025 
1026 __INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a)
1027 {
1028 #ifdef __clang__
1029  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1030  struct __mm_storeh_pi_struct {
1031  __mm_storeh_pi_v2f32 __u;
1032  } __attribute__((__packed__, __may_alias__));
1033  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1034 #else
1035  __builtin_ia32_storehps(__p, __a);
1036 #endif
1037 }
1038 
1039 __INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a)
1040 {
1041 #ifdef __clang__
1042  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1043  struct __mm_storeh_pi_struct {
1044  __mm_storeh_pi_v2f32 __u;
1045  } __attribute__((__packed__, __may_alias__));
1046  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1047 #else
1048  __builtin_ia32_storelps(__p, __a);
1049 #endif
1050 }
1051 
1052 __INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a)
1053 {
1054  *__p = ((__v4sf)__a)[0];
1055 }
1056 
1057 __INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a)
1058 {
1059  *(__m128_u *)__p = __a;
1060 }
1061 
1062 __INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a)
1063 {
1064  *(__m128*)__p = __a;
1065 }
1066 
// _mm_store_ps1
// Broadcasts the lowest element of __a to all four lanes and stores the
// result to __p (16-byte aligned, via _mm_store_ps).
__INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a)
{
    // FIXME: Should we use a temp instead?
#ifdef __clang__
    __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
#else
    __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));
#endif
    _mm_store_ps(__p, __a);
}
1078 
// Stores the four floats of __a to __p (16-byte aligned) in reverse element
// order: element 3 lands at __p[0], element 0 at __p[3].
__INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
    _mm_store_ps(__p, __tmp);
}
1088 
/* GCC / Clang specific constants.
   The MS hint values (_MM_HINT_NTA etc.) differ from the encoding
   __builtin_prefetch expects; _MM_HINT_MS_TO_ALT translates between them. */
#define _MM_HINT_NTA_ALT 0
#define _MM_HINT_T0_ALT 3
#define _MM_HINT_T1_ALT 2
#define _MM_HINT_T2_ALT 1
#define _MM_HINT_ENTA_ALT 4

// These are not supported yet
//#define _MM_HINT_ET0_ALT 7
//#define _MM_HINT_ET1_ALT 6
//#define _MM_HINT_ET2_ALT 5

// Maps an MS-style hint to the GCC/Clang value; unknown hints map to 0 (NTA).
#define _MM_HINT_MS_TO_ALT(sel) \
 (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
  ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
  ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
  ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
  ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)

// NOTE(review): "_MSC_VER1" is never defined, so this branch is dead and
// even clang-cl falls through to the __builtin_prefetch path below. It looks
// like a typo for "_MSC_VER" — confirm whether disabling it was intentional.
#ifdef _MSC_VER1

/* On clang-cl we have an intrinsic, but the constants are different */
#pragma intrinsic(_mm_prefetch)
#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))

#else /* _MSC_VER */

// __builtin_prefetch(addr, rw, locality): bit 2 of the ALT value selects the
// rw argument, bits 0-1 the locality level.
#define _mm_prefetch(p, sel) \
 __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)

#endif /* _MSC_VER */
1120 
// Non-temporal (cache-bypassing) store of the __m64 value __a to *__p
// (MOVNTQ, per the builtin name).
__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
{
#ifdef __clang__
    __builtin_ia32_movntq((__v1di*)__p, __a);
#else
    // GCC's builtin takes integer pointer/value arguments, hence the casts.
    __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);
#endif
}

// Non-temporal store of the four floats of __a to __p (MOVNTPS); __p must
// be 16-byte aligned.
__INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
#else
    __builtin_ia32_movntps(__p, (__v4sf)__a);
#endif
}
1138 
1139 #if !HAS_BUILTIN(_mm_sfence)
1141 {
1142  __builtin_ia32_sfence();
1143 }
1144 #endif
1145 
#ifdef __clang__
// On clang these are macros so the lane index reaches the vec_ext/vec_set
// builtins as a compile-time constant.
#define _m_pextrw(a, n) \
    ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))

#define _m_pinsrw(a, d, n) \
    ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
#else
// _m_pextrw
// Extracts 16-bit word __n of __a, zero-extended to int (note the
// unsigned short cast before the implicit widening).
__INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n)
{
    return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
}

// _m_pinsrw
// Returns __a with 16-bit word __n replaced by the low 16 bits of __d.
__INTRIN_INLINE_SSE __m64 _mm_insert_pi16 (__m64 const __a, int const __d, int const __n)
{
    return (__m64)__builtin_ia32_vec_set_v4hi ((__v4hi)__a, __d, __n);
}

#endif
1166 
// _m_pmaxsw
// Per-element maximum of four signed 16-bit words.
__INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pmaxub
// Per-element maximum of eight unsigned 8-bit bytes.
__INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

// _m_pminsw
// Per-element minimum of four signed 16-bit words.
__INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pminub
// Per-element minimum of eight unsigned 8-bit bytes.
__INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}
1190 
1191 // _m_pmovmskb
1193 {
1194  return __builtin_ia32_pmovmskb((__v8qi)__a);
1195 }
1196 
// _m_pmulhuw
// Per-element high 16 bits of the unsigned 16x16 -> 32-bit products of
// __a and __b.
__INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}
1202 
#ifdef __clang__
// Macro so the shuffle immediate reaches the builtin as a compile-time
// constant.
#define _m_pshufw(a, n) \
    ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
#else
// _m_pshufw
// Shuffles the four 16-bit words of __a according to the immediate __n.
// NOTE(review): marked __INTRIN_INLINE_MMX while the surrounding helpers
// use __INTRIN_INLINE_SSE — confirm that is intentional.
__INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16 (__m64 __a, int const __n)
{
    return (__m64) __builtin_ia32_pshufw ((__v4hi)__a, __n);
}
#endif
1213 
// _m_maskmovq
// Byte-masked store of __d to __p (MASKMOVQ, per the builtin name): per
// Intel's semantics, only bytes whose mask byte in __n has its most
// significant bit set are written.
__INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
    __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}
1219 
// _m_pavgb
// Per-element unsigned average of eight 8-bit bytes (PAVGB).
__INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

// _m_pavgw
// Per-element unsigned average of four 16-bit words (PAVGW).
__INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

// _m_psadbw
// Sum of absolute differences of the eight unsigned bytes of __a and __b
// (PSADBW).
__INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}
1237 
1238 #endif // __GNUC__
1239 
1240 #ifdef __cplusplus
1241 }
1242 #endif // __cplusplus
1243 
1244 #endif /* _INCLUDED_MM2 */
__m128 _mm_unpackhi_ps(__m128 a, __m128 b)
__m128 _mm_cmple_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:687
#define _mm_avg_pu16
Definition: xmmintrin.h:265
__m128 _mm_rcp_ps(__m128 a)
Definition: xmmintrin.h:607
__m128 _mm_set_ps1(float a)
__m128 _mm_cmpgt_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:702
__m128 _mm_min_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:622
#define _mm_movemask_pi8
Definition: xmmintrin.h:260
#define __ATTRIBUTE_SSE__
Definition: xmmintrin.h:68
void _mm_stream_ps(float *p, __m128 a)
Definition: xmmintrin.h:1130
__m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:737
int _mm_comineq_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:827
__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
Definition: xmmintrin.h:1121
return __n
Definition: _algo.h:75
unsigned int _mm_getcsr(void)
Definition: xmmintrin.h:535
#define __cdecl
Definition: accygwin.h:79
void _mm_storer_ps(float *p, __m128 a)
Definition: xmmintrin.h:1079
__m128 _mm_cmpnle_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:742
void _mm_setcsr(unsigned int a)
Definition: xmmintrin.h:542
__m128 _mm_mul_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:570
#define _DECLSPEC_INTRIN_TYPE
Definition: _mingw.h:234
__m128 _mm_loadu_ps(float const *p)
Definition: xmmintrin.h:972
__m128 _mm_mul_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:576
__m128 _mm_load_ps1(float const *p)
#define _mm_load1_ps
Definition: xmmintrin.h:249
__m128 _mm_cmpunord_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:797
void _mm_storel_pi(__m64 *p, __m128 a)
Definition: xmmintrin.h:1039
__m128 _mm_cmpeq_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:667
IN PVOID IN PVOID IN USHORT IN USHORT Size
Definition: pci.h:361
#define _mm_insert_pi16
Definition: xmmintrin.h:255
__m128 _mm_setr_ps(float e3, float e2, float e1, float e0)
Definition: xmmintrin.h:1016
__m128 _mm_loadh_pi(__m128 a, __m64 const *p)
Definition: xmmintrin.h:926
#define _mm_free(a)
Definition: malloc.h:63
__m128 _mm_cmpnle_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:747
#define _mm_cvtss_si32
Definition: xmmintrin.h:245
__m128 _mm_load_ss(float const *p)
Definition: xmmintrin.h:956
#define _mm_cvtpi32_ps
Definition: xmmintrin.h:253
__m128 _mm_cmpngt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:752
#define _mm_shuffle_pi16
Definition: xmmintrin.h:262
__m128 _mm_cmpge_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:717
#define __INTRIN_INLINE_SSE
Definition: xmmintrin.h:70
#define _mm_malloc(a, b)
Definition: malloc.h:64
void _mm_store_ps(float *p, __m128 a)
Definition: xmmintrin.h:1062
__m128 _mm_load_ps(float const *p)
Definition: xmmintrin.h:967
__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8)
#define _mm_prefetch(p, sel)
Definition: xmmintrin.h:1116
#define __c
Definition: schilyio.h:209
void _mm_sfence(void)
Definition: xmmintrin.h:1140
void _mm_storeh_pi(__m64 *p, __m128 a)
Definition: xmmintrin.h:1026
__m128 _mm_loadr_ps(float const *p)
Definition: xmmintrin.h:980
__m128 _mm_cmpneq_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:722
__m128 _mm_rsqrt_ss(__m128 a)
Definition: xmmintrin.h:612
#define _mm_cvtsi32_ss
Definition: xmmintrin.h:247
void _mm_store_ss(float *p, __m128 a)
Definition: xmmintrin.h:1052
#define _mm_sad_pu8
Definition: xmmintrin.h:266
__m128 _mm_min_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:627
__m128 _mm_cmpnge_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:777
__m128 _mm_movehl_ps(__m128 a, __m128 b)
int _mm_ucomieq_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:832
__INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void)
Definition: xmmintrin.h:990
__m128 _mm_xor_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:657
__m128 _mm_sub_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:565
#define _In_
Definition: ms_sal.h:308
__m128 _mm_or_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:652
#define _mm_cvttss_si32
Definition: xmmintrin.h:246
__m128 _mm_cmpord_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:782
__m128 _mm_set_ps(float e3, float e2, float e1, float e0)
Definition: xmmintrin.h:1011
int _mm_comile_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:812
#define _mm_maskmove_si64
Definition: xmmintrin.h:263
__m128 _mm_cvt_si2ss(__m128 a, int b)
__m128 _mm_cmplt_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:677
#define _mm_min_pu8
Definition: xmmintrin.h:259
__m128 _mm_sub_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:559
__m128 _mm_cmpunord_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:792
__m128 _mm_cmplt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:672
GLboolean GLboolean GLboolean b
Definition: glext.h:6204
__m128 _mm_add_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:548
int _mm_movemask_ps(__m128 a)
__m128 _mm_div_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:581
#define _mm_min_pi16
Definition: xmmintrin.h:258
#define _mm_max_pu8
Definition: xmmintrin.h:257
#define _mm_extract_pi16
Definition: xmmintrin.h:254
#define _mm_avg_pu8
Definition: xmmintrin.h:264
__m128 _mm_rsqrt_ps(__m128 a)
Definition: xmmintrin.h:617
#define _mm_mulhi_pu16
Definition: xmmintrin.h:261
__m128 _mm_set_ss(float a)
Definition: xmmintrin.h:1000
#define _CRT_ALIGN(x)
Definition: crtdefs.h:154
__m128 _mm_cmpord_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:787
#define _mm_cvttps_pi32
Definition: xmmintrin.h:252
__m128 _mm_add_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:554
int _mm_ucomineq_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:857
#define _mm_set1_ps
Definition: xmmintrin.h:248
__m128 _mm_max_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:632
#define _mm_cvtps_pi32
Definition: xmmintrin.h:251
int _mm_comigt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:817
int _mm_cvtt_ss2si(__m128 a)
#define P(row, col)
__m128 _mm_div_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:587
int _mm_cvt_ss2si(__m128 a)
#define _mm_max_pi16
Definition: xmmintrin.h:256
GLsizei GLenum const GLvoid GLsizei GLenum GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLint GLint GLint GLshort GLshort GLshort GLubyte GLubyte GLubyte GLuint GLuint GLuint GLushort GLushort GLushort GLbyte GLbyte GLbyte GLbyte GLdouble GLdouble GLdouble GLdouble GLfloat GLfloat GLfloat GLfloat GLint GLint GLint GLint GLshort GLshort GLshort GLshort GLubyte GLubyte GLubyte GLubyte GLuint GLuint GLuint GLuint GLushort GLushort GLushort GLushort GLboolean const GLdouble const GLfloat const GLint const GLshort const GLbyte const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLdouble const GLfloat const GLfloat const GLint const GLint const GLshort const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort const GLdouble const GLfloat const GLint const GLshort GLenum GLenum GLenum GLfloat GLenum GLint GLenum GLenum GLenum GLfloat GLenum GLenum GLint GLenum GLfloat GLenum GLint GLint GLushort GLenum GLenum GLfloat GLenum GLenum GLint GLfloat const GLubyte GLenum GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLint GLint GLsizei GLsizei GLint GLenum GLenum const GLvoid GLenum GLenum const GLfloat GLenum GLenum const GLint GLenum GLenum const GLdouble GLenum GLenum const GLfloat GLenum GLenum const GLint GLsizei GLuint GLfloat GLuint GLbitfield GLfloat GLint GLuint GLboolean GLenum GLfloat GLenum GLbitfield GLenum GLfloat GLfloat GLint GLint const GLfloat GLenum GLfloat GLfloat GLint GLint GLfloat GLfloat GLint GLint const GLfloat GLint GLfloat GLfloat GLint 
GLfloat GLfloat GLint GLfloat GLfloat const GLdouble const GLfloat const GLdouble const GLfloat GLint i
Definition: glfuncs.h:248
#define __int32
Definition: basetyps.h:19
__m128 _mm_max_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:637
static calc_node_t temp
Definition: rpn_ieee.c:38
#define __INTRIN_INLINE_MMX
Definition: mmintrin.h:64
#define _mm_store1_ps
Definition: xmmintrin.h:250
__m128 _mm_rcp_ss(__m128 a)
Definition: xmmintrin.h:602
int _mm_comieq_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:802
__m128 _mm_cmpgt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:692
__m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:732
__m128 _mm_setzero_ps(void)
Definition: xmmintrin.h:1021
__m128 _mm_unpacklo_ps(__m128 a, __m128 b)
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: xmmintrin.h:89
float __v4sf __attribute__((__vector_size__(16)))
Definition: xmmintrin.h:58
int _mm_ucomile_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:842
__m128 _mm_andnot_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:647
__m128 _mm_cmple_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:682
__m128 _mm_cmpge_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:707
int _mm_comilt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:807
__m128 _mm_cmpeq_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:662
int _mm_comige_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:822
void _mm_store_ps1(float *p, __m128 a)
int _mm_ucomige_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:852
float _mm_cvtss_f32(__m128 a)
Definition: xmmintrin.h:921
__m128 _mm_cmpngt_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:762
#define __int8
Definition: basetyps.h:25
int _mm_ucomilt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:837
__m128 _mm_and_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:642
GLboolean GLboolean GLboolean GLboolean a
Definition: glext.h:6204
__m128 _mm_cmpneq_ps(__m128 a, __m128 b)
Definition: xmmintrin.h:727
GLfloat GLfloat p
Definition: glext.h:8902
__m128 _mm_cmpnge_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:767
__m128 _mm_loadl_pi(__m128 a, __m64 const *p)
Definition: xmmintrin.h:941
__m128 _mm_sqrt_ps(__m128 a)
Definition: xmmintrin.h:597
void _mm_storeu_ps(float *p, __m128 a)
Definition: xmmintrin.h:1057
#define __int64
Definition: basetyps.h:16
__m128 _mm_sqrt_ss(__m128 a)
Definition: xmmintrin.h:592
int _mm_ucomigt_ss(__m128 a, __m128 b)
Definition: xmmintrin.h:847
#define __int16
Definition: basetyps.h:22
__m128 _mm_movelh_ps(__m128 a, __m128 b)
__m128 _mm_move_ss(__m128 a, __m128 b)