#ifndef __VM_SSEFunc__
#define __VM_SSEFunc__

#pragma warning(disable:4799)

#define CPU_HAS_SIMD_INSTR 1
#define VM_SSE_STYLE 1
#include <emmintrin.h>

typedef __m128 v4sf;
typedef __m128i v4si;

#if defined(__SSE4_1__) || defined(_MSC_VER)
#define VM_SSE41_STYLE 1
#include <smmintrin.h>
#endif
#define V4SF(A) _mm_castsi128_ps(A)
#define V4SI(A) _mm_castps_si128(A)

#define VM_SHUFFLE_MASK(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))
template <int mask>
static SYS_FORCE_INLINE v4sf
vm_shuffle(const v4sf &a, const v4sf &b)
{
    return _mm_shuffle_ps(a, b, mask);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a, const T &b)
{
    return vm_shuffle<VM_SHUFFLE_MASK(A,B,C,D)>(a, b);
}
template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<A,B,C,D>(a, a);
}
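// Example (illustrative): with the overloads above, vm_shuffle<A,B,C,D>(a, b)
// yields (a[A], a[B], b[C], b[D]), and the single-argument forms permute one
// vector with itself.
//
//     v4sf xxxx = vm_shuffle<0,0,0,0>(a);     // broadcast lane 0 of a
//     v4sf mix  = vm_shuffle<0,1,2,3>(a, b);  // (a0, a1, b2, b3)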
#if defined(VM_SSE41_STYLE)

static SYS_FORCE_INLINE v4si
vm_insert(const v4si v, int32 a, int n)
{
    switch (n)
    {
        case 0: return _mm_insert_epi32(v, a, 0);
        case 1: return _mm_insert_epi32(v, a, 1);
        case 2: return _mm_insert_epi32(v, a, 2);
        case 3: return _mm_insert_epi32(v, a, 3);
    }
    return v;
}

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    switch (n)
    {
        case 0: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,0,0));
        case 1: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,1,0));
        case 2: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,2,0));
        case 3: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,3,0));
    }
    return v;
}
static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    switch (n)
    {
        case 0: return _mm_extract_epi32(v, 0);
        case 1: return _mm_extract_epi32(v, 1);
        case 2: return _mm_extract_epi32(v, 2);
        case 3: return _mm_extract_epi32(v, 3);
    }
    return 0;
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    // _mm_extract_ps() returns the lane's raw bits as an int, so reinterpret
    // them through a union before returning the float value.
    union { int32 ival; float fval; } tmp;
    switch (n)
    {
        case 0: tmp.ival = _mm_extract_ps(v, 0); break;
        case 1: tmp.ival = _mm_extract_ps(v, 1); break;
        case 2: tmp.ival = _mm_extract_ps(v, 2); break;
        case 3: tmp.ival = _mm_extract_ps(v, 3); break;
    }
    return tmp.fval;
}
#else // !VM_SSE41_STYLE

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    comp[n] = a;
    return vector;
}

static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    union { v4si vector; int32 comp[4]; };
    vector = v;
    return comp[n];
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    return comp[n];
}

#endif // VM_SSE41_STYLE
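// Example (illustrative): whichever path is compiled in, vm_insert() returns a
// copy of the vector with one lane replaced and vm_extract() reads a lane back.
//
//     v4sf v = vm_splats(1.0f);
//     v = vm_insert(v, 5.0f, 2);          // lanes become (1, 1, 5, 1)
//     float third = vm_extract(v, 2);     // 5.0f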
static SYS_FORCE_INLINE v4sf
vm_splats(float a)
{
    return _mm_set1_ps(a);
}

static SYS_FORCE_INLINE v4sf
vm_splats(float a, float b, float c, float d)
{
    return vm_shuffle<0,2,0,2>(
            vm_shuffle<0>(_mm_set_ss(a), _mm_set_ss(b)),
            vm_shuffle<0>(_mm_set_ss(c), _mm_set_ss(d)));
}
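// How the four-argument splat works: vm_shuffle<0>(x, y) produces
// (x0, x0, y0, y0), so the two inner calls build (a, a, b, b) and (c, c, d, d),
// and the outer vm_shuffle<0,2,0,2> picks lanes 0 and 2 of each, giving
// (a, b, c, d).  Example (illustrative):
//
//     v4sf v = vm_splats(1.0f, 2.0f, 3.0f, 4.0f);   // lanes (1, 2, 3, 4)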
static SYS_FORCE_INLINE v4si
vm_load(const int32 v[4])
{
    return V4SI(_mm_loadu_ps((const float *)v));
}

static SYS_FORCE_INLINE v4sf
vm_load(const float v[4])
{
    return _mm_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store(int32 dst[4], v4si value)
{
    _mm_storeu_si128((__m128i *)dst, value);
}

static SYS_FORCE_INLINE void
vm_store(float dst[4], v4sf value)
{
    _mm_storeu_ps(dst, value);
}
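// Example (illustrative): both overloads use unaligned loads/stores, so plain
// C arrays work without any alignment attributes.
//
//     float buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//     v4sf  v = vm_load(buf);
//     vm_store(buf, _mm_add_ps(v, v));    // buf becomes { 2, 4, 6, 8 }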
static SYS_FORCE_INLINE v4sf vm_negate(v4sf a) { return _mm_sub_ps(_mm_setzero_ps(), a); }
static SYS_FORCE_INLINE v4sf vm_abs(v4sf a) { return _mm_max_ps(a, vm_negate(a)); }
static SYS_FORCE_INLINE v4sf vm_fdiv(v4sf a, v4sf b) { return _mm_mul_ps(a, _mm_rcp_ps(b)); }
static SYS_FORCE_INLINE v4sf vm_fsqrt(v4sf a) { return _mm_rcp_ps(_mm_rsqrt_ps(a)); }
static SYS_FORCE_INLINE v4sf vm_madd(v4sf a, v4sf b, v4sf c) { return _mm_add_ps(_mm_mul_ps(a, b), c); }
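// Accuracy note: vm_fdiv() and vm_fsqrt() are built on the approximate
// _mm_rcp_ps()/_mm_rsqrt_ps() instructions (roughly 12 bits of precision), so
// they trade accuracy for speed.  Example (illustrative):
//
//     v4sf fast = vm_fdiv(a, b);      // approximate a / b
//     v4sf full = _mm_div_ps(a, b);   // IEEE-accurate a / b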
static const v4si theSSETrue = vm_splats(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits(const v4si &a)
{
    return _mm_movemask_ps(V4SF(_mm_cmpeq_epi32(a, theSSETrue))) == 0xF;
}

static SYS_FORCE_INLINE int
vm_signbits(const v4si &a)
{
    return _mm_movemask_ps(V4SF(a));
}

static SYS_FORCE_INLINE int
vm_signbits(const v4sf &a)
{
    return _mm_movemask_ps(a);
}
#define VM_EXTRACT vm_extract
#define VM_INSERT vm_insert
#define VM_SPLATS vm_splats
#define VM_LOAD vm_load
#define VM_STORE vm_store

#define VM_CMPLT(A,B) V4SI(_mm_cmplt_ps(A,B))
#define VM_CMPLE(A,B) V4SI(_mm_cmple_ps(A,B))
#define VM_CMPGT(A,B) V4SI(_mm_cmpgt_ps(A,B))
#define VM_CMPGE(A,B) V4SI(_mm_cmpge_ps(A,B))
#define VM_CMPEQ(A,B) V4SI(_mm_cmpeq_ps(A,B))
#define VM_CMPNE(A,B) V4SI(_mm_cmpneq_ps(A,B))
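// Example (illustrative): the VM_CMP* macros return per-lane masks (all bits
// set where the comparison holds), which combine with vm_allbits() and
// vm_signbits() for whole-vector tests.
//
//     v4si lt = VM_CMPLT(a, b);        // per-lane a < b
//     bool all_lt = vm_allbits(lt);    // true only if every lane passes
//     int  which  = vm_signbits(lt);   // 4-bit mask of passing lanes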
#define VM_ICMPLT _mm_cmplt_epi32
#define VM_ICMPGT _mm_cmpgt_epi32
#define VM_ICMPEQ _mm_cmpeq_epi32

#define VM_IADD _mm_add_epi32
#define VM_ISUB _mm_sub_epi32
#if defined(VM_SSE41_STYLE)
#define VM_IMUL _mm_mullo_epi32
#endif
#define VM_ADD _mm_add_ps
#define VM_SUB _mm_sub_ps
#define VM_MUL _mm_mul_ps
#define VM_DIV _mm_div_ps
#define VM_SQRT _mm_sqrt_ps
#define VM_ISQRT _mm_rsqrt_ps
#define VM_INVERT _mm_rcp_ps
#define VM_ABS vm_abs

#define VM_FDIV vm_fdiv
#define VM_NEG vm_negate
#define VM_FSQRT vm_fsqrt
#define VM_MADD vm_madd

#define VM_MIN _mm_min_ps
#define VM_MAX _mm_max_ps

#define VM_AND _mm_and_si128
#define VM_ANDNOT _mm_andnot_si128
#define VM_OR _mm_or_si128
#define VM_XOR _mm_xor_si128

#define VM_ALLBITS vm_allbits
#define VM_SIGNBITS vm_signbits

#define VM_SHUFFLE vm_shuffle
#define VM_SSE_ROUND_MASK 0x6000 // 0b110000000000000
#define VM_SSE_ROUND_ZERO 0x6000 // 0b110000000000000 (RZ)
#define VM_SSE_ROUND_UP   0x4000 // 0b100000000000000 (R+)
#define VM_SSE_ROUND_DOWN 0x2000 // 0b010000000000000 (R-)
#define VM_SSE_ROUND_NEAR 0x0000 // 0b000000000000000 (RN)

#define GETROUND() (_mm_getcsr()&VM_SSE_ROUND_MASK)
#define SETROUND(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK)))

#define VM_SSE_ROUND_ZERO _MM_ROUND_TOWARDS_ZERO
#define VM_SSE_ROUND_UP   _MM_ROUND_UP
#define VM_SSE_ROUND_DOWN _MM_ROUND_DOWN
#define VM_SSE_ROUND_NEAR _MM_ROUND_NEAREST

#define GETROUND() _MM_GET_ROUNDING_MODE()
#define SETROUND(x) _MM_SET_ROUNDING_MODE(x)
#define VM_P_FLOOR() uint rounding = GETROUND(); \
                     SETROUND(VM_SSE_ROUND_DOWN);
#define VM_FLOOR _mm_cvtps_epi32
#define VM_INT _mm_cvttps_epi32
#define VM_E_FLOOR() SETROUND(rounding);
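// Example (illustrative): VM_FLOOR converts with the current rounding mode, so
// it is meant to be bracketed by VM_P_FLOOR()/VM_E_FLOOR(), which save the
// MXCSR rounding bits, force round-down, and restore them afterwards.
//
//     VM_P_FLOOR();
//     v4si f = VM_FLOOR(values);   // per-lane floor() as int32
//     VM_E_FLOOR();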
#define VM_IFLOAT _mm_cvtepi32_ps

#define VM_SHIFTLEFT(A,C) _mm_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
#define VM_SHIFTRIGHT(A,C) _mm_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
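// Example (illustrative): the shift count is passed through the low lanes of a
// vector, so C may be a runtime value rather than a compile-time constant.
//
//     v4si doubled = VM_SHIFTLEFT(v, 1);   // each lane multiplied by 2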
#define _PS_CONST(Name, Val) \
    static const SYS_ALIGN16 float _ps_##Name[4] = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
    static const SYS_ALIGN16 int _pi32_##Name[4] = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
    static const SYS_ALIGN16 Type _ps_##Name[4] = { Val, Val, Val, Val }
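// Expansion example (illustrative): _PS_CONST(1, 1.0f) declares a 16-byte
// aligned constant _ps_1 holding 1.0f in every lane, which the sin/cos code
// below loads with *(v4sf*)_ps_1.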
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);

_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / Pi
#undef _PS_CONST_TYPE

static SYS_FORCE_INLINE void
vm_sincos(v4sf x, v4sf *s, v4sf *c)
{
    v4sf xmm1, xmm2, xmm3, sign_bit_sin, y;
    v4si emm0, emm2, emm4;
    sign_bit_sin = x;
    // take the absolute value
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    // extract the sign bit (upper one)
    sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

    // scale by 4/Pi
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

    // store the integer part of y in emm2
    emm2 = _mm_cvttps_epi32(y);

    // j = (j+1) & (~1) (see the cephes sources)
    emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    emm4 = emm2;

    // get the swap sign flag for the sine
    emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    // get the polynomial selection mask for the sine
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    // The magic pass: "Extended precision modular arithmetic"
    // x = ((x - y * DP1) - y * DP2) - y * DP3
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
    xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
    xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    // compute the sign flag for the cosine
    emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
    emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
    emm4 = _mm_slli_epi32(emm4, 29);
    v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
    // Evaluate the first polynomial (0 <= x <= Pi/4)
    v4sf z = _mm_mul_ps(x, x);
    y = *(v4sf*)_ps_coscof_p0;

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(v4sf*)_ps_1);

    // Evaluate the second polynomial (Pi/4 <= x <= 0)
    v4sf y2 = *(v4sf*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    // select the correct result from the two polynomials
    xmm3 = poly_mask;
    v4sf ysin2 = _mm_and_ps(xmm3, y2);
    v4sf ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2, ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1, ysin2);
    xmm2 = _mm_add_ps(y, y2);

    // update the sign
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
static SYS_FORCE_INLINE v4sf
vm_tan(v4sf x)
    { v4sf s, c; vm_sincos(x, &s, &c); return _mm_div_ps(s, c); }
#define VM_SINCOS vm_sincos
#define VM_SIN vm_sin
#define VM_COS vm_cos
#define VM_TAN vm_tan
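// Example (illustrative): vm_sincos() evaluates sine and cosine of all four
// lanes in a single pass, writing the results through the two pointers.
//
//     v4sf angles = vm_splats(0.0f, 0.5f, 1.0f, 2.0f);
//     v4sf s, c;
//     vm_sincos(angles, &s, &c);
//     v4sf t = VM_TAN(angles);   // equivalent to s / c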