#ifndef __VM_AVXFunc__
#define __VM_AVXFunc__

#define CPU_HAS_AVX_INSTR	1
#define VM_AVX_STYLE		1

#include <immintrin.h>
// Reinterpret the same 256 bits between the integer and float vector types.
static SYS_FORCE_INLINE v8sf
vm_v8sf(const v8si &a)
{ return _mm256_castsi256_ps(a); }

static SYS_FORCE_INLINE v8si
vm_v8si(const v8sf &a)
{ return _mm256_castps_si256(a); }
// MSVC cannot C-style cast between the vector types, so it goes through the
// helper functions; other compilers can cast directly.
#if defined(_MSC_VER)
#define V8SF(A)		vm_v8sf(A)
#define V8SI(A)		vm_v8si(A)
#else
#define V8SF(A)		(v8sf)A
#define V8SI(A)		(v8si)A
#endif
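
// Usage sketch: V8SF()/V8SI() reinterpret the same 256 bits between the
// float and integer views (no conversion), e.g. to use a compare result as
// a bit mask:
//   v8si m = VM_CMPGT_AVX(a, b);              // per-lane all-ones / all-zeros
//   v8sf r = _mm256_and_ps(V8SF(m), a);       // keep a[i] where a[i] > b[i]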
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
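
// The fallback above builds a 256-bit integer vector with v1 in the low
// 128 bits and v0 in the high 128 bits, matching _mm256_set_m128i(hi, lo);
// it is used by SSE_WRAPPER_I() below.
//   v8si whole = _mm256_set_m128i(hi_half, lo_half);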
#define VM_SHUFFLE_MASK_AVX(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))

template <int mask>
static SYS_FORCE_INLINE v8sf
vm_shuffle_avx(const v8sf &a, const v8sf &b)
{ return _mm256_shuffle_ps(a, b, mask); }
template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a, const T &b)
{
    return vm_shuffle_avx<VM_SHUFFLE_MASK_AVX(A,B,C,D)>(a, b);
}
template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<mask>(a, a);
}
template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<A,B,C,D>(a, a);
}
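
// Usage sketch: within each 128-bit half the result is
// { a[A], a[B], b[C], b[D] }, mirroring _mm256_shuffle_ps().
//   v8sf r = vm_shuffle_avx<2,3,0,1>(v);      // rotate each half by two lanes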
#if defined(_MSC_VER)
// Write through a union when the insert intrinsic is not available.
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}
#else
// _mm256_insert_epi32() requires a compile-time lane index, so dispatch the
// runtime index through a switch.
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    switch (n)
    {
	case 0 : return _mm256_insert_epi32(v, a, 0);
	case 1 : return _mm256_insert_epi32(v, a, 1);
	case 2 : return _mm256_insert_epi32(v, a, 2);
	case 3 : return _mm256_insert_epi32(v, a, 3);
	case 4 : return _mm256_insert_epi32(v, a, 4);
	case 5 : return _mm256_insert_epi32(v, a, 5);
	case 6 : return _mm256_insert_epi32(v, a, 6);
	case 7 : return _mm256_insert_epi32(v, a, 7);
    }
    return v;
}
#endif
static SYS_FORCE_INLINE v8sf
vm_insert_avx(const v8sf v, float a, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}
#if defined(_MSC_VER)
// Read through a union when the extract intrinsic is not available.
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    return comp[n];
}
#else
// _mm256_extract_epi32() requires a compile-time lane index, so dispatch the
// runtime index through a switch.
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    switch (n)
    {
	case 0 : return _mm256_extract_epi32(v, 0);
	case 1 : return _mm256_extract_epi32(v, 1);
	case 2 : return _mm256_extract_epi32(v, 2);
	case 3 : return _mm256_extract_epi32(v, 3);
	case 4 : return _mm256_extract_epi32(v, 4);
	case 5 : return _mm256_extract_epi32(v, 5);
	case 6 : return _mm256_extract_epi32(v, 6);
	case 7 : return _mm256_extract_epi32(v, 7);
    }
    return 0;
}
#endif

static SYS_FORCE_INLINE float
vm_extract_avx(const v8sf v, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    return comp[n];
}
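
// Usage sketch: runtime-indexed lane access.
//   v8sf v = vm_splats_avx(0.0f);
//   v = vm_insert_avx(v, 3.5f, 5);            // write lane 5
//   float x = vm_extract_avx(v, 5);           // x == 3.5f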
static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a)
{ return _mm256_set1_ps(a); }

// Splat the raw bits of an unsigned value by reinterpreting them as a float.
static SYS_FORCE_INLINE v8si
vm_splats_avx(uint32 a)
{
    SYS_FPRealUnionF	tmp;
    tmp.uval = a;
    return V8SI(vm_splats_avx(tmp.fval));
}
static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a)
{ return _mm256_set1_epi32(a); }
static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a0, float a1, float a2, float a3,
	      float a4, float a5, float a6, float a7)
{ return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0); }

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a0, int32 a1, int32 a2, int32 a3,
	      int32 a4, int32 a5, int32 a6, int32 a7)
{ return _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0); }
static SYS_FORCE_INLINE v8si
vm_load_avx(const int32 v[8])
{ return _mm256_loadu_si256((v8si *) v); }

static SYS_FORCE_INLINE v8sf
vm_load_avx(const float v[8])
{ return _mm256_loadu_ps(v); }

static SYS_FORCE_INLINE void
vm_store_avx(int32 dst[8], v8si value)
{ _mm256_storeu_si256((__m256i*) dst, value); }

static SYS_FORCE_INLINE void
vm_store_avx(float dst[8], v8sf value)
{ _mm256_storeu_ps(dst, value); }
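
// Usage sketch: the loads and stores are unaligned, so plain arrays work.
//   float buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
//   v8sf  v = vm_load_avx(buf);
//   vm_store_avx(buf, _mm256_add_ps(v, vm_splats_avx(1.0f)));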
static SYS_FORCE_INLINE v8sf
vm_negate_avx(v8sf a)
{ return _mm256_sub_ps(_mm256_setzero_ps(), a); }

static SYS_FORCE_INLINE v8sf
vm_abs_avx(v8sf a)
{ return _mm256_max_ps(a, vm_negate_avx(a)); }

// Fast approximate division and square root via the reciprocal intrinsics.
static SYS_FORCE_INLINE v8sf
vm_fdiv_avx(v8sf a, v8sf b)
{ return _mm256_mul_ps(a, _mm256_rcp_ps(b)); }

static SYS_FORCE_INLINE v8sf
vm_fsqrt_avx(v8sf a)
{ return _mm256_rcp_ps(_mm256_rsqrt_ps(a)); }

static SYS_FORCE_INLINE v8sf
vm_madd_avx(v8sf a, v8sf b, v8sf c)
{ return _mm256_add_ps(_mm256_mul_ps(a, b), c); }
// AVX (without AVX2) has no 256-bit integer arithmetic, so each operation is
// wrapped by splitting into 128-bit halves and applying the SSE op to each.
#define SSE_WRAPPER_I(NAME, OP)				\
static SYS_FORCE_INLINE v8si				\
NAME(v8si a, v8si b)					\
{							\
    __m128i la = _mm256_extractf128_si256(a, 0);	\
    __m128i ua = _mm256_extractf128_si256(a, 1);	\
    __m128i lb = _mm256_extractf128_si256(b, 0);	\
    __m128i ub = _mm256_extractf128_si256(b, 1);	\
    return _mm256_set_m128i(OP(ua, ub),			\
			    OP(la, lb));		\
}
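
// Instantiations for the integer helpers referenced by the VM_* defines
// below; the SSE ops paired here are the natural per-lane counterparts
// (assumed, not taken verbatim from the full header).
SSE_WRAPPER_I(vm_int_cmplt_avx,  _mm_cmplt_epi32)
SSE_WRAPPER_I(vm_int_cmpgt_avx,  _mm_cmpgt_epi32)
SSE_WRAPPER_I(vm_int_cmpeq_avx,  _mm_cmpeq_epi32)
SSE_WRAPPER_I(vm_int_add_avx,    _mm_add_epi32)
SSE_WRAPPER_I(vm_int_sub_avx,    _mm_sub_epi32)
SSE_WRAPPER_I(vm_int_mul_avx,    _mm_mullo_epi32)
SSE_WRAPPER_I(vm_int_and_avx,    _mm_and_si128)
SSE_WRAPPER_I(vm_int_andnot_avx, _mm_andnot_si128)
SSE_WRAPPER_I(vm_int_or_avx,     _mm_or_si128)
SSE_WRAPPER_I(vm_int_xor_avx,    _mm_xor_si128)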
static const v8si	theSSETrue_avx = vm_splats_avx(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits_avx(const v8si &a)
{
    return _mm256_movemask_ps(V8SF(vm_int_cmpeq_avx(a, theSSETrue_avx))) == 0xFF;
}
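
// Usage sketch: check whether a comparison holds in every lane.
//   if (vm_allbits_avx(VM_CMPLT_AVX(a, b)))
//       ...;                                  // all eight a[i] < b[i]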
#define VM_EXTRACT_AVX	vm_extract_avx
#define VM_INSERT_AVX	vm_insert_avx
#define VM_SPLATS_AVX	vm_splats_avx
#define VM_LOAD_AVX	vm_load_avx
#define VM_STORE_AVX	vm_store_avx

#define VM_CMPLT_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_LT_OQ))
#define VM_CMPLE_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_LE_OQ))
#define VM_CMPGT_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_GT_OQ))
#define VM_CMPGE_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_GE_OQ))
#define VM_CMPEQ_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_EQ_OQ))
#define VM_CMPNE_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_NEQ_OQ))

#define VM_ICMPLT_AVX	vm_int_cmplt_avx
#define VM_ICMPGT_AVX	vm_int_cmpgt_avx
#define VM_ICMPEQ_AVX	vm_int_cmpeq_avx

#define VM_IADD_AVX	vm_int_add_avx
#define VM_ISUB_AVX	vm_int_sub_avx
#define VM_IMUL_AVX	vm_int_mul_avx

#define VM_ADD_AVX	_mm256_add_ps
#define VM_SUB_AVX	_mm256_sub_ps
#define VM_MUL_AVX	_mm256_mul_ps
#define VM_DIV_AVX	_mm256_div_ps
#define VM_SQRT_AVX	_mm256_sqrt_ps
#define VM_ISQRT_AVX	_mm256_rsqrt_ps
#define VM_INVERT_AVX	_mm256_rcp_ps
#define VM_ABS_AVX	vm_abs_avx

#define VM_FDIV_AVX	vm_fdiv_avx
#define VM_NEG_AVX	vm_negate_avx
#define VM_FSQRT_AVX	vm_fsqrt_avx
#define VM_MADD_AVX	vm_madd_avx

#define VM_MIN_AVX	_mm256_min_ps
#define VM_MAX_AVX	_mm256_max_ps

#define VM_AND_AVX	vm_int_and_avx
#define VM_ANDNOT_AVX	vm_int_andnot_avx
#define VM_OR_AVX	vm_int_or_avx
#define VM_XOR_AVX	vm_int_xor_avx

#define VM_ALLBITS_AVX	vm_allbits_avx

#define VM_SHUFFLE_AVX	vm_shuffle_avx

#define VM_SSE_ROUND_MASK_AVX	0x6000
#define VM_SSE_ROUND_ZERO_AVX	0x6000
#define VM_SSE_ROUND_UP_AVX	0x4000
#define VM_SSE_ROUND_DOWN_AVX	0x2000
#define VM_SSE_ROUND_NEAR_AVX	0x0000

#define GETROUND_AVX()	(_mm_getcsr()&VM_SSE_ROUND_MASK_AVX)
#define SETROUND_AVX(x)	(_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK_AVX)))
#define VM_P_FLOOR_AVX()	uint rounding = GETROUND_AVX(); \
				SETROUND_AVX(VM_SSE_ROUND_DOWN_AVX);
#define VM_FLOOR_AVX		_mm256_cvtps_epi32
#define VM_INT_AVX		_mm256_cvttps_epi32
#define VM_E_FLOOR_AVX()	SETROUND_AVX(rounding);
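
// Usage sketch: VM_FLOOR_AVX depends on the MXCSR rounding mode, so bracket
// it with the prologue/epilogue macros that switch to round-down and restore:
//   VM_P_FLOOR_AVX();
//   v8si f = VM_FLOOR_AVX(x);                 // per-lane floor while rounding down
//   VM_E_FLOOR_AVX();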
#define VM_IFLOAT_AVX		_mm256_cvtepi32_ps

#define VM_SHIFTLEFT_AVX(A,C)	_mm256_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
#define VM_SHIFTRIGHT_AVX(A,C)	_mm256_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
#define _PI32AVX_CONST(Name, Val) \
    static const SYS_ALIGN(32) int _pi32avx_##Name[4] = \
	{ Val, Val, Val, Val }
#define _PS256_CONST(Name, Val) \
    static const SYS_ALIGN(32) float _ps256_##Name[8] = \
	{ Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
    static const SYS_ALIGN(32) Type _ps256_##Name[8] = \
	{ Val, Val, Val, Val, Val, Val, Val, Val }
_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
#undef _PI32AVX_CONST
#undef _PS256_CONST_TYPE
// Helpers to move a 256-bit integer vector to/from two 128-bit halves
// through memory (used by the AVX1 integer paths below).
typedef union imm_xmm_union {
    v8si	imm;
    __m128i	xmm[2];
} imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
    SYS_ALIGN(32) imm_xmm_union u; \
    u.imm = imm_; \
    xmm0_ = u.xmm[0]; xmm1_ = u.xmm[1]; \
}

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
    SYS_ALIGN(32) imm_xmm_union u; \
    u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
}
// Simultaneous sin/cos over eight lanes, following the cephes-style range
// reduction and polynomial evaluation used by sse_mathfun/avx_mathfun.
static SYS_FORCE_INLINE void
vm_sincos_avx(v8sf x, v8sf *s, v8sf *c)
{
    v8sf xmm1, xmm2, xmm3, sign_bit_sin, y;
    v8si imm0, imm2, imm4;
    __m128i imm0_1, imm0_2;
    __m128i imm2_1, imm2_2;
    __m128i imm4_1, imm4_2;

    sign_bit_sin = x;
    // Take the absolute value of x and keep the original sign bit.
    x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
    sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

    // Scale by 4/pi.
    y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

    // Store the integer part of y, then j = (j+1) & ~1.
    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
    y = _mm256_cvtepi32_ps(imm2);

    imm4_1 = imm2_1;
    imm4_2 = imm2_2;

    // Swap-sign flag for the sine.
    imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
    imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);

    imm0_1 = _mm_slli_epi32(imm0_1, 29);
    imm0_2 = _mm_slli_epi32(imm0_2, 29);

    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

    // Polynomial selection mask.
    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);

    v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
    v8sf poly_mask = _mm256_castsi256_ps(imm2);

    // Extended-precision modular arithmetic:
    // x = ((x - y*DP1) - y*DP2) - y*DP3
    xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
    xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
    xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
    xmm1 = _mm256_mul_ps(y, xmm1);
    xmm2 = _mm256_mul_ps(y, xmm2);
    xmm3 = _mm256_mul_ps(y, xmm3);
    x = _mm256_add_ps(x, xmm1);
    x = _mm256_add_ps(x, xmm2);
    x = _mm256_add_ps(x, xmm3);

    imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i*)_pi32avx_2);
    imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i*)_pi32avx_2);

    imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i*)_pi32avx_4);
    imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i*)_pi32avx_4);

    imm4_1 = _mm_slli_epi32(imm4_1, 29);
    imm4_2 = _mm_slli_epi32(imm4_2, 29);

    COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);

    v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

    sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    // First polynomial (0 <= x <= pi/4).
    v8sf z = _mm256_mul_ps(x,x);
    y = *(v8sf*)_ps256_coscof_p0;

    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
    y = _mm256_mul_ps(y, z);
    y = _mm256_mul_ps(y, z);
    v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
    y = _mm256_sub_ps(y, tmp);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

    // Second polynomial (-pi/4 <= x <= 0).
    v8sf y2 = *(v8sf*)_ps256_sincof_p0;
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_mul_ps(y2, x);
    y2 = _mm256_add_ps(y2, x);

    // Select the correct result from the two polynomials.
    xmm3 = poly_mask;
    v8sf ysin2 = _mm256_and_ps(xmm3, y2);
    v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
    y2 = _mm256_sub_ps(y2, ysin2);
    y = _mm256_sub_ps(y, ysin1);

    xmm1 = _mm256_add_ps(ysin1, ysin2);
    xmm2 = _mm256_add_ps(y, y2);

    // Update the signs.
    *s = _mm256_xor_ps(xmm1, sign_bit_sin);
    *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
#undef COPY_IMM_TO_XMM
#undef COPY_XMM_TO_IMM
static SYS_FORCE_INLINE v8sf
vm_sin_avx(v8sf x)  { v8sf s,c; vm_sincos_avx(x,&s,&c); return s; }

static SYS_FORCE_INLINE v8sf
vm_cos_avx(v8sf x)  { v8sf s,c; vm_sincos_avx(x,&s,&c); return c; }

static SYS_FORCE_INLINE v8sf
vm_tan_avx(v8sf x)  { v8sf s,c; vm_sincos_avx(x,&s,&c); return _mm256_div_ps(s,c); }
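
// Usage sketch: one call produces both results for eight angles (radians).
//   v8sf s, c;
//   vm_sincos_avx(vm_load_avx(theta), &s, &c); // s[i] = sin(theta[i]), c[i] = cos(theta[i])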
#define VM_SINCOS_AVX	vm_sincos_avx
#define VM_SIN_AVX	vm_sin_avx
#define VM_COS_AVX	vm_cos_avx
#define VM_TAN_AVX	vm_tan_avx

#endif // __VM_AVXFunc__