VM_AVXFunc.h
/*
 * PROPRIETARY INFORMATION. This software is proprietary to
 * Side Effects Software Inc., and is not to be reproduced,
 * transmitted, or disclosed in any way without written permission.
 *
 * NAME:	VM_AVXFunc.h ( VM Library, C++)
 *
 * COMMENTS:
 */

#ifndef __VM_AVXFunc__
#define __VM_AVXFunc__

#include "VM_API.h"
#include <SYS/SYS_Align.h>
#include <SYS/SYS_Types.h>

#define CPU_HAS_AVX_INSTR	1
#define VM_AVX_STYLE		1

#include <immintrin.h>
typedef __m256 v8sf;
typedef __m256i v8si;

// Plain casting (no conversion)
// MSVC has problems casting between __m256 and __m256i, so we implement a
// custom casting routine specifically for Windows.

#if defined(_MSC_VER)

static SYS_FORCE_INLINE v8sf
vm_v8sf(const v8si &a)
{
    union {
        v8si ival;
        v8sf fval;
    };
    ival = a;
    return fval;
}

static SYS_FORCE_INLINE v8si
vm_v8si(const v8sf &a)
{
    union {
        v8si ival;
        v8sf fval;
    };
    fval = a;
    return ival;
}

#define V8SF(A) vm_v8sf(A)
#define V8SI(A) vm_v8si(A)

#else

#define V8SF(A) (v8sf)A
#define V8SI(A) (v8si)A

// Intrinsic missing in gcc/clang
#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)

#endif

#define VM_SHUFFLE_MASK_AVX(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))

template <int mask>
static SYS_FORCE_INLINE v8sf
vm_shuffle_avx(const v8sf &a, const v8sf &b)
{
    return _mm256_shuffle_ps(a, b, mask);
}

template <int mask>
static SYS_FORCE_INLINE v8si
vm_shuffle_avx(const v8si &a, const v8si &b)
{
    return V8SI(_mm256_shuffle_ps(V8SF(a), V8SF(b), mask));
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a, const T &b)
{
    return vm_shuffle_avx<VM_SHUFFLE_MASK_AVX(A,B,C,D)>(a, b);
}

template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle_avx(const T &a)
{
    return vm_shuffle_avx<A,B,C,D>(a, a);
}
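
// Editor's note: the following usage sketch is not part of the original
// header; it only illustrates the template shuffle defined above.  Like
// _mm256_shuffle_ps, it shuffles within each 128-bit lane independently.
static SYS_FORCE_INLINE v8sf
vm_example_dup_lane0_avx(const v8sf &a)
{
    // VM_SHUFFLE_MASK_AVX(0,0,0,0) == 0: broadcast component 0 of each
    // 128-bit half across that half.
    return vm_shuffle_avx<0,0,0,0>(a);
}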

// The _mm256_insert_epi32 intrinsic is missing in VS2015
#if defined(_MSC_VER)
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}
#else
static SYS_FORCE_INLINE v8si
vm_insert_avx(const v8si v, int32 a, int n)
{
    switch(n)
    {
        case 0 : return _mm256_insert_epi32(v, a, 0);
        case 1 : return _mm256_insert_epi32(v, a, 1);
        case 2 : return _mm256_insert_epi32(v, a, 2);
        case 3 : return _mm256_insert_epi32(v, a, 3);
        case 4 : return _mm256_insert_epi32(v, a, 4);
        case 5 : return _mm256_insert_epi32(v, a, 5);
        case 6 : return _mm256_insert_epi32(v, a, 6);
        case 7 : return _mm256_insert_epi32(v, a, 7);
    }
    return v;
}
#endif

static SYS_FORCE_INLINE v8sf
vm_insert_avx(const v8sf v, float a, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    comp[n] = a;
    return vector;
}

// The _mm256_extract_epi32 intrinsic is missing in VS2015
#if defined(_MSC_VER)
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    union { v8si vector; int32 comp[8]; };
    vector = v;
    return comp[n];
}
#else
static SYS_FORCE_INLINE int
vm_extract_avx(const v8si v, int n)
{
    switch(n)
    {
        case 0 : return _mm256_extract_epi32(v, 0);
        case 1 : return _mm256_extract_epi32(v, 1);
        case 2 : return _mm256_extract_epi32(v, 2);
        case 3 : return _mm256_extract_epi32(v, 3);
        case 4 : return _mm256_extract_epi32(v, 4);
        case 5 : return _mm256_extract_epi32(v, 5);
        case 6 : return _mm256_extract_epi32(v, 6);
        case 7 : return _mm256_extract_epi32(v, 7);
    }
    return 0;
}
#endif

static SYS_FORCE_INLINE float
vm_extract_avx(const v8sf v, int n)
{
    union { v8sf vector; float comp[8]; };
    vector = v;
    return comp[n];
}
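
// Editor's note: hypothetical usage sketch, not part of the original header.
// Writes one 32-bit lane through vm_insert_avx and reads it back with
// vm_extract_avx.
static SYS_FORCE_INLINE float
vm_example_set_then_get_avx(v8sf v, float val, int lane)
{
    v = vm_insert_avx(v, val, lane);    // overwrite component 'lane'
    return vm_extract_avx(v, lane);     // returns 'val'
}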

static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a)
{
    return _mm256_set1_ps(a);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(uint32 a)
{
    SYS_FPRealUnionF tmp;
    tmp.uval = a;
    return V8SI(vm_splats_avx(tmp.fval));
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a)
{
    return _mm256_set1_epi32(a);
}

static SYS_FORCE_INLINE v8sf
vm_splats_avx(float a0, float a1, float a2, float a3,
              float a4, float a5, float a6, float a7)
{
    return _mm256_set_ps(a7, a6, a5, a4, a3, a2, a1, a0);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(uint a0, uint a1, uint a2, uint a3,
              uint a4, uint a5, uint a6, uint a7)
{
    return _mm256_set_epi32((int32)a7, (int32)a6, (int32)a5, (int32)a4,
                            (int32)a3, (int32)a2, (int32)a1, (int32)a0);
}

static SYS_FORCE_INLINE v8si
vm_splats_avx(int32 a0, int32 a1, int32 a2, int32 a3,
              int32 a4, int32 a5, int32 a6, int32 a7)
{
    return _mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0);
}

static SYS_FORCE_INLINE v8si
vm_load_avx(const int32 v[8])
{
    return _mm256_loadu_si256((v8si *) v);
}

static SYS_FORCE_INLINE v8sf
vm_load_avx(const float v[8])
{
    return _mm256_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store_avx(int32 dst[8], v8si value)
{
    _mm256_storeu_si256((__m256i *) dst, value);
}

static SYS_FORCE_INLINE void
vm_store_avx(float dst[8], v8sf value)
{
    _mm256_storeu_ps(dst, value);
}
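
// Editor's note: hypothetical usage sketch, not part of the original header.
// Round-trips eight floats through the unaligned load/store wrappers and a
// broadcast splat.
static SYS_FORCE_INLINE void
vm_example_scale_avx(float dst[8], const float src[8], float scale)
{
    v8sf v = vm_load_avx(src);                      // unaligned 256-bit load
    v = _mm256_mul_ps(v, vm_splats_avx(scale));     // multiply by broadcast scalar
    vm_store_avx(dst, v);                           // unaligned 256-bit store
}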

static SYS_FORCE_INLINE v8sf
vm_negate_avx(v8sf a)
{
    return _mm256_sub_ps(_mm256_setzero_ps(), a);
}

static SYS_FORCE_INLINE v8sf
vm_abs_avx(v8sf a)
{
    return _mm256_max_ps(a, vm_negate_avx(a));
}

static SYS_FORCE_INLINE v8sf
vm_fdiv_avx(v8sf a, v8sf b)
{
    return _mm256_mul_ps(a, _mm256_rcp_ps(b));
}

static SYS_FORCE_INLINE v8sf
vm_fsqrt_avx(v8sf a)
{
    return _mm256_rcp_ps(_mm256_rsqrt_ps(a));
}
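
// Editor's note: vm_fdiv_avx and vm_fsqrt_avx above are built on
// _mm256_rcp_ps/_mm256_rsqrt_ps, which are fast approximations (roughly
// 12 bits of precision), unlike _mm256_div_ps/_mm256_sqrt_ps.  The sketch
// below is not part of the original header; it only contrasts the two.
static SYS_FORCE_INLINE v8sf
vm_example_div_avx(v8sf a, v8sf b, bool fast)
{
    return fast ? vm_fdiv_avx(a, b)        // approximate reciprocal multiply
                : _mm256_div_ps(a, b);     // full-precision IEEE divide
}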

static SYS_FORCE_INLINE v8sf
vm_madd_avx(v8sf a, v8sf b, v8sf c)
{
    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
}

// Some integer instructions aren't available in AVX, so we split the 256-bit
// registers into two 128-bit halves and use their SSE equivalents.
#define SSE_WRAPPER_I(NAME, OP) \
static SYS_FORCE_INLINE v8si \
NAME(v8si a, v8si b) \
{ \
    __m128i la = _mm256_extractf128_si256(a, 0); \
    __m128i ua = _mm256_extractf128_si256(a, 1); \
    __m128i lb = _mm256_extractf128_si256(b, 0); \
    __m128i ub = _mm256_extractf128_si256(b, 1); \
    return _mm256_set_m128i(OP(ua, ub), \
                            OP(la, lb)); \
}
SSE_WRAPPER_I(vm_int_cmpeq_avx,  _mm_cmpeq_epi32)
SSE_WRAPPER_I(vm_int_cmplt_avx,  _mm_cmplt_epi32)
SSE_WRAPPER_I(vm_int_cmpgt_avx,  _mm_cmpgt_epi32)
SSE_WRAPPER_I(vm_int_add_avx,    _mm_add_epi32)
SSE_WRAPPER_I(vm_int_sub_avx,    _mm_sub_epi32)
SSE_WRAPPER_I(vm_int_mul_avx,    _mm_mullo_epi32)
SSE_WRAPPER_I(vm_int_and_avx,    _mm_and_si128)
SSE_WRAPPER_I(vm_int_andnot_avx, _mm_andnot_si128)
SSE_WRAPPER_I(vm_int_or_avx,     _mm_or_si128)
SSE_WRAPPER_I(vm_int_xor_avx,    _mm_xor_si128)
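
// Editor's note: hypothetical usage sketch, not part of the original header.
// The wrapped integer ops compose like ordinary two-operand intrinsics,
// e.g. (a + b) & mask, with each op splitting into two SSE halves internally.
static SYS_FORCE_INLINE v8si
vm_example_add_mask_avx(v8si a, v8si b, v8si mask)
{
    return vm_int_and_avx(vm_int_add_avx(a, b), mask);
}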

static const v8si theSSETrue_avx = vm_splats_avx(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits_avx(const v8si &a)
{
    return _mm256_movemask_ps(V8SF(vm_int_cmpeq_avx(a, theSSETrue_avx))) == 0xFF;
}
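
// Editor's note: hypothetical usage sketch, not part of the original header.
// vm_allbits_avx reports whether all eight lanes of a comparison are true.
static SYS_FORCE_INLINE bool
vm_example_all_equal_avx(v8si a, v8si b)
{
    return vm_allbits_avx(vm_int_cmpeq_avx(a, b));
}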


#define VM_EXTRACT_AVX	vm_extract_avx
#define VM_INSERT_AVX	vm_insert_avx
#define VM_SPLATS_AVX	vm_splats_avx
#define VM_LOAD_AVX	vm_load_avx
#define VM_STORE_AVX	vm_store_avx

#define VM_CMPLT_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_LT_OQ))
#define VM_CMPLE_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_LE_OQ))
#define VM_CMPGT_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_GT_OQ))
#define VM_CMPGE_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_GE_OQ))
#define VM_CMPEQ_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_EQ_OQ))
#define VM_CMPNE_AVX(A,B)	V8SI(_mm256_cmp_ps(A,B,_CMP_NEQ_OQ))

#define VM_ICMPLT_AVX	vm_int_cmplt_avx
#define VM_ICMPGT_AVX	vm_int_cmpgt_avx
#define VM_ICMPEQ_AVX	vm_int_cmpeq_avx

#define VM_IADD_AVX	vm_int_add_avx
#define VM_ISUB_AVX	vm_int_sub_avx
#define VM_IMUL_AVX	vm_int_mul_avx

#define VM_ADD_AVX	_mm256_add_ps
#define VM_SUB_AVX	_mm256_sub_ps
#define VM_MUL_AVX	_mm256_mul_ps
#define VM_DIV_AVX	_mm256_div_ps
#define VM_SQRT_AVX	_mm256_sqrt_ps
#define VM_ISQRT_AVX	_mm256_rsqrt_ps
#define VM_INVERT_AVX	_mm256_rcp_ps
#define VM_ABS_AVX	vm_abs_avx

#define VM_FDIV_AVX	vm_fdiv_avx
#define VM_NEG_AVX	vm_negate_avx
#define VM_FSQRT_AVX	vm_fsqrt_avx
#define VM_MADD_AVX	vm_madd_avx

#define VM_MIN_AVX	_mm256_min_ps
#define VM_MAX_AVX	_mm256_max_ps

#define VM_AND_AVX	vm_int_and_avx
#define VM_ANDNOT_AVX	vm_int_andnot_avx
#define VM_OR_AVX	vm_int_or_avx
#define VM_XOR_AVX	vm_int_xor_avx

#define VM_ALLBITS_AVX	vm_allbits_avx

#define VM_SHUFFLE_AVX	vm_shuffle_avx

// Float to integer conversions (MXCSR rounding control)
#define VM_SSE_ROUND_MASK_AVX	0x6000
#define VM_SSE_ROUND_ZERO_AVX	0x6000
#define VM_SSE_ROUND_UP_AVX	0x4000
#define VM_SSE_ROUND_DOWN_AVX	0x2000
#define VM_SSE_ROUND_NEAR_AVX	0x0000

#define GETROUND_AVX()	(_mm_getcsr()&VM_SSE_ROUND_MASK_AVX)
#define SETROUND_AVX(x)	(_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK_AVX)))

// The P functions must be invoked before FLOOR, and the E functions must be
// invoked afterwards to restore the rounding state.

#define VM_P_FLOOR_AVX()	uint rounding = GETROUND_AVX(); \
				SETROUND_AVX(VM_SSE_ROUND_DOWN_AVX);
#define VM_FLOOR_AVX		_mm256_cvtps_epi32
#define VM_INT_AVX		_mm256_cvttps_epi32
#define VM_E_FLOOR_AVX()	SETROUND_AVX(rounding);
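
// Editor's note: hypothetical usage sketch, not part of the original header.
// VM_FLOOR_AVX relies on the MXCSR round-down mode, so the conversion must be
// bracketed by VM_P_FLOOR_AVX / VM_E_FLOOR_AVX as described above.
static SYS_FORCE_INLINE v8si
vm_example_floor_avx(v8sf a)
{
    VM_P_FLOOR_AVX();                   // save rounding state, switch to round-down
    v8si result = VM_FLOOR_AVX(a);      // cvtps_epi32 now rounds toward -inf
    VM_E_FLOOR_AVX();                   // restore the previous rounding state
    return result;
}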

// Integer to float conversion
#define VM_IFLOAT_AVX	_mm256_cvtepi32_ps

// Bit shifting, where A is a v8si and C is an int shift count
#define VM_SHIFTLEFT_AVX(A,C)	_mm256_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
#define VM_SHIFTRIGHT_AVX(A,C)	_mm256_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
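
// Editor's note: hypothetical usage sketch, not part of the original header.
// The shift count is passed through the low lane of an __m128i because these
// _mm256_sll_epi32/_mm256_srl_epi32 forms take their count from an XMM
// register rather than an immediate.
static SYS_FORCE_INLINE v8si
vm_example_shift_left_avx(v8si a, int count)
{
    return VM_SHIFTLEFT_AVX(a, count);  // a << count in each 32-bit lane
}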

//
// SSE Trig sourced from...
// http://software-lisc.fbk.eu/avx_mathfun/avx_mathfun.h
//
static SYS_FORCE_INLINE void
vm_sincos_avx(v8sf x, v8sf *s, v8sf *c)
{

// AVX implementation of sincos
//
// Based on "sse_mathfun.h", by Julien Pommier
// http://gruntthepeon.free.fr/ssemath/
//
// Copyright (C) 2012 Giovanni Garberoglio
// Interdisciplinary Laboratory for Computational Science (LISC)
// Fondazione Bruno Kessler and University of Trento
// via Sommarive, 18
// I-38123 Trento (Italy)
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgment in the product documentation would be
//    appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
//
// (this is the zlib license)

#define _PI32AVX_CONST(Name, Val) \
    static const SYS_ALIGN(32) int _pi32avx_##Name[4] = \
        { Val, Val, Val, Val }

    _PI32AVX_CONST(1, 1);
    _PI32AVX_CONST(inv1, ~1);
    _PI32AVX_CONST(2, 2);
    _PI32AVX_CONST(4, 4);

    // declare some AVX constants -- why can't I figure a better way to do that?
#define _PS256_CONST(Name, Val) \
    static const SYS_ALIGN(32) float _ps256_##Name[8] = \
        { Val, Val, Val, Val, Val, Val, Val, Val }
#define _PS256_CONST_TYPE(Name, Type, Val) \
    static const SYS_ALIGN(32) Type _ps256_##Name[8] = \
        { Val, Val, Val, Val, Val, Val, Val, Val }

    _PS256_CONST(1 , 1.0f);
    _PS256_CONST(0p5, 0.5f);

    _PS256_CONST_TYPE(sign_mask, uint32, 0x80000000);
    _PS256_CONST_TYPE(inv_sign_mask, uint32, ~0x80000000);

    _PS256_CONST(minus_cephes_DP1, -0.78515625);
    _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
    _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
    _PS256_CONST(sincof_p0, -1.9515295891E-4);
    _PS256_CONST(sincof_p1,  8.3321608736E-3);
    _PS256_CONST(sincof_p2, -1.6666654611E-1);
    _PS256_CONST(coscof_p0,  2.443315711809948E-005);
    _PS256_CONST(coscof_p1, -1.388731625493765E-003);
    _PS256_CONST(coscof_p2,  4.166664568298827E-002);
    _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI

#undef _PI32AVX_CONST
#undef _PS256_CONST
#undef _PS256_CONST_TYPE

    typedef union imm_xmm_union {
        v8si imm;
        __m128i xmm[2];
    } imm_xmm_union;

#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \
        SYS_ALIGN(32) imm_xmm_union u; \
        u.imm = imm_; \
        xmm0_ = u.xmm[0]; \
        xmm1_ = u.xmm[1]; \
    }

#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \
        SYS_ALIGN(32) imm_xmm_union u; \
        u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \
    }

    v8sf xmm1, xmm2, xmm3, sign_bit_sin, y;
    v8si imm0, imm2, imm4;

    __m128i imm0_1, imm0_2;
    __m128i imm2_1, imm2_2;
    __m128i imm4_1, imm4_2;

    sign_bit_sin = x;
    // take the absolute value
    x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
    // extract the sign bit (upper one)
    sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

    // scale by 4/Pi
    y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

    // we use SSE2 routines to perform the integer ops
    COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

    imm2_1 = _mm_add_epi32(imm2_1, *(__m128i*)_pi32avx_1);
    imm2_2 = _mm_add_epi32(imm2_2, *(__m128i*)_pi32avx_1);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_inv1);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_inv1);

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
    y = _mm256_cvtepi32_ps(imm2);

    imm4_1 = imm2_1;
    imm4_2 = imm2_2;

    imm0_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_4);
    imm0_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_4);

    imm0_1 = _mm_slli_epi32(imm0_1, 29);
    imm0_2 = _mm_slli_epi32(imm0_2, 29);

    COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

    imm2_1 = _mm_and_si128(imm2_1, *(__m128i*)_pi32avx_2);
    imm2_2 = _mm_and_si128(imm2_2, *(__m128i*)_pi32avx_2);

    imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
    imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

    COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);

    v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
    v8sf poly_mask = _mm256_castsi256_ps(imm2);

    // The magic pass: "Extended precision modular arithmetic"
    // x = ((x - y * DP1) - y * DP2) - y * DP3;
    xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
    xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
    xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
    xmm1 = _mm256_mul_ps(y, xmm1);
    xmm2 = _mm256_mul_ps(y, xmm2);
    xmm3 = _mm256_mul_ps(y, xmm3);
    x = _mm256_add_ps(x, xmm1);
    x = _mm256_add_ps(x, xmm2);
    x = _mm256_add_ps(x, xmm3);

    imm4_1 = _mm_sub_epi32(imm4_1, *(__m128i*)_pi32avx_2);
    imm4_2 = _mm_sub_epi32(imm4_2, *(__m128i*)_pi32avx_2);

    imm4_1 = _mm_andnot_si128(imm4_1, *(__m128i*)_pi32avx_4);
    imm4_2 = _mm_andnot_si128(imm4_2, *(__m128i*)_pi32avx_4);

    imm4_1 = _mm_slli_epi32(imm4_1, 29);
    imm4_2 = _mm_slli_epi32(imm4_2, 29);

    COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);

    v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

    sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    // Evaluate the first polynomial (0 <= x <= Pi/4)
    v8sf z = _mm256_mul_ps(x, x);
    y = *(v8sf*)_ps256_coscof_p0;

    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
    y = _mm256_mul_ps(y, z);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
    y = _mm256_mul_ps(y, z);
    y = _mm256_mul_ps(y, z);
    v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
    y = _mm256_sub_ps(y, tmp);
    y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

    // Evaluate the second polynomial (Pi/4 <= x <= 0)
    v8sf y2 = *(v8sf*)_ps256_sincof_p0;
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
    y2 = _mm256_mul_ps(y2, z);
    y2 = _mm256_mul_ps(y2, x);
    y2 = _mm256_add_ps(y2, x);

    // select the correct result from the two polynomials
    xmm3 = poly_mask;
    v8sf ysin2 = _mm256_and_ps(xmm3, y2);
    v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
    y2 = _mm256_sub_ps(y2, ysin2);
    y = _mm256_sub_ps(y, ysin1);

    xmm1 = _mm256_add_ps(ysin1, ysin2);
    xmm2 = _mm256_add_ps(y, y2);

    // update the sign
    *s = _mm256_xor_ps(xmm1, sign_bit_sin);
    *c = _mm256_xor_ps(xmm2, sign_bit_cos);

#undef COPY_IMM_TO_XMM
#undef COPY_XMM_TO_IMM
}

static SYS_FORCE_INLINE v8sf
vm_sin_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return s;
}

static SYS_FORCE_INLINE v8sf
vm_cos_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return c;
}

static SYS_FORCE_INLINE v8sf
vm_tan_avx(v8sf x)
{
    v8sf s, c;
    vm_sincos_avx(x, &s, &c);
    return _mm256_div_ps(s, c);
}
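
// Editor's note: hypothetical usage sketch, not part of the original header.
// Computes sine and cosine of eight angles with a single vm_sincos_avx call.
static SYS_FORCE_INLINE void
vm_example_sincos_avx(const float angles[8], float s[8], float c[8])
{
    v8sf vs, vc;
    vm_sincos_avx(vm_load_avx(angles), &vs, &vc);
    vm_store_avx(s, vs);
    vm_store_avx(c, vc);
}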

#define VM_SINCOS_AVX	vm_sincos_avx
#define VM_SIN_AVX	vm_sin_avx
#define VM_COS_AVX	vm_cos_avx
#define VM_TAN_AVX	vm_tan_avx

#endif