#ifndef __VM_SSEFunc__
#define __VM_SSEFunc__

#pragma warning(disable:4799)

#define CPU_HAS_SIMD_INSTR 1
#define VM_SSE_STYLE 1
#include <emmintrin.h>

typedef __m128 v4sf;
typedef __m128i v4si;

#if defined(__SSE4_1__) || defined(_MSC_VER)
#define VM_SSE41_STYLE 1
#include <smmintrin.h>
#endif
#define V4SF(A) _mm_castsi128_ps(A)
#define V4SI(A) _mm_castps_si128(A)

#define VM_SHUFFLE_MASK(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))
template <int mask>
static SYS_FORCE_INLINE v4sf
vm_shuffle(const v4sf &a, const v4sf &b)
{
    return _mm_shuffle_ps(a, b, mask);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a, const T &b)
{
    return vm_shuffle<VM_SHUFFLE_MASK(A,B,C,D)>(a, b);
}
template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<A,B,C,D>(a, a);
}
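// Example (illustrative): with the overloads above, vm_shuffle<A,B,C,D>(a, b)
// yields (a[A], a[B], b[C], b[D]), and the single-argument forms permute one
// vector with itself.
//
//     v4sf xxxx = vm_shuffle<0,0,0,0>(a);     // broadcast lane 0 of a
//     v4sf mix  = vm_shuffle<0,1,2,3>(a, b);  // (a0, a1, b2, b3)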
#if defined(VM_SSE41_STYLE)

static SYS_FORCE_INLINE v4si
vm_insert(const v4si v, int32 a, int n)
{
    switch (n)
    {
        case 0: return _mm_insert_epi32(v, a, 0);
        case 1: return _mm_insert_epi32(v, a, 1);
        case 2: return _mm_insert_epi32(v, a, 2);
        case 3: return _mm_insert_epi32(v, a, 3);
    }
    return v;
}

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    switch (n)
    {
        case 0: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,0,0));
        case 1: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,1,0));
        case 2: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,2,0));
        case 3: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,3,0));
    }
    return v;
}
static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    switch (n)
    {
        case 0: return _mm_extract_epi32(v, 0);
        case 1: return _mm_extract_epi32(v, 1);
        case 2: return _mm_extract_epi32(v, 2);
        case 3: return _mm_extract_epi32(v, 3);
    }
    return 0;
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    // _mm_extract_ps() returns the lane's raw bits as an int, so reinterpret
    // them through a union before returning the float value.
    union { int32 ival; float fval; } tmp;
    switch (n)
    {
        case 0: tmp.ival = _mm_extract_ps(v, 0); break;
        case 1: tmp.ival = _mm_extract_ps(v, 1); break;
        case 2: tmp.ival = _mm_extract_ps(v, 2); break;
        case 3: tmp.ival = _mm_extract_ps(v, 3); break;
    }
    return tmp.fval;
}
#else // !VM_SSE41_STYLE

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    comp[n] = a;
    return vector;
}

static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    union { v4si vector; int32 comp[4]; };
    vector = v;
    return comp[n];
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    return comp[n];
}

#endif // VM_SSE41_STYLE
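// Example (illustrative): whichever path is compiled in, vm_insert() returns a
// copy of the vector with one lane replaced and vm_extract() reads a lane back.
//
//     v4sf v = vm_splats(1.0f);
//     v = vm_insert(v, 5.0f, 2);          // lanes become (1, 1, 5, 1)
//     float third = vm_extract(v, 2);     // 5.0f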
static SYS_FORCE_INLINE v4sf
vm_splats(float a)
{
    return _mm_set1_ps(a);
}

static SYS_FORCE_INLINE v4sf
vm_splats(float a, float b, float c, float d)
{
    return vm_shuffle<0,2,0,2>(
            vm_shuffle<0>(_mm_set_ss(a), _mm_set_ss(b)),
            vm_shuffle<0>(_mm_set_ss(c), _mm_set_ss(d)));
}
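// How the four-argument splat works: vm_shuffle<0>(x, y) produces
// (x0, x0, y0, y0), so the two inner calls build (a, a, b, b) and (c, c, d, d),
// and the outer vm_shuffle<0,2,0,2> picks lanes 0 and 2 of each, giving
// (a, b, c, d).  Example (illustrative):
//
//     v4sf v = vm_splats(1.0f, 2.0f, 3.0f, 4.0f);   // lanes (1, 2, 3, 4)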
static SYS_FORCE_INLINE v4si
vm_load(const int32 v[4])
{
    return V4SI(_mm_loadu_ps((const float *)v));
}

static SYS_FORCE_INLINE v4sf
vm_load(const float v[4])
{
    return _mm_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store(int32 dst[4], v4si value)
{
    _mm_storeu_si128((__m128i *)dst, value);
}

static SYS_FORCE_INLINE void
vm_store(float dst[4], v4sf value)
{
    _mm_storeu_ps(dst, value);
}
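// Example (illustrative): both overloads use unaligned loads/stores, so plain
// C arrays work without any alignment attributes.
//
//     float buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//     v4sf  v = vm_load(buf);
//     vm_store(buf, _mm_add_ps(v, v));    // buf becomes { 2, 4, 6, 8 }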
static SYS_FORCE_INLINE v4sf vm_negate(v4sf a) { return _mm_sub_ps(_mm_setzero_ps(), a); }
static SYS_FORCE_INLINE v4sf vm_abs(v4sf a) { return _mm_max_ps(a, vm_negate(a)); }
static SYS_FORCE_INLINE v4sf vm_fdiv(v4sf a, v4sf b) { return _mm_mul_ps(a, _mm_rcp_ps(b)); }
static SYS_FORCE_INLINE v4sf vm_fsqrt(v4sf a) { return _mm_rcp_ps(_mm_rsqrt_ps(a)); }
static SYS_FORCE_INLINE v4sf vm_madd(v4sf a, v4sf b, v4sf c) { return _mm_add_ps(_mm_mul_ps(a, b), c); }
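// Accuracy note: vm_fdiv() and vm_fsqrt() are built on the approximate
// _mm_rcp_ps()/_mm_rsqrt_ps() instructions (roughly 12 bits of precision), so
// they trade accuracy for speed.  Example (illustrative):
//
//     v4sf fast = vm_fdiv(a, b);      // approximate a / b
//     v4sf full = _mm_div_ps(a, b);   // IEEE-accurate a / b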
static const v4si theSSETrue = vm_splats(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits(const v4si &a)
{
    return _mm_movemask_ps(V4SF(_mm_cmpeq_epi32(a, theSSETrue))) == 0xF;
}

static SYS_FORCE_INLINE int
vm_signbits(const v4si &a)
{
    return _mm_movemask_ps(V4SF(a));
}

static SYS_FORCE_INLINE int
vm_signbits(const v4sf &a)
{
    return _mm_movemask_ps(a);
}
#define VM_EXTRACT vm_extract
#define VM_INSERT vm_insert
#define VM_SPLATS vm_splats
#define VM_LOAD vm_load
#define VM_STORE vm_store

#define VM_CMPLT(A,B) V4SI(_mm_cmplt_ps(A,B))
#define VM_CMPLE(A,B) V4SI(_mm_cmple_ps(A,B))
#define VM_CMPGT(A,B) V4SI(_mm_cmpgt_ps(A,B))
#define VM_CMPGE(A,B) V4SI(_mm_cmpge_ps(A,B))
#define VM_CMPEQ(A,B) V4SI(_mm_cmpeq_ps(A,B))
#define VM_CMPNE(A,B) V4SI(_mm_cmpneq_ps(A,B))
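// Example (illustrative): the VM_CMP* macros return per-lane masks (all bits
// set where the comparison holds), which combine with vm_allbits() and
// vm_signbits() for whole-vector tests.
//
//     v4si lt = VM_CMPLT(a, b);        // per-lane a < b
//     bool all_lt = vm_allbits(lt);    // true only if every lane passes
//     int  which  = vm_signbits(lt);   // 4-bit mask of passing lanes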
#define VM_ICMPLT _mm_cmplt_epi32
#define VM_ICMPGT _mm_cmpgt_epi32
#define VM_ICMPEQ _mm_cmpeq_epi32

#define VM_IADD _mm_add_epi32
#define VM_ISUB _mm_sub_epi32
#if defined(VM_SSE41_STYLE)
#define VM_IMUL _mm_mullo_epi32
#endif
#define VM_ADD _mm_add_ps
#define VM_SUB _mm_sub_ps
#define VM_MUL _mm_mul_ps
#define VM_DIV _mm_div_ps
#define VM_SQRT _mm_sqrt_ps
#define VM_ISQRT _mm_rsqrt_ps
#define VM_INVERT _mm_rcp_ps
#define VM_ABS vm_abs

#define VM_FDIV vm_fdiv
#define VM_NEG vm_negate
#define VM_FSQRT vm_fsqrt
#define VM_MADD vm_madd

#define VM_MIN _mm_min_ps
#define VM_MAX _mm_max_ps

#define VM_AND _mm_and_si128
#define VM_ANDNOT _mm_andnot_si128
#define VM_OR _mm_or_si128
#define VM_XOR _mm_xor_si128

#define VM_ALLBITS vm_allbits
#define VM_SIGNBITS vm_signbits

#define VM_SHUFFLE vm_shuffle
#define VM_SSE_ROUND_MASK 0x6000 // 0b110000000000000
#define VM_SSE_ROUND_ZERO 0x6000 // 0b110000000000000 (RZ)
#define VM_SSE_ROUND_UP   0x4000 // 0b100000000000000 (R+)
#define VM_SSE_ROUND_DOWN 0x2000 // 0b010000000000000 (R-)
#define VM_SSE_ROUND_NEAR 0x0000 // 0b000000000000000 (RN)

#define GETROUND() (_mm_getcsr()&VM_SSE_ROUND_MASK)
#define SETROUND(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK)))

#define VM_SSE_ROUND_ZERO _MM_ROUND_TOWARDS_ZERO
#define VM_SSE_ROUND_UP   _MM_ROUND_UP
#define VM_SSE_ROUND_DOWN _MM_ROUND_DOWN
#define VM_SSE_ROUND_NEAR _MM_ROUND_NEAREST

#define GETROUND() _MM_GET_ROUNDING_MODE()
#define SETROUND(x) _MM_SET_ROUNDING_MODE(x)
#define VM_P_FLOOR() uint rounding = GETROUND(); \
                     SETROUND(VM_SSE_ROUND_DOWN);
#define VM_FLOOR _mm_cvtps_epi32
#define VM_INT _mm_cvttps_epi32
#define VM_E_FLOOR() SETROUND(rounding);
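// Example (illustrative): VM_FLOOR converts with the current rounding mode, so
// it is meant to be bracketed by VM_P_FLOOR()/VM_E_FLOOR(), which save the
// MXCSR rounding bits, force round-down, and restore them afterwards.
//
//     VM_P_FLOOR();
//     v4si f = VM_FLOOR(values);   // per-lane floor() as int32
//     VM_E_FLOOR();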
#define VM_IFLOAT _mm_cvtepi32_ps

#define VM_SHIFTLEFT(A,C) _mm_sll_epi32(A,_mm_setr_epi32(C,0,0,0))
#define VM_SHIFTRIGHT(A,C) _mm_srl_epi32(A,_mm_setr_epi32(C,0,0,0))
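// Example (illustrative): the shift count is passed through the low lanes of a
// vector, so C may be a runtime value rather than a compile-time constant.
//
//     v4si doubled = VM_SHIFTLEFT(v, 1);   // each lane multiplied by 2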
#define _PS_CONST(Name, Val) \
    static const SYS_ALIGN16 float _ps_##Name[4] = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
    static const SYS_ALIGN16 int _pi32_##Name[4] = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
    static const SYS_ALIGN16 Type _ps_##Name[4] = { Val, Val, Val, Val }
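// Expansion example (illustrative): _PS_CONST(1, 1.0f) declares a 16-byte
// aligned constant _ps_1 holding 1.0f in every lane, which the sin/cos code
// below loads with *(v4sf*)_ps_1.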
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);

_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / Pi
#undef _PS_CONST_TYPE

static SYS_FORCE_INLINE void
vm_sincos(v4sf x, v4sf *s, v4sf *c)
{
    v4sf xmm1, xmm2, xmm3, sign_bit_sin, y;
    v4si emm0, emm2, emm4;
    sign_bit_sin = x;
    // take the absolute value
    x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
    // extract the sign bit (upper one)
    sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);

    // scale by 4/Pi
    y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);

    // store the integer part of y in emm2
    emm2 = _mm_cvttps_epi32(y);

    // j = (j+1) & (~1) (see the cephes sources)
    emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    emm4 = emm2;

    // get the swap sign flag for the sine
    emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    // get the polynomial selection mask for the sine
    emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    // The magic pass: "Extended precision modular arithmetic"
    // x = ((x - y * DP1) - y * DP2) - y * DP3
    xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
    xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
    xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    // compute the sign flag for the cosine
    emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
    emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
    emm4 = _mm_slli_epi32(emm4, 29);
    v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
    // Evaluate the first polynomial (0 <= x <= Pi/4)
    v4sf z = _mm_mul_ps(x, x);
    y = *(v4sf*)_ps_coscof_p0;

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(v4sf*)_ps_1);

    // Evaluate the second polynomial (Pi/4 <= x <= 0)
    v4sf y2 = *(v4sf*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    // select the correct result from the two polynomials
    xmm3 = poly_mask;
    v4sf ysin2 = _mm_and_ps(xmm3, y2);
    v4sf ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2, ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1, ysin2);
    xmm2 = _mm_add_ps(y, y2);

    // update the sign
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
static SYS_FORCE_INLINE v4sf
vm_tan(v4sf x)
    { v4sf s, c; vm_sincos(x, &s, &c); return _mm_div_ps(s, c); }
#define VM_SINCOS vm_sincos
#define VM_SIN vm_sin
#define VM_COS vm_cos
#define VM_TAN vm_tan
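// Example (illustrative): vm_sincos() evaluates sine and cosine of all four
// lanes in a single pass, writing the results through the two pointers.
//
//     v4sf angles = vm_splats(0.0f, 0.5f, 1.0f, 2.0f);
//     v4sf s, c;
//     vm_sincos(angles, &s, &c);
//     v4sf t = VM_TAN(angles);   // equivalent to s / c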