68 #if defined(__CUDA_ARCH__)
72 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
73 # include <x86intrin.h>
74 #elif defined(__GNUC__) && defined(__ARM_NEON__)
75 # include <arm_neon.h>
81 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
86 #if defined(__CUDA_ARCH__) && !defined(OIIO_NO_SSE)
90 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
91 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
92 # define OIIO_SIMD_SSE 4
98 # elif defined(__SSSE3__)
99 # define OIIO_SIMD_SSE 3
108 # define OIIO_SIMD_SSE 2
111 # define OIIO_SIMD_MAX_SIZE_BYTES 16
112 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
113 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
115 # define OIIO_SIMD_SSE 0
118 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
120 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
121 # define OIIO_SIMD_AVX 2
123 # define OIIO_SIMD_AVX 1
127 # undef OIIO_SIMD_MAX_SIZE_BYTES
128 # define OIIO_SIMD_MAX_SIZE_BYTES 32
129 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
130 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
131 # if defined(__AVX512F__)
132 # undef OIIO_SIMD_AVX
133 # define OIIO_SIMD_AVX 512
134 # undef OIIO_SIMD_MAX_SIZE_BYTES
135 # define OIIO_SIMD_MAX_SIZE_BYTES 64
137 # define OIIO_SIMD 16
138 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
139 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
140 # define OIIO_AVX512F_ENABLED 1
142 # if defined(__AVX512DQ__)
143 # define OIIO_AVX512DQ_ENABLED 1
145 # define OIIO_AVX512DQ_ENABLED 0
147 # if defined(__AVX512PF__)
148 # define OIIO_AVX512PF_ENABLED 1
150 # define OIIO_AVX512PF_ENABLED 0
152 # if defined(__AVX512ER__)
153 # define OIIO_AVX512ER_ENABLED 1
155 # define OIIO_AVX512ER_ENABLED 0
157 # if defined(__AVX512CD__)
158 # define OIIO_AVX512CD_ENABLED 1
160 # define OIIO_AVX512CD_ENABLED 0
162 # if defined(__AVX512BW__)
163 # define OIIO_AVX512BW_ENABLED 1
165 # define OIIO_AVX512BW_ENABLED 0
167 # if defined(__AVX512VL__)
168 # define OIIO_AVX512VL_ENABLED 1
170 # define OIIO_AVX512VL_ENABLED 0
173 # define OIIO_SIMD_AVX 0
174 # define OIIO_AVX512VL_ENABLED 0
175 # define OIIO_AVX512DQ_ENABLED 0
176 # define OIIO_AVX512PF_ENABLED 0
177 # define OIIO_AVX512ER_ENABLED 0
178 # define OIIO_AVX512CD_ENABLED 0
179 # define OIIO_AVX512BW_ENABLED 0
183 # define OIIO_FMA_ENABLED 1
185 # define OIIO_FMA_ENABLED 0
187 #if defined(__AVX512IFMA__)
188 # define OIIO_AVX512IFMA_ENABLED 1
190 # define OIIO_AVX512IFMA_ENABLED 0
193 #if defined(__F16C__)
194 # define OIIO_F16C_ENABLED 1
196 # define OIIO_F16C_ENABLED 0
201 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
203 # define OIIO_SIMD_NEON 1
204 # define OIIO_SIMD_MAX_SIZE_BYTES 16
205 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
206 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
208 # define OIIO_SIMD_NEON 0
214 # define OIIO_SIMD4_ALIGN
215 # define OIIO_SIMD_MAX_SIZE_BYTES 16
218 #ifndef OIIO_SIMD8_ALIGN
219 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
221 #ifndef OIIO_SIMD16_ALIGN
222 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
231 #define OIIO_SIMD_HAS_MATRIX4 1
232 #define OIIO_SIMD_HAS_FLOAT8 1
233 #define OIIO_SIMD_HAS_SIMD8 1
234 #define OIIO_SIMD_HAS_SIMD16 1
288 template<>
struct simd_bool_t<4> {
typedef __m128
type; };
292 template<>
struct simd_raw_t<
int,8> {
typedef __m256i
type; };
293 template<>
struct simd_raw_t<
float,8> {
typedef __m256
type; };
294 template<>
struct simd_bool_t<8> {
typedef __m256
type; };
297 #if OIIO_SIMD_AVX >= 512
298 template<>
struct simd_raw_t<
int,16> {
typedef __m512i
type; };
299 template<>
struct simd_raw_t<
float,16> {
typedef __m512
type; };
300 template<>
struct simd_bool_t<16> {
typedef __mmask16
type; };
309 template<>
struct simd_raw_t<
float,4> {
typedef float32x4_t
type; };
310 template<>
struct simd_bool_t<4> {
typedef uint32x4_t
type; };
/// Traits template mapping an element type and a lane count to the
/// corresponding SIMD vector class. The primary template is
/// intentionally empty; concrete mappings are supplied by
/// specializations elsewhere in this header.
template<typename T, int elements>
struct VecType {};
/// Traits template yielding a human-readable name for a SIMD type.
/// The unspecialized version answers "unknown"; specializations
/// elsewhere provide the real names for the concrete vector classes.
template<typename T>
struct SimdTypeName {
    static const char* name() {
        return "unknown";
    }
};
// Helpers to declare properly aligned, file-local constant arrays used
// to feed 4-wide SIMD loads. The *_CONST forms splat one value across
// all four entries; the *_CONST4 forms take four explicit values.
# define OIIO_SIMD_FLOAT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_INT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
# define OIIO_SIMD_UINT4_CONST(name,val) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
# define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
    static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
// Aligned constant-array declarators for 8-wide SIMD loads: the
// *_CONST forms splat a single value across all eight entries, the
// *_CONST8 forms take eight explicit values.
# define OIIO_SIMD_FLOAT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
                                                    (val), (val), (val), (val) }
# define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
                                                    (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_INT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
                                                  (val), (val), (val), (val) }
# define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
                                                  (v4), (v5), (v6), (v7) }
# define OIIO_SIMD_UINT8_CONST(name,val) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
                                                       (val), (val), (val), (val) }
# define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
    static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
                                                       (v4), (v5), (v6), (v7) }
// Aligned constant-array declarators for 16-wide SIMD loads: the
// *_CONST forms splat one value across all sixteen entries, the
// *_CONST16 forms take sixteen explicit values.
# define OIIO_SIMD_VFLOAT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN float name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_INT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
# define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN int name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
# define OIIO_SIMD_UINT16_CONST(name,val) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (val), (val), (val), (val), (val), (val), (val), (val), \
        (val), (val), (val), (val), (val), (val), (val), (val) }
// Aligned constant-array declarator for sixteen explicit uint32 values.
// Bug fix: the replacement text previously expanded to sixteen copies
// of an undefined symbol 'val' (copy-pasted from the splat form)
// instead of using its v0..v15 parameters, so any expansion of this
// macro failed to compile and its arguments were ignored.
# define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
    static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
        (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
        (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
// Loop helpers for the scalar fallback implementations.  SIMD_DO runs
// a statement per lane; SIMD_CONSTRUCT fills m_val[]; the *_PAD form
// additionally zeroes the padding lanes; SIMD_RETURN builds and
// returns a result vector; SIMD_RETURN_REDUCE folds lanes into a
// scalar.  NOTE(review): several of these expand to multiple bare
// statements (no do{}while(0) wrapper), so they must be used as full
// statements — never as the body of an unbraced 'if' or 'for'.
#define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
#define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
#define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
    for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
#define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
#define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
472 explicit vbool4 (
const bool *
a);
482 load (
bool(a),
bool(b),
bool(c),
bool(d));
531 void load (
bool a,
bool b,
bool c,
bool d);
568 template<
int i0,
int i1,
int i2,
int i3>
586 bool all (
const vbool4&
v);
587 bool any (
const vbool4&
v);
588 bool none (
const vbool4&
v);
/// Scalar overload of all(): with a single boolean "lane", all lanes
/// are true exactly when the value itself is true.
inline bool all (bool v)
{
    return v;
}
616 vbool8 (
bool a,
bool b,
bool c,
bool d,
bool e,
bool f,
bool g,
bool h);
622 vbool8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
679 void load (
bool a,
bool b,
bool c,
bool d,
680 bool e,
bool f,
bool g,
bool h);
718 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
736 bool all (
const vbool8&
v);
737 bool any (
const vbool8&
v);
738 bool none (
const vbool8&
v);
766 vbool16 (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
767 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
773 vbool16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
774 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
829 void load (
bool v0,
bool v1,
bool v2,
bool v3,
bool v4,
bool v5,
bool v6,
bool v7,
830 bool v8,
bool v9,
bool v10,
bool v11,
bool v12,
bool v13,
bool v14,
bool v15);
881 bool all (
const vbool16&
v);
882 bool any (
const vbool16&
v);
883 bool none (
const vbool16&
v);
919 vint4 (
const int *vals);
922 explicit vint4 (
const unsigned short *vals);
925 explicit vint4 (
const short *vals);
928 explicit vint4 (
const unsigned char *vals);
931 explicit vint4 (
const char *vals);
998 void load (
int a,
int b,
int c,
int d);
1042 template<
int scale=4>
1045 template<
int scale=4>
1047 template<
int scale=4>
1051 template<
int scale=4>
1054 template<
int scale=4>
1056 template<
int scale=4>
1107 vint4
srl (
const vint4&
val,
const unsigned int bits);
1111 template<
int i0,
int i1,
int i2,
int i3>
1131 vint4
blend (
const vint4&
a,
const vint4&
b,
const vbool4&
mask);
1136 vint4
blend0 (
const vint4&
a,
const vbool4&
mask);
1146 vint4
select (
const vbool4&
mask,
const vint4&
a,
const vint4&
b);
1149 vint4
abs (
const vint4&
a);
1150 vint4
min (
const vint4&
a,
const vint4&
b);
1151 vint4
max (
const vint4&
a,
const vint4&
b);
1154 vint4
rotl (
const vint4&
x,
const int s);
1156 vint4
rotl32 (
const vint4&
x,
const unsigned int k);
1159 vint4
andnot (
const vint4&
a,
const vint4&
b);
1166 void transpose (vint4 &
a, vint4 &
b, vint4 &
c, vint4 &d);
1167 void transpose (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d,
1168 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);
1170 vint4
AxBxCxDx (
const vint4&
a,
const vint4&
b,
const vint4&
c,
const vint4& d);
1173 vint4
safe_mod (
const vint4&
a,
const vint4&
b);
1206 vint8 (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1209 vint8 (
const int *vals);
1212 explicit vint8 (
const unsigned short *vals);
1215 explicit vint8 (
const short *vals);
1218 explicit vint8 (
const unsigned char *vals);
1221 explicit vint8 (
const char *vals);
1297 void load (
int a,
int b,
int c,
int d,
int e,
int f,
int g,
int h);
1341 template<
int scale=4>
1344 template<
int scale=4>
1346 template<
int scale=4>
1350 template<
int scale=4>
1353 template<
int scale=4>
1355 template<
int scale=4>
1407 vint8
srl (
const vint8&
val,
const unsigned int bits);
1411 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
1434 vint8
blend (
const vint8&
a,
const vint8&
b,
const vbool8&
mask);
1439 vint8
blend0 (
const vint8&
a,
const vbool8&
mask);
1449 vint8
select (
const vbool8&
mask,
const vint8&
a,
const vint8&
b);
1452 vint8
abs (
const vint8&
a);
1453 vint8
min (
const vint8&
a,
const vint8&
b);
1454 vint8
max (
const vint8&
a,
const vint8&
b);
1457 vint8
rotl (
const vint8&
x,
const int s);
1459 vint8
rotl32 (
const vint8&
x,
const unsigned int k);
1462 vint8
andnot (
const vint8&
a,
const vint8&
b);
1470 vint8
safe_mod (
const vint8&
a,
const vint8&
b);
1501 vint16 (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1502 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1505 vint16 (
const int *vals);
1508 explicit vint16 (
const unsigned short *vals);
1511 explicit vint16 (
const short *vals);
1514 explicit vint16 (
const unsigned char *vals);
1517 explicit vint16 (
const char *vals);
1596 void load (
int v0,
int v1,
int v2,
int v3,
int v4,
int v5,
int v6,
int v7,
1597 int v8,
int v9,
int v10,
int v11,
int v12,
int v13,
int v14,
int v15);
1641 template<
int scale=4>
1644 template<
int scale=4>
1646 template<
int scale=4>
1648 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
1652 template<
int scale=4>
1655 template<
int scale=4>
1657 template<
int scale=4>
1659 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
1711 vint16
srl (
const vint16&
val,
const unsigned int bits);
1714 template<
int i0,
int i1,
int i2,
int i3>
1718 template<
int i> vint16
shuffle4 (
const vint16&
a);
1721 template<
int i0,
int i1,
int i2,
int i3>
1725 template<
int i> vint16
shuffle (
const vint16&
a);
1744 vint16
blend (
const vint16&
a,
const vint16&
b,
const vbool16&
mask);
1749 vint16
blend0 (
const vint16&
a,
const vbool16&
mask);
1759 vint16
select (
const vbool16&
mask,
const vint16&
a,
const vint16&
b);
1762 vint16
abs (
const vint16&
a);
1763 vint16
min (
const vint16&
a,
const vint16&
b);
1764 vint16
max (
const vint16&
a,
const vint16&
b);
1767 vint16
rotl (
const vint16&
x,
const int s);
1769 vint16
rotl32 (
const vint16&
x,
const unsigned int k);
1772 vint16
andnot (
const vint16&
a,
const vint16&
b);
1780 vint16
safe_mod (
const vint16&
a,
const vint16&
b);
1859 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1913 void load (
float a,
float b,
float c,
float d=0.0
f);
1934 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1944 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1959 template<
int scale=4>
1962 template<
int scale=4>
1964 template<
int scale=4>
1968 template<
int scale=4>
1971 template<
int scale=4>
1973 template<
int scale=4>
2029 template<
int i0,
int i1,
int i2,
int i3>
2049 vfloat4
vdot (
const vfloat4 &
a,
const vfloat4 &
b);
2052 float dot (
const vfloat4 &
a,
const vfloat4 &
b);
2056 vfloat4
vdot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2059 float dot3 (
const vfloat4 &
a,
const vfloat4 &
b);
2063 vfloat4
blend (
const vfloat4&
a,
const vfloat4&
b,
const vbool4&
mask);
2068 vfloat4
blend0 (
const vfloat4&
a,
const vbool4&
mask);
2077 vfloat4
safe_div (
const vfloat4 &
a,
const vfloat4 &
b);
2080 vfloat3
hdiv (
const vfloat4 &
a);
2085 vfloat4
select (
const vbool4&
mask,
const vfloat4&
a,
const vfloat4&
b);
2088 vfloat4
abs (
const vfloat4&
a);
2089 vfloat4
sign (
const vfloat4&
a);
2090 vfloat4
ceil (
const vfloat4&
a);
2091 vfloat4
floor (
const vfloat4&
a);
2092 vint4
ifloor (
const vfloat4&
a);
2103 vfloat4
round (
const vfloat4&
a);
2110 vint4
rint (
const vfloat4&
a);
2113 vfloat4
sqrt (
const vfloat4 &
a);
2114 vfloat4
rsqrt (
const vfloat4 &
a);
2116 vfloat4
min (
const vfloat4&
a,
const vfloat4&
b);
2117 vfloat4
max (
const vfloat4&
a,
const vfloat4&
b);
2122 vfloat4
andnot (
const vfloat4&
a,
const vfloat4&
b);
2125 vfloat4
madd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2126 vfloat4
msub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2127 vfloat4
nmadd (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2128 vfloat4
nmsub (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c);
2133 void transpose (vfloat4 &
a, vfloat4 &
b, vfloat4 &
c, vfloat4 &d);
2134 void transpose (
const vfloat4&
a,
const vfloat4&
b,
const vfloat4&
c,
const vfloat4& d,
2135 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2138 vfloat4
AxBxCxDx (
const vfloat4&
a,
const vfloat4&
b,
2139 const vfloat4&
c,
const vfloat4& d);
2199 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2238 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2247 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2288 vfloat3
abs (
const vfloat3&
a);
2289 vfloat3
sign (
const vfloat3&
a);
2290 vfloat3
ceil (
const vfloat3&
a);
2291 vfloat3
floor (
const vfloat3&
a);
2292 vfloat3
round (
const vfloat3&
a);
2302 #ifndef OIIO_SIMD_SSE
2303 : m_mat(Imath::UNINITIALIZED)
2310 m_row[0].load (M[0]);
2311 m_row[1].load (M[1]);
2312 m_row[2].load (M[2]);
2313 m_row[3].load (M[3]);
2322 m_row[0].load (f+0);
2323 m_row[1].load (f+4);
2324 m_row[2].load (f+8);
2325 m_row[3].load (f+12);
2335 m_row[0] =
a; m_row[1] =
b; m_row[2] =
c; m_row[3] = d;
2345 const float *
c,
const float *d) {
2347 m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2349 memcpy (m_mat[0], a, 4*
sizeof(
float));
2350 memcpy (m_mat[1], b, 4*
sizeof(
float));
2351 memcpy (m_mat[2], c, 4*
sizeof(
float));
2352 memcpy (m_mat[3], d, 4*
sizeof(
float));
2358 float f10,
float f11,
float f12,
float f13,
2359 float f20,
float f21,
float f22,
float f23,
2360 float f30,
float f31,
float f32,
float f33)
2363 m_row[0].load (f00, f01, f02, f03);
2364 m_row[1].load (f10, f11, f12, f13);
2365 m_row[2].load (f20, f21, f22, f23);
2366 m_row[3].load (f30, f31, f32, f33);
2368 m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2369 m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2370 m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2371 m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2421 vfloat3
transformp (
const matrix44 &M,
const vfloat3 &V);
2425 vfloat3
transformv (
const matrix44 &M,
const vfloat3 &V);
2429 vfloat3
transformvT (
const matrix44 &M,
const vfloat3 &V);
2461 float e,
float f,
float g,
float h) {
load(a,b,c,d,e,f,g,h); }
2499 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2553 void load (
float a,
float b,
float c,
float d,
2554 float e,
float f,
float g,
float h);
2575 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2585 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2600 template<
int scale=4>
2602 template<
int scale=4>
2605 template<
int scale=4>
2609 template<
int scale=4>
2612 template<
int scale=4>
2614 template<
int scale=4>
2656 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
2676 vfloat8
vdot (
const vfloat8 &
a,
const vfloat8 &
b);
2679 float dot (
const vfloat8 &
a,
const vfloat8 &
b);
2683 vfloat8
vdot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2686 float dot3 (
const vfloat8 &
a,
const vfloat8 &
b);
2690 vfloat8
blend (
const vfloat8&
a,
const vfloat8&
b,
const vbool8&
mask);
2695 vfloat8
blend0 (
const vfloat8&
a,
const vbool8&
mask);
2704 vfloat8
safe_div (
const vfloat8 &
a,
const vfloat8 &
b);
2709 vfloat8
select (
const vbool8&
mask,
const vfloat8&
a,
const vfloat8&
b);
2712 vfloat8
abs (
const vfloat8&
a);
2713 vfloat8
sign (
const vfloat8&
a);
2714 vfloat8
ceil (
const vfloat8&
a);
2715 vfloat8
floor (
const vfloat8&
a);
2716 vint8
ifloor (
const vfloat8&
a);
2726 vfloat8
round (
const vfloat8&
a);
2733 vint8
rint (
const vfloat8&
a);
2736 vfloat8
sqrt (
const vfloat8 &
a);
2737 vfloat8
rsqrt (
const vfloat8 &
a);
2739 vfloat8
min (
const vfloat8&
a,
const vfloat8&
b);
2740 vfloat8
max (
const vfloat8&
a,
const vfloat8&
b);
2745 vfloat8
andnot (
const vfloat8&
a,
const vfloat8&
b);
2748 vfloat8
madd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2749 vfloat8
msub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2750 vfloat8
nmadd (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2751 vfloat8
nmsub (
const vfloat8&
a,
const vfloat8&
b,
const vfloat8&
c);
2781 float v4,
float v5,
float v6,
float v7,
2782 float v8,
float v9,
float v10,
float v11,
2783 float v12,
float v13,
float v14,
float v15);
2824 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2879 float v4,
float v5,
float v6,
float v7,
2880 float v8,
float v9,
float v10,
float v11,
2881 float v12,
float v13,
float v14,
float v15);
2902 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2912 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
2927 template<
int scale=4>
2930 template<
int scale=4>
2932 template<
int scale=4>
2934 gather_mask<scale> (
vbool_t(mask), baseptr, vindex);
2938 template<
int scale=4>
2941 template<
int scale=4>
2943 template<
int scale=4>
2945 scatter_mask<scale> (
vbool_t(mask), baseptr, vindex);
2986 template<
int i0,
int i1,
int i2,
int i3>
2993 template<
int i0,
int i1,
int i2,
int i3>
2997 template<
int i> vfloat16
shuffle (
const vfloat16&
a);
3014 vfloat16
blend (
const vfloat16&
a,
const vfloat16&
b,
const vbool4&
mask);
3019 vfloat16
blend0 (
const vfloat16&
a,
const vbool4&
mask);
3028 vfloat16
safe_div (
const vfloat16 &
a,
const vfloat16 &
b);
3033 vfloat16
select (
const vbool16&
mask,
const vfloat16&
a,
const vfloat16&
b);
3036 vfloat16
abs (
const vfloat16&
a);
3037 vfloat16
sign (
const vfloat16&
a);
3038 vfloat16
ceil (
const vfloat16&
a);
3039 vfloat16
floor (
const vfloat16&
a);
3040 vint16
ifloor (
const vfloat16&
a);
3051 vfloat16
round (
const vfloat16&
a);
3058 vint16
rint (
const vfloat16&
a);
3061 vfloat16
sqrt (
const vfloat16 &
a);
3062 vfloat16
rsqrt (
const vfloat16 &
a);
3064 vfloat16
min (
const vfloat16&
a,
const vfloat16&
b);
3065 vfloat16
max (
const vfloat16&
a,
const vfloat16&
b);
3070 vfloat16
andnot (
const vfloat16&
a,
const vfloat16&
b);
3073 vfloat16
madd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3074 vfloat16
msub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3075 vfloat16
nmadd (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3076 vfloat16
nmsub (
const vfloat16&
a,
const vfloat16&
b,
const vfloat16&
c);
3085 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3086 _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
3095 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3096 _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
3104 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3105 return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
3112 #if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
3113 return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
3151 return ((_mm_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3165 m_val[i] = value ? -1 : 0;
3171 for (
int i = 1; i < a.elements; ++i)
3172 cout <<
' ' << a[i];
3179 m_simd = _mm_castsi128_ps(_mm_set1_epi32(-
int(a)));
3180 #elif OIIO_SIMD_NEON
3181 m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3193 m_simd = _mm_castsi128_ps(_mm_set_epi32(-
int(d), -
int(c), -
int(b), -
int(a)));
3205 load (a[0], a[1], a[2], a[3]);
3216 return _mm_movemask_ps(
m_simd);
3236 m_simd = _mm_setzero_ps();
3245 return _mm_setzero_ps();
3254 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3255 __m128i anyval = _mm_undefined_si128();
3257 __m128i anyval = _mm_setzero_si128();
3259 return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3271 for (
int i = 0; i <
n; ++i)
3272 values[i] =
m_val[i] ?
true :
false;
3280 #elif OIIO_SIMD_NEON
3281 return vmvnq_u32(a.
simd());
3289 return _mm_and_ps (a.
simd(), b.
simd());
3290 #elif OIIO_SIMD_NEON
3291 return vandq_u32(a.
simd(), b.
simd());
3299 return _mm_or_ps (a.
simd(), b.
simd());
3300 #elif OIIO_SIMD_NEON
3301 return vorrq_u32(a.
simd(), b.
simd());
3309 return _mm_xor_ps (a.
simd(), b.
simd());
3310 #elif OIIO_SIMD_NEON
3311 return veorq_u32(a.
simd(), b.
simd());
3334 #elif OIIO_SIMD_NEON
3335 return vmvnq_u32(a.
m_simd);
3343 return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3344 #elif OIIO_SIMD_NEON
3353 return _mm_xor_ps (a, b);
3354 #elif OIIO_SIMD_NEON
3366 template<
int i0,
int i1,
int i2,
int i3>
3368 return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3,
i2,
i1, i0));
3372 #if OIIO_SIMD_SSE >= 3
3375 return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(
a)));
3378 return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(
a)));
3381 return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(
a)));
3386 template<
int i0,
int i1,
int i2,
int i3>
3388 return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3,
i2,
i1, i0)));
3392 #if OIIO_SIMD_SSE >= 3
3395 return _mm_moveldup_ps(a);
3398 return _mm_movehdup_ps(a);
3401 return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
3408 template<
int i0,
int i1,
int i2,
int i3>
3411 return shuffle_sse<i0,i1,i2,i3> (a.
simd());
3419 return shuffle<i,i,i,i>(
a);
3427 #if OIIO_SIMD_SSE >= 4
3428 return _mm_extract_epi32(_mm_castps_si128(a.
simd()), i);
3437 #if OIIO_SIMD_SSE >= 4
3438 int ival = -
int(val);
3439 return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3449 return _mm_testc_ps (v,
vbool4(
true)) != 0;
3451 return _mm_movemask_ps(v.
simd()) == 0xf;
3459 return ! _mm_testz_ps (v, v);
3461 return _mm_movemask_ps(v) != 0;
3480 return ((_mm256_movemask_ps(
m_simd) >> i) & 1) ? -1 : 0;
3488 m_val[i] = value ? -1 : 0;
3499 for (
int i = 1; i < a.elements; ++i)
3500 cout <<
' ' << a[i];
3507 m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-
int(a)));
3508 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3519 bool e,
bool f,
bool g,
bool h) {
3523 m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-
int(h), -
int(g), -
int(f), -
int(e),
3524 -
int(d), -
int(c), -
int(b), -
int(a)));
3525 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3541 bool e,
bool f,
bool g,
bool h) {
3542 load (a, b, c, d, e, f, g, h);
3546 int e,
int f,
int g,
int h) {
3547 load (
bool(a),
bool(b),
bool(c),
bool(d),
3548 bool(e),
bool(f),
bool(g),
bool(h));
3552 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3568 return _mm256_movemask_ps(
m_simd);
3584 m_simd = _mm256_setzero_ps();
3592 return _mm256_setzero_ps();
3601 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3603 __m256i anyval = _mm256_undefined_si256();
3604 return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3606 return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3620 for (
int i = 0; i <
n; ++i)
3621 values[i] =
m_val[i] ?
true :
false;
3627 return _mm256_castps256_ps128 (
simd());
3635 return _mm256_extractf128_ps (
simd(), 1);
3644 __m256
r = _mm256_castps128_ps256 (lo);
3645 m_simd = _mm256_insertf128_ps (r, hi, 1);
3664 return _mm256_and_ps (a.
simd(), b.
simd());
3672 return _mm256_or_ps (a.
simd(), b.
simd());
3680 return _mm256_xor_ps (a.
simd(), b.
simd());
3711 #if OIIO_SIMD_AVX >= 2
3712 return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3714 return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3722 return _mm256_xor_ps (a, b);
3729 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
3731 #if OIIO_SIMD_AVX >= 2
3733 return _mm256_permutevar8x32_ps (a.
simd(), index.
simd());
3735 return vbool8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3740 return shuffle<i,i,i,i,i,i,i,i>(
a);
3746 #if OIIO_SIMD_AVX && !_WIN32
3747 return _mm256_extract_epi32(_mm256_castps_si256(a.
simd()), i);
3755 #if OIIO_SIMD_AVX && !_WIN32
3756 int ival = -
int(val);
3757 return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.
simd()), ival, i));
3768 return _mm256_testc_ps (v,
vbool8(
true)) != 0;
3777 return ! _mm256_testz_ps (v, v);
3797 #if OIIO_SIMD_AVX >= 512
3798 return (
int(
m_simd) >> i) & 1;
3800 return (
m_bits >> i) & 1;
3807 bits &= (0xffff ^ (1<<i));
3808 bits |= (
int(value)<<i);
3815 for (
int i = 1; i < a.elements; ++i)
3816 cout <<
' ' << a[i];
3832 bool v4,
bool v5,
bool v6,
bool v7,
3833 bool v8,
bool v9,
bool v10,
bool v11,
3834 bool v12,
bool v13,
bool v14,
bool v15) {
3854 bool v4,
bool v5,
bool v6,
bool v7,
3855 bool v8,
bool v9,
bool v10,
bool v11,
3856 bool v12,
bool v13,
bool v14,
bool v15) {
3857 load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
3861 int v4,
int v5,
int v6,
int v7,
3862 int v8,
int v9,
int v10,
int v11,
3863 int v12,
int v13,
int v14,
int v15) {
3864 load (
bool(v0),
bool(v1),
bool(v2),
bool(v3),
3865 bool(v4),
bool(v5),
bool(v6),
bool(v7),
3866 bool(v8),
bool(v9),
bool(v10),
bool(v11),
3867 bool(v12),
bool(v13),
bool(v14),
bool(v15));
3875 load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3876 a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3892 #if OIIO_SIMD_AVX >= 512
3920 for (
int i = 0; i <
n; ++i)
3921 values[i] =
m_bits & (1<<i);
3927 #if OIIO_SIMD_AVX >= 512
3928 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()&0xff, -1));
3935 #if OIIO_SIMD_AVX >= 512
3936 return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (
bitmask()>>8, -1));
3944 #if OIIO_SIMD_AVX >= 512
3945 return _mm512_knot (a.
simd());
3952 #if OIIO_SIMD_AVX >= 512
3953 return _mm512_kand (a.
simd(), b.
simd());
3960 #if OIIO_SIMD_AVX >= 512
3961 return _mm512_kor (a.
simd(), b.
simd());
3968 #if OIIO_SIMD_AVX >= 512
3969 return _mm512_kxor (a.
simd(), b.
simd());
3995 #if OIIO_SIMD_AVX >= 512
3996 return _mm512_kxnor (a.
simd(), b.
simd());
4003 #if OIIO_SIMD_AVX >= 512
4004 return _mm512_kxor (a.
simd(), b.
simd());
4068 m_simd = _mm_set1_epi32 (a);
4069 #elif OIIO_SIMD_NEON
4070 m_simd = vdupq_n_s32 (a);
4080 m_simd = _mm_set_epi32 (d, c, b, a);
4081 #elif OIIO_SIMD_NEON
4083 m_simd = vld1q_s32 (values);
4112 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4113 m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values);
4117 m_simd = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4121 m_simd = _mm_castpd_si128 (_mm_load_sd ((
const double*)values));
4126 m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((
const double*)values)),
4127 _mm_load_ss ((
const float *)values + 2)));
4137 for (
int i = 0; i <
n; ++i)
4138 m_val[i] = values[i];
4146 #if OIIO_SIMD_SSE >= 4
4148 simd_t a = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
4149 m_simd = _mm_cvtepu16_epi32 (a);
4157 #if OIIO_SIMD_SSE >= 4
4159 simd_t a = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
4160 m_simd = _mm_cvtepi16_epi32 (a);
4168 #if OIIO_SIMD_SSE >= 4
4170 simd_t a = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4171 m_simd = _mm_cvtepu8_epi32 (a);
4179 #if OIIO_SIMD_SSE >= 4
4181 simd_t a = _mm_castps_si128 (_mm_load_ss ((
const float *)values));
4182 m_simd = _mm_cvtepi8_epi32 (a);
4222 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4223 m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask), (
const simd_t *)values);
4224 #elif OIIO_SIMD_AVX >= 2
4233 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4235 #elif OIIO_SIMD_AVX >= 2
4236 m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(mask));
4244 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4245 _mm_mask_storeu_epi32 (values, __mmask8(mask),
m_simd);
4246 #elif OIIO_SIMD_AVX >= 2
4249 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
4255 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4257 #elif OIIO_SIMD_AVX >= 2
4258 _mm_maskstore_epi32 (values, _mm_castps_si128(mask),
m_simd);
4260 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
4265 template <
int scale>
4269 #if OIIO_SIMD_AVX >= 2
4270 m_simd = _mm_i32gather_epi32 (baseptr, vindex,
scale);
4280 #if OIIO_SIMD_AVX >= 2
4281 m_simd = _mm_mask_i32gather_epi32 (
m_simd, baseptr, vindex, _mm_cvtps_epi32(mask),
scale);
4291 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4293 _mm_i32scatter_epi32 (baseptr, vindex,
m_simd,
scale);
4302 const vint_t& vindex)
const
4304 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4315 m_simd = _mm_setzero_si128();
4325 return _mm_setzero_si128();
4338 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
4339 __m128i anyval = _mm_undefined_si128();
4341 __m128i anyval = _mm_setzero_si128();
4343 return _mm_cmpeq_epi8 (anyval, anyval);
4352 return vint4 (start+0*step, start+1*step, start+2*step, start+3*step);
4357 return vint4 (1<<0, 1<<1, 1<<2, 1<<3);
4363 return _mm_add_epi32 (a.
simd(), b.
simd());
4376 return _mm_sub_epi32 (_mm_setzero_si128(), a);
4385 return _mm_sub_epi32 (a.
simd(), b.
simd());
4401 #if OIIO_SIMD_SSE >= 4
4402 return _mm_mullo_epi32(a, b);
4408 t0 = _mm_mul_epu32 (a, b);
4409 t1 = _mm_mul_epu32 (_mm_shuffle_epi32 (a, 0xB1),
4410 _mm_shuffle_epi32 (b, 0xB1));
4411 t0 = _mm_shuffle_epi32 (t0, 0xD8);
4412 t1 = _mm_shuffle_epi32 (t1, 0xD8);
4413 return _mm_unpacklo_epi32 (t0, t1);
4421 return mul_epi32 (a.
simd(), b.
simd());
4461 return _mm_and_si128 (a.
simd(), b.
simd());
4462 #elif OIIO_SIMD_NEON
4463 return vandq_s32(a.
simd(), b.
simd());
4476 return _mm_or_si128 (a.
simd(), b.
simd());
4477 #elif OIIO_SIMD_NEON
4478 return vorrq_s32(a.
simd(), b.
simd());
4489 return _mm_xor_si128 (a.
simd(), b.
simd());
4490 #elif OIIO_SIMD_NEON
4491 return veorq_s32(a.
simd(), b.
simd());
4504 #elif OIIO_SIMD_NEON
4505 return vmvnq_s32(a.
m_simd);
4513 return _mm_slli_epi32 (a, bits);
4520 return a = a << bits;
4526 return _mm_srai_epi32 (a, bits);
4533 return a = a >> bits;
4539 return _mm_srli_epi32 (a, bits);
4548 return _mm_castsi128_ps(_mm_cmpeq_epi32 (a, b));
4549 #elif OIIO_SIMD_NEON
4563 return _mm_castsi128_ps(_mm_cmpgt_epi32 (a, b));
4571 return _mm_castsi128_ps(_mm_cmplt_epi32 (a, b));
4578 return (b < a) | (a ==
b);
4582 return (b > a) | (a ==
b);
4587 for (
int i = 1; i < val.elements; ++i)
4588 cout <<
' ' << val[i];
4595 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4599 _mm_mask_storeu_epi32 (values, __mmask8(~(0xf << n)),
m_simd);
4605 for (
int i = 0; i <
n; ++i)
4606 values[i] =
m_val[i];
4608 for (
int i = 0; i <
n; ++i)
4609 values[i] =
m_val[i];
4616 #if OIIO_AVX512VL_ENABLED
4617 _mm_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xf),
m_simd);
4622 vint4 low = _mm_shufflelo_epi16 (clamped, (0<<0) | (2<<2) | (1<<4) | (1<<6));
4624 vint4 high = _mm_shufflehi_epi16 (clamped, (1<<0) | (1<<2) | (0<<4) | (2<<6));
4626 vint4 highswapped = shuffle_sse<2,3,0,1>(high);
4627 vint4
result = low | highswapped;
4628 _mm_storel_pd ((
double *)values, _mm_castsi128_pd(result));
4638 #if OIIO_AVX512VL_ENABLED
4639 _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf),
m_simd);
4644 vint4 swapped = shuffle_sse<1,0,3,2>(clamped);
4645 vint4 shifted = swapped << 8;
4646 vint4 merged = clamped | shifted;
4647 vint4 merged2 = shuffle_sse<2,2,2,2>(merged);
4648 vint4 shifted2 = merged2 << 16;
4649 vint4
result = merged | shifted2;
4650 *(
int*)values = result[0];
4660 template<
int i0,
int i1,
int i2,
int i3>
4663 return shuffle_sse<i0,i1,i2,i3> (__m128i(a));
4665 return vint4(a[i0], a[
i1], a[
i2], a[i3]);
4674 #if OIIO_SIMD_SSE >= 4
4675 return _mm_extract_epi32(v.
simd(), i);
4683 return _mm_cvtsi128_si32(v.simd());
4689 #if OIIO_SIMD_SSE >= 4
4690 return _mm_insert_epi32 (a.
simd(),
val, i);
4713 return _mm_castps_si128 (x.
simd());
4715 return *(
vint4 *)&x;
4725 #if OIIO_SIMD_SSE >= 3
4734 #elif OIIO_SIMD_SSE >= 2
4738 vint4 ab_ab_cd_cd = shuffle<1,0,3,2>(
v) + v;
4740 vint4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
4742 vint4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;
4753 #elif OIIO_SIMD_NEON && defined(__aarch64__)
4754 return vaddvq_s32(v);
4763 vint4 ab = v & shuffle<1,1,3,3>(
v);
4764 vint4 abcd = ab & shuffle<2>(ab);
4765 return extract<0>(abcd);
4774 vint4 ab = v | shuffle<1,1,3,3>(
v);
4775 vint4 abcd = ab | shuffle<2>(ab);
4776 return extract<0>(abcd);
4785 #if OIIO_SIMD_SSE >= 4
4786 return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps(a.
simd()),
4787 _mm_castsi128_ps(b.
simd()), mask));
4789 return _mm_or_si128 (_mm_and_si128(_mm_castps_si128(mask.
simd()), b.
simd()),
4790 _mm_andnot_si128(_mm_castps_si128(mask.
simd()), a.
simd()));
4791 #elif OIIO_SIMD_NEON
4800 return _mm_and_si128(_mm_castps_si128(mask), a.
simd());
4809 return _mm_andnot_si128(_mm_castps_si128(mask), a.
simd());
4817 return blend (b, a, mask);
4823 #if OIIO_SIMD_SSE >= 3
4824 return _mm_abs_epi32(a.
simd());
4825 #elif OIIO_SIMD_NEON
4826 return vabsq_s32(a.
simd());
4835 #if OIIO_SIMD_SSE >= 4
4836 return _mm_min_epi32 (a, b);
4837 #elif OIIO_SIMD_NEON
4838 return vminq_s32(a, b);
4846 #if OIIO_SIMD_SSE >= 4
4847 return _mm_max_epi32 (a, b);
4848 #elif OIIO_SIMD_NEON
4849 return vmaxq_s32(a, b);
4857 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4862 return (x<<s) |
srl(x,32-s);
4864 return (x<<s) |
srl(x,32-s);
4876 return _mm_andnot_si128 (a.
simd(), b.
simd());
4928 m_simd = _mm256_set1_epi32 (a);
4929 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4939 int e,
int f,
int g,
int h) {
4941 m_simd = _mm256_set_epi32 (h, g, f, e, d, c, b, a);
4942 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4961 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4973 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4974 m_simd = _mm256_maskz_loadu_epi32 ((~(0xff << n)), values);
4975 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4979 hi.
load (values+4, n-4);
4984 lo.
load (values, n);
4990 for (
int i = 0; i <
n; ++i)
4991 m_val[i] = values[i];
4999 #if OIIO_SIMD_AVX >= 2
5000 m_simd = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)values));
5001 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5010 #if OIIO_SIMD_AVX >= 2
5011 m_simd = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)values));
5012 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5022 #if OIIO_SIMD_AVX >= 2
5023 __m128i
bytes = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
5024 m_simd = _mm256_cvtepi8_epi32 (bytes);
5025 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5034 #if OIIO_SIMD_AVX >= 2
5035 __m128i
bytes = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
5036 m_simd = _mm256_cvtepu8_epi32 (bytes);
5037 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5050 int e,
int f,
int g,
int h) {
5051 load(a,b,c,d,e,f,g,h);
5069 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5079 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5080 m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask), (
const simd_t *)values);
5081 #elif OIIO_SIMD_AVX >= 2
5090 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5092 #elif OIIO_SIMD_AVX >= 2
5093 m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(mask));
5101 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5102 _mm256_mask_storeu_epi32 (values, __mmask8(mask),
m_simd);
5103 #elif OIIO_SIMD_AVX >= 2
5106 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
5112 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5113 _mm256_mask_storeu_epi32 (values, __mmask8(mask.
bitmask()),
m_simd);
5114 #elif OIIO_SIMD_AVX >= 2
5115 _mm256_maskstore_epi32 (values, _mm256_castps_si256(mask),
m_simd);
5117 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
5122 template <
int scale>
5126 #if OIIO_SIMD_AVX >= 2
5127 m_simd = _mm256_i32gather_epi32 (baseptr, vindex,
scale);
5137 #if OIIO_SIMD_AVX >= 2
5138 m_simd = _mm256_mask_i32gather_epi32 (
m_simd, baseptr, vindex, _mm256_cvtps_epi32(mask),
scale);
5148 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5149 _mm256_i32scatter_epi32 (baseptr, vindex,
m_simd,
scale);
5158 const vint_t& vindex)
const
5160 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5170 m_simd = _mm256_setzero_si256();
5179 return _mm256_setzero_si256();
5191 return vint8 (start+0*step, start+1*step, start+2*step, start+3*step,
5192 start+4*step, start+5*step, start+6*step, start+7*step);
5197 return vint8 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7);
5203 return _mm256_castsi256_si128 (
simd());
5211 return _mm256_extractf128_si256 (
simd(), 1);
5220 __m256i
r = _mm256_castsi128_si256 (lo);
5221 m_simd = _mm256_insertf128_si256 (r, hi, 1);
5232 #if OIIO_SIMD_AVX >= 2
5233 return _mm256_add_epi32 (a.
simd(), b.
simd());
5246 #if OIIO_SIMD_AVX >= 2
5247 return _mm256_sub_epi32 (_mm256_setzero_si256(), a);
5255 #if OIIO_SIMD_AVX >= 2
5256 return _mm256_sub_epi32 (a.
simd(), b.
simd());
5269 #if OIIO_SIMD_AVX >= 2
5270 return _mm256_mullo_epi32 (a.
simd(), b.
simd());
5305 #if OIIO_SIMD_AVX >= 2
5306 return _mm256_and_si256 (a.
simd(), b.
simd());
5315 #if OIIO_SIMD_AVX >= 2
5316 return _mm256_or_si256 (a.
simd(), b.
simd());
5325 #if OIIO_SIMD_AVX >= 2
5326 return _mm256_xor_si256 (a.
simd(), b.
simd());
5336 #if OIIO_SIMD_AVX >= 2
5345 #if OIIO_SIMD_AVX >= 2
5346 return _mm256_slli_epi32 (a, bits);
5348 return vint8 (a.
lo() << bits, a.
hi() << bits);
5356 return a = a << bits;
5360 #if OIIO_SIMD_AVX >= 2
5361 return _mm256_srai_epi32 (a, bits);
5363 return vint8 (a.
lo() >> bits, a.
hi() >> bits);
5370 return a = a >> bits;
5375 #if OIIO_SIMD_AVX >= 2
5376 return _mm256_srli_epi32 (a, bits);
5385 #if OIIO_SIMD_AVX >= 2
5386 return _mm256_castsi256_ps(_mm256_cmpeq_epi32 (a.
m_simd, b.
m_simd));
5403 #if OIIO_SIMD_AVX >= 2
5404 return _mm256_castsi256_ps(_mm256_cmpgt_epi32 (a, b));
5415 #if OIIO_SIMD_AVX >= 2
5428 return (a > b) | (a ==
b);
5434 return (b > a) | (a ==
b);
5440 for (
int i = 1; i < val.elements; ++i)
5441 cout <<
' ' << val[i];
5448 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5452 _mm256_mask_storeu_epi32 (values, __mmask8(~(0xff << n)),
m_simd);
5463 for (
int i = 0; i <
n; ++i)
5464 values[i] =
m_val[i];
5472 #if OIIO_AVX512VL_ENABLED
5473 _mm256_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xff),
m_simd);
5484 #if OIIO_AVX512VL_ENABLED
5485 _mm256_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xff),
m_simd);
5495 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
5497 #if OIIO_SIMD_AVX >= 2
5499 return _mm256_castps_si256 (_mm256_permutevar8x32_ps (_mm256_castsi256_ps(a.
simd()), index.
simd()));
5501 return vint8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
5506 return shuffle<i,i,i,i,i,i,i,i>(
a);
5512 #if OIIO_SIMD_AVX && !_WIN32
5513 return _mm256_extract_epi32(v.
simd(), i);
5522 #if OIIO_SIMD_AVX && !_WIN32
5523 return _mm256_insert_epi32 (a.
simd(),
val, i);
5545 return _mm256_castps_si256 (x.
simd());
5547 return *(
vint8 *)&x;
5553 #if OIIO_SIMD_AVX >= 2
5555 vint8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_epi32(v.
simd(), _mm256_setzero_si256());
5556 vint8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_epi32(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_si256());
5558 vint8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
5559 vint8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
5560 return shuffle<0>(final_sum);
5563 return vint8(hadd4, hadd4);
5580 #if OIIO_SSE_AVX >= 2
5581 vint8 ab = v & shuffle<1,1,3,3,5,5,7,7>(
v);
5582 vint8 abcd = ab & shuffle<2,2,2,2,6,6,6,6>(ab);
5583 vint8 abcdefgh = abcd & shuffle<4>(abcdefgh);
5584 return extract<0> (abcdefgh);
5593 #if OIIO_SSE_AVX >= 2
5594 vint8 ab = v | shuffle<1,1,3,3,5,5,7,7>(
v);
5595 vint8 abcd = ab | shuffle<2,2,2,2,6,6,6,6>(ab);
5596 vint8 abcdefgh = abcd | shuffle<4>(abcdefgh);
5597 return extract<0> (abcdefgh);
5607 return _mm256_castps_si256 (_mm256_blendv_ps (_mm256_castsi256_ps(a.
simd()),
5608 _mm256_castsi256_ps(b.
simd()), mask));
5622 return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.
simd()), mask));
5636 return _mm256_castps_si256 (_mm256_andnot_ps (mask.
simd(), _mm256_castsi256_ps(a.
simd())));
5646 return blend (b, a, mask);
5651 #if OIIO_SIMD_AVX >= 2
5652 return _mm256_abs_epi32(a.
simd());
5653 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5662 #if OIIO_SIMD_AVX >= 2
5663 return _mm256_min_epi32 (a, b);
5671 #if OIIO_SIMD_AVX >= 2
5672 return _mm256_max_epi32 (a, b);
5680 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5685 return (x<<s) |
srl(x,32-s);
5687 return (x<<s) |
srl(x,32-s);
5698 #if OIIO_SIMD_AVX >= 2
5699 return _mm256_andnot_si256 (a.
simd(), b.
simd());
5700 #elif OIIO_SIMD_AVX >= 1
5701 return _mm256_castps_si256 (_mm256_andnot_ps (_mm256_castsi256_ps(a.
simd()), _mm256_castsi256_ps(b.
simd())));
5752 #if OIIO_SIMD_AVX >= 512
5753 m_simd = _mm512_set1_epi32 (a);
5762 int v4,
int v5,
int v6,
int v7,
5763 int v8,
int v9,
int v10,
int v11,
5764 int v12,
int v13,
int v14,
int v15) {
5765 #if OIIO_SIMD_AVX >= 512
5766 m_simd = _mm512_setr_epi32 (v0, v1, v2, v3, v4, v5, v6, v7,
5767 v8, v9, v10, v11, v12, v13, v14, v15);
5790 #if OIIO_SIMD_AVX >= 512
5801 #if OIIO_SIMD_AVX >= 512
5802 m_simd = _mm512_maskz_loadu_epi32 (__mmask16(~(0xffff << n)), values);
5816 #if OIIO_SIMD_AVX >= 512
5817 m_simd = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)values));
5825 #if OIIO_SIMD_AVX >= 512
5826 m_simd = _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)values));
5835 #if OIIO_SIMD_AVX >= 512
5836 m_simd = _mm512_cvtepi8_epi32(_mm_loadu_si128((__m128i*)values));
5844 #if OIIO_SIMD_AVX >= 512
5845 m_simd = _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)values));
5856 int v4,
int v5,
int v6,
int v7,
5857 int v8,
int v9,
int v10,
int v11,
5858 int v12,
int v13,
int v14,
int v15) {
5859 load (v0, v1, v2, v3, v4, v5, v6, v7,
5860 v8, v9, v10, v11, v12, v13, v14, v15);
5873 #if OIIO_SIMD_AVX >= 512
5874 m_simd = _mm512_maskz_loadu_epi32 (mask, (
const simd_t *)values);
5883 #if OIIO_SIMD_AVX >= 512
5892 template <
int scale>
5895 #if OIIO_SIMD_AVX >= 512
5896 m_simd = _mm512_i32gather_epi32 (vindex, baseptr,
scale);
5906 #if OIIO_SIMD_AVX >= 512
5917 #if OIIO_SIMD_AVX >= 512
5918 _mm512_i32scatter_epi32 (baseptr, vindex,
m_simd,
scale);
5928 const vint_t& vindex)
const {
5929 #if OIIO_SIMD_AVX >= 512
5930 _mm512_mask_i32scatter_epi32 (baseptr, mask, vindex,
m_simd,
scale);
5939 #if OIIO_SIMD_AVX >= 512
5952 #if OIIO_SIMD_AVX >= 512
5953 m_simd = _mm512_setzero_si512();
5961 #if OIIO_SIMD_AVX >= 512
5962 return _mm512_setzero_epi32();
5974 return vint16 (start+0*step, start+1*step, start+2*step, start+3*step,
5975 start+4*step, start+5*step, start+6*step, start+7*step,
5976 start+8*step, start+9*step, start+10*step, start+11*step,
5977 start+12*step, start+13*step, start+14*step, start+15*step);
5982 return vint16 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
5983 1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15);
5988 #if OIIO_SIMD_AVX >= 512
5989 return _mm512_castsi512_si256 (
simd());
5996 #if OIIO_SIMD_AVX >= 512
5997 return _mm512_extracti64x4_epi64 (
simd(), 1);
6005 #if OIIO_SIMD_AVX >= 512
6006 __m512i
r = _mm512_castsi256_si512 (lo);
6007 m_simd = _mm512_inserti32x8 (r, hi, 1);
6016 #if OIIO_SIMD_AVX >= 512
6017 m_simd = _mm512_broadcast_i32x4(a);
6029 #if OIIO_SIMD_AVX >= 512
6030 return _mm512_add_epi32 (a.
simd(), b.
simd());
6043 #if OIIO_SIMD_AVX >= 512
6044 return _mm512_sub_epi32 (_mm512_setzero_si512(), a);
6052 #if OIIO_SIMD_AVX >= 512
6053 return _mm512_sub_epi32 (a.
simd(), b.
simd());
6066 #if OIIO_SIMD_AVX >= 512
6067 return _mm512_mullo_epi32 (a.
simd(), b.
simd());
6102 #if OIIO_SIMD_AVX >= 512
6103 return _mm512_and_si512 (a.
simd(), b.
simd());
6112 #if OIIO_SIMD_AVX >= 512
6113 return _mm512_or_si512 (a.
simd(), b.
simd());
6122 #if OIIO_SIMD_AVX >= 512
6123 return _mm512_xor_si512 (a.
simd(), b.
simd());
6133 #if OIIO_SIMD_AVX >= 512
6142 #if OIIO_SIMD_AVX >= 512
6143 return _mm512_sllv_epi32 (a,
vint16(
int(bits)));
6147 return vint16 (a.
lo() << bits, a.
hi() << bits);
6153 return a = a << bits;
6157 #if OIIO_SIMD_AVX >= 512
6158 return _mm512_srav_epi32 (a,
vint16(
int(bits)));
6161 return vint16 (a.
lo() >> bits, a.
hi() >> bits);
6166 return a = a >> bits;
6171 #if OIIO_SIMD_AVX >= 512
6172 return _mm512_srlv_epi32 (a,
vint16(
int(bits)));
6181 #if OIIO_SIMD_AVX >= 512
6182 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 0 );
6190 #if OIIO_SIMD_AVX >= 512
6191 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 4 );
6199 #if OIIO_SIMD_AVX >= 512
6200 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 6 );
6208 #if OIIO_SIMD_AVX >= 512
6209 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 1 );
6217 #if OIIO_SIMD_AVX >= 512
6218 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 5 );
6226 #if OIIO_SIMD_AVX >= 512
6227 return _mm512_cmp_epi32_mask (a.
simd(), b.
simd(), 2 );
6236 for (
int i = 1; i < val.elements; ++i)
6237 cout <<
' ' << val[i];
6245 #if 0 && OIIO_SIMD_AVX >= 512
6249 _mm512_mask_storeu_epi32 (values, __mmask16(~(0xffff << n)),
m_simd);
6262 #if OIIO_SIMD_AVX512
6263 _mm512_mask_cvtepi32_storeu_epi16 (values, __mmask16(0xff),
m_simd);
6264 #elif OIIO_SIMD_AVX >= 2
6274 #if OIIO_SIMD_AVX512
6275 _mm512_mask_cvtepi32_storeu_epi8 (values, __mmask16(0xff),
m_simd);
6276 #elif OIIO_SIMD_AVX >= 2
6287 template<
int i0,
int i1,
int i2,
int i3>
6289 #if OIIO_SIMD_AVX >= 512
6290 __m512
x = _mm512_castsi512_ps(a);
6291 return _mm512_castps_si512(_mm512_shuffle_f32x4(x,x,_MM_SHUFFLE(i3,
i2,
i1,i0)));
6300 return shuffle4<i,i,i,i> (
a);
6303 template<
int i0,
int i1,
int i2,
int i3>
6305 #if OIIO_SIMD_AVX >= 512
6306 __m512
x = _mm512_castsi512_ps(a);
6307 return _mm512_castps_si512(_mm512_permute_ps(x,_MM_SHUFFLE(i3,
i2,
i1,i0)));
6311 return vint16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
6312 shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
6317 return shuffle<i,i,i,i> (
a);
6336 #if OIIO_SIMD_AVX >= 512
6337 return _mm_cvtsi128_si32(_mm512_castsi512_si128(
m_simd));
6354 #if OIIO_SIMD_AVX >= 512
6355 return _mm512_maskz_set1_epi32 (x, -1);
6363 #if OIIO_SIMD_AVX >= 512
6366 vint16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(
v);
6367 vint16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
6369 vint16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(
w);
6370 return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
6373 return vint16 (sum, sum);
6379 #if OIIO_SIMD_AVX >= 512
6388 #if OIIO_SIMD_AVX >= 512
6391 vint16 AB_AB_CD_CD = v & shuffle4<1,0,3,2>(
v);
6392 vint16 w = AB_AB_CD_CD & shuffle4<2,3,0,1>(AB_AB_CD_CD);
6394 vint16 ab_ab_cd_cd = w & shuffle<1,0,3,2>(
w);
6395 vint16 r = ab_ab_cd_cd & shuffle<2,3,0,1>(ab_ab_cd_cd);
6404 #if OIIO_SIMD_AVX >= 512
6407 vint16 AB_AB_CD_CD = v | shuffle4<1,0,3,2>(
v);
6408 vint16 w = AB_AB_CD_CD | shuffle4<2,3,0,1>(AB_AB_CD_CD);
6410 vint16 ab_ab_cd_cd = w | shuffle<1,0,3,2>(
w);
6411 vint16 r = ab_ab_cd_cd | shuffle<2,3,0,1>(ab_ab_cd_cd);
6421 #if OIIO_SIMD_AVX >= 512
6422 return _mm512_mask_blend_epi32 (mask, a, b);
6431 #if OIIO_SIMD_AVX >= 512
6432 return _mm512_maskz_mov_epi32 (mask, a);
6441 #if OIIO_SIMD_AVX >= 512
6442 return _mm512_maskz_mov_epi32 (!mask, a);
6450 return blend (b, a, mask);
6455 #if OIIO_SIMD_AVX >= 512
6456 return _mm512_abs_epi32(a.
simd());
6464 #if OIIO_SIMD_AVX >= 512
6465 return _mm512_min_epi32 (a, b);
6473 #if OIIO_SIMD_AVX >= 512
6474 return _mm512_max_epi32 (a, b);
6482 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6487 return (x<<s) |
srl(x,32-s);
6489 return (x<<s) |
srl(x,32-s);
6500 #if OIIO_SIMD_AVX >= 512
6501 return _mm512_andnot_epi32 (a.
simd(), b.
simd());
6529 #elif OIIO_SIMD_NEON
6539 return _mm_setzero_ps();
6550 return vfloat4 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step, start+3.0
f*step);
6556 m_simd = _mm_setzero_ps();
6563 load ((
const float *)&v);
6568 load (v[0], v[1], v[2], 0.0
f);
6585 m_simd = _mm_set1_ps (val);
6586 #elif OIIO_SIMD_NEON
6587 m_simd = vdupq_n_f32 (val);
6595 m_simd = _mm_set_ps (d, c, b, a);
6596 #elif OIIO_SIMD_NEON
6598 m_simd = vld1q_f32 (values);
6610 m_simd = _mm_loadu_ps (values);
6611 #elif OIIO_SIMD_NEON
6612 m_simd = vld1q_f32 (values);
6621 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6622 m_simd = _mm_maskz_loadu_ps (__mmask8(~(0xf << n)), values);
6626 m_simd = _mm_load_ss (values);
6630 m_simd = _mm_castpd_ps (_mm_load_sd ((
const double*)values));
6633 m_simd = _mm_setr_ps (values[0], values[1], values[2], 0.0
f);
6643 m_simd = _mm_loadu_ps (values);
6649 #elif OIIO_SIMD_NEON
6651 case 1:
m_simd = vdupq_n_f32(0);
m_simd[0] = values[0];
break;
6652 case 2:
load (values[0], values[1], 0.0
f, 0.0
f);
break;
6653 case 3:
load (values[0], values[1], values[2], 0.0
f);
break;
6654 case 4:
m_simd = vld1q_f32 (values);
break;
6658 for (
int i = 0; i <
n; ++i)
6659 m_val[i] = values[i];
6667 #if OIIO_SIMD_SSE >= 2
6678 #if OIIO_SIMD_SSE >= 2
6687 #if OIIO_SIMD_SSE >= 2
6696 #if OIIO_SIMD_SSE >= 2
6703 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6705 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6707 __m128i a = _mm_castpd_si128 (_mm_load_sd ((
const double *)values));
6708 m_simd = _mm_cvtph_ps (a);
6709 #elif OIIO_SIMD_SSE >= 2
6712 vint4 h ((
const unsigned short *)values);
6713 # define CONSTI(name) *(const __m128i *)&name
6714 # define CONSTF(name) *(const __m128 *)&name
6719 __m128i mnosign = CONSTI(mask_nosign);
6720 __m128i expmant = _mm_and_si128(mnosign,
h);
6721 __m128i justsign = _mm_xor_si128(
h, expmant);
6722 __m128i expmant2 = expmant;
6723 __m128i shifted = _mm_slli_epi32(expmant, 13);
6724 __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), *(
const __m128 *)&magic);
6725 __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONSTI(was_infnan));
6726 __m128i
sign = _mm_slli_epi32(justsign, 16);
6727 __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
6728 __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
6729 __m128
final = _mm_or_ps(scaled, sign_inf);
6745 _mm_storeu_ps (values,
m_simd);
6746 #elif OIIO_SIMD_NEON
6747 vst1q_f32 (values,
m_simd);
6755 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6759 _mm_mask_storeu_ps (values, __mmask8(~(0xf << n)),
m_simd);
6763 _mm_store_ss (values,
m_simd);
6767 _mm_store_sd ((
double*)values, _mm_castps_pd(
m_simd));
6770 values[0] =
m_val[0];
6771 values[1] =
m_val[1];
6772 values[2] =
m_val[2];
6785 #elif OIIO_SIMD_NEON
6788 vst1q_lane_f32 (values,
m_simd, 0);
6791 vst1q_lane_f32 (values++,
m_simd, 0);
6792 vst1q_lane_f32 (values,
m_simd, 1);
6795 vst1q_lane_f32 (values++,
m_simd, 0);
6796 vst1q_lane_f32 (values++,
m_simd, 1);
6797 vst1q_lane_f32 (values,
m_simd, 2);
6800 vst1q_f32 (values,
m_simd);
break;
6805 for (
int i = 0; i <
n; ++i)
6806 values[i] =
m_val[i];
6810 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
6812 #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
6813 __m128i
h = _mm_cvtps_ph (
m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
6814 _mm_store_sd ((
double *)values, _mm_castsi128_pd(h));
6823 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6824 m_simd = _mm_maskz_loadu_ps (__mmask8(mask), (
const simd_t *)values);
6834 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6837 m_simd = _mm_maskload_ps (values, _mm_castps_si128(mask));
6845 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6846 _mm_mask_storeu_ps (values, __mmask8(mask),
m_simd);
6850 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
6856 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6857 _mm_mask_storeu_ps (values, __mmask8(mask.
bitmask()),
m_simd);
6859 _mm_maskstore_ps (values, _mm_castps_si128(mask.
simd()),
m_simd);
6861 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
6866 template <
int scale>
6870 #if OIIO_SIMD_AVX >= 2
6871 m_simd = _mm_i32gather_ps (baseptr, vindex,
scale);
6881 #if OIIO_SIMD_AVX >= 2
6892 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6894 _mm_i32scatter_ps (baseptr, vindex,
m_simd,
scale);
6903 const vint_t& vindex)
const
6905 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6917 #elif OIIO_SIMD_NEON
6927 #elif OIIO_SIMD_NEON
6937 return _mm_sub_ps (_mm_setzero_ps(),
m_simd);
6938 #elif OIIO_SIMD_NEON
6948 #elif OIIO_SIMD_NEON
6958 #elif OIIO_SIMD_NEON
6968 return _mm_mul_ps (a.
m_simd, _mm_set1_ps(b));
6969 #elif OIIO_SIMD_NEON
6970 return vmulq_n_f32 (a.
m_simd, b);
6983 #elif OIIO_SIMD_NEON
6993 #elif OIIO_SIMD_NEON
7004 #elif OIIO_SIMD_NEON
7015 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7025 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7036 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7047 #elif OIIO_SIMD_NEON
7057 #elif OIIO_SIMD_NEON
7068 #elif OIIO_SIMD_NEON
7078 #elif OIIO_SIMD_NEON
7088 #elif OIIO_SIMD_NEON
7098 #elif OIIO_SIMD_NEON
7109 return vfloat4 (a[0], a[1], b[0], b[1]);
7117 return vfloat4 (a[0], b[0], a[1], b[1]);
7122 return insert<3>(*
this, 0.0f);
7126 return insert<3>(*
this, 1.0f);
7131 for (
int i = 1; i < val.elements; ++i)
7132 cout <<
' ' << val[i];
7148 template<
int i0,
int i1,
int i2,
int i3>
7151 return shuffle_sse<i0,i1,i2,i3> (__m128(a));
7161 float32x2_t
t = vget_low_f32(a.simd());
return vdupq_lane_f32(t,0);
7164 float32x2_t t = vget_low_f32(a.simd());
return vdupq_lane_f32(t,1);
7167 float32x2_t t = vget_high_f32(a.simd());
return vdupq_lane_f32(t,0);
7170 float32x2_t t = vget_high_f32(a.simd());
return vdupq_lane_f32(t,1);
7181 return _mm_cvtss_f32(shuffle_sse<i,i,i,i>(a.
simd()));
7189 return _mm_cvtss_f32(a.simd());
7197 #if OIIO_SIMD_SSE >= 4
7198 return _mm_insert_ps (a, _mm_set_ss(val), i<<4);
7209 return _mm_move_ss (a.simd(), _mm_set_ss(
val));
7227 return _mm_castps_si128 (x.
simd());
7229 return *(
vint4 *)&x;
7236 return _mm_castsi128_ps (x.
simd());
7250 #if OIIO_SIMD_SSE >= 3
7263 vfloat4 ab_ab_cd_cd = shuffle<1,0,3,2>(
v) + v;
7265 vfloat4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
7267 vfloat4 abcd = ab_ab_cd_cd + cd_cd_ab_ab;
7270 return vfloat4 (v[0] + v[1] + v[2] + v[3]);
7278 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7279 return vaddvq_f32(v);
7281 return v[0] + v[1] + v[2] + v[3];
7286 #if OIIO_SIMD_SSE >= 4
7287 return _mm_dp_ps (a.
simd(), b.
simd(), 0xff);
7288 #elif OIIO_SIMD_NEON
7289 float32x4_t ab = vmulq_f32(a, b);
7290 float32x4_t sum1 = vaddq_f32(ab, vrev64q_f32(ab));
7291 return vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
7298 #if OIIO_SIMD_SSE >= 4
7299 return _mm_cvtss_f32 (_mm_dp_ps (a.
simd(), b.
simd(), 0xff));
7306 #if OIIO_SIMD_SSE >= 4
7307 return _mm_dp_ps (a.
simd(), b.
simd(), 0x7f);
7314 #if OIIO_SIMD_SSE >= 4
7315 return _mm_cvtss_f32 (_mm_dp_ps (a.
simd(), b.
simd(), 0x77));
7324 #if OIIO_SIMD_SSE >= 4
7326 return _mm_blendv_ps (a.
simd(), b.
simd(), mask.
simd());
7329 return _mm_or_ps (_mm_and_ps(mask.
simd(), b.
simd()),
7330 _mm_andnot_ps(mask.
simd(), a.
simd()));
7331 #elif OIIO_SIMD_NEON
7334 return vfloat4 (mask[0] ? b[0] : a[0],
7335 mask[1] ? b[1] : a[1],
7336 mask[2] ? b[2] : a[2],
7337 mask[3] ? b[3] : a[3]);
7345 return _mm_and_ps(mask.
simd(), a.
simd());
7347 return vfloat4 (mask[0] ? a[0] : 0.0
f,
7348 mask[1] ? a[1] : 0.0f,
7349 mask[2] ? a[2] : 0.0f,
7350 mask[3] ? a[3] : 0.0f);
7358 return _mm_andnot_ps(mask.
simd(), a.
simd());
7360 return vfloat4 (mask[0] ? 0.0
f : a[0],
7361 mask[1] ? 0.0
f : a[1],
7362 mask[2] ? 0.0
f : a[2],
7363 mask[3] ? 0.0
f : a[3]);
7372 return vfloat4 (b[0] == 0.0
f ? 0.0
f : a[0] / b[0],
7373 b[1] == 0.0
f ? 0.0
f : a[1] / b[1],
7374 b[2] == 0.0
f ? 0.0
f : a[2] / b[2],
7375 b[3] == 0.0
f ? 0.0
f : a[3] / b[3]);
7394 return blend (b, a, mask);
7402 return _mm_and_ps (a.
simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
7403 #elif OIIO_SIMD_NEON
7404 return vabsq_f32(a.
simd());
7420 #if OIIO_SIMD_SSE >= 4
7421 return _mm_ceil_ps (a);
7429 #if OIIO_SIMD_SSE >= 4
7430 return _mm_floor_ps (a);
7438 #if OIIO_SIMD_SSE >= 4
7439 return _mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
7448 #if OIIO_SIMD_SSE >= 4
7464 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
7468 #elif OIIO_SIMD_AVX512
7470 vfloat16 r = _mm512_rcp14_ps(_mm512_castps128_ps512(a));
7471 return _mm512_castps512_ps128(r);
7484 return _mm_sqrt_ps (a.
simd());
7494 return _mm_div_ps (_mm_set1_ps(1.0
f), _mm_sqrt_ps (a.
simd()));
7503 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
7505 return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
7506 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7508 return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
7510 return _mm_rsqrt_ps (a.
simd());
7520 return _mm_min_ps (a, b);
7521 #elif OIIO_SIMD_NEON
7522 return vminq_f32(a, b);
7531 return _mm_max_ps (a, b);
7532 #elif OIIO_SIMD_NEON
7533 return vmaxq_f32(a, b);
7542 return _mm_andnot_ps (a.
simd(), b.
simd());
7544 const int *ai = (
const int *)&a;
7545 const int *bi = (
const int *)&b;
7557 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7559 return _mm_fmadd_ps (a, b, c);
7560 #elif OIIO_SIMD_NEON
7562 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7577 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7579 return _mm_fmsub_ps (a, b, c);
7580 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7596 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7598 return _mm_fnmadd_ps (a, b, c);
7599 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7615 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7617 return _mm_fnmsub_ps (a, b, c);
7618 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7625 return -(a *
b) - c;
7632 template<
typename T>
7640 typedef typename T::vint_t int_t;
7642 const float exp_hi (88.3762626647949
f);
7643 const float exp_lo (-88.3762626647949
f);
7644 const float cephes_LOG2EF (1.44269504088896341
f);
7645 const float cephes_exp_C1 (0.693359375
f);
7646 const float cephes_exp_C2 (-2.12194440e-4
f);
7647 const float cephes_exp_p0 (1.9875691500E-4
f);
7648 const float cephes_exp_p1 (1.3981999507E-3
f);
7649 const float cephes_exp_p2 (8.3334519073E-3
f);
7650 const float cephes_exp_p3 (4.1665795894E-2
f);
7651 const float cephes_exp_p4 (1.6666665459E-1
f);
7652 const float cephes_exp_p5 (5.0000001201E-1
f);
7655 x =
min (x,
T(exp_hi));
7656 x =
max (x,
T(exp_lo));
7657 T fx =
madd (x,
T(cephes_LOG2EF),
T(0.5
f));
7658 int_t emm0 = int_t(fx);
7662 tmp = fx * cephes_exp_C1;
7663 T z = fx * cephes_exp_C2;
7667 T y = cephes_exp_p0;
7668 y =
madd (y, x, cephes_exp_p1);
7669 y =
madd (y, x, cephes_exp_p2);
7670 y =
madd (y, x, cephes_exp_p3);
7671 y =
madd (y, x, cephes_exp_p4);
7672 y =
madd (y, x, cephes_exp_p5);
7675 emm0 = (int_t(fx) + int_t(0x7f)) << 23;
7687 template<
typename T>
7695 typedef typename T::vint_t int_t;
7696 typedef typename T::vbool_t bool_t;
7701 bool_t invalid_mask = (x <=
zero);
7702 const int min_norm_pos ((
int)0x00800000);
7703 const int inv_mant_mask ((
int)~0x7f800000);
7709 emm0 = emm0 - int_t(0x7f);
7713 const float cephes_SQRTHF (0.707106781186547524
f);
7714 bool_t
mask = (x <
T(cephes_SQRTHF));
7720 const float cephes_log_p0 (7.0376836292E-2
f);
7721 const float cephes_log_p1 (- 1.1514610310E-1
f);
7722 const float cephes_log_p2 (1.1676998740E-1
f);
7723 const float cephes_log_p3 (- 1.2420140846E-1
f);
7724 const float cephes_log_p4 (+ 1.4249322787E-1
f);
7725 const float cephes_log_p5 (- 1.6668057665E-1
f);
7726 const float cephes_log_p6 (+ 2.0000714765E-1
f);
7727 const float cephes_log_p7 (- 2.4999993993E-1
f);
7728 const float cephes_log_p8 (+ 3.3333331174E-1
f);
7729 const float cephes_log_q1 (-2.12194440e-4
f);
7730 const float cephes_log_q2 (0.693359375
f);
7731 T y = cephes_log_p0;
7732 y =
madd (y, x,
T(cephes_log_p1));
7733 y =
madd (y, x,
T(cephes_log_p2));
7734 y =
madd (y, x,
T(cephes_log_p3));
7735 y =
madd (y, x,
T(cephes_log_p4));
7736 y =
madd (y, x,
T(cephes_log_p5));
7737 y =
madd (y, x,
T(cephes_log_p6));
7738 y =
madd (y, x,
T(cephes_log_p7));
7739 y =
madd (y, x,
T(cephes_log_p8));
7742 y =
madd(e,
T(cephes_log_q1), y);
7745 x =
madd (e,
T(cephes_log_q2), x);
7760 vfloat4 A (a[0], b[0], c[0], d[0]);
7761 vfloat4 B (a[1], b[1], c[1], d[1]);
7762 vfloat4 C (a[2], b[2], c[2], d[2]);
7763 vfloat4 D (a[3], b[3], c[3], d[3]);
7764 a =
A; b =
B; c = C; d = D;
7774 auto l02 = _mm_unpacklo_ps (a, c);
7775 auto h02 = _mm_unpackhi_ps (a, c);
7776 auto l13 = _mm_unpacklo_ps (b, d);
7777 auto h13 = _mm_unpackhi_ps (b, d);
7778 r0 =
vfloat4(_mm_unpacklo_ps (l02, l13));
7779 r1 =
vfloat4(_mm_unpackhi_ps (l02, l13));
7780 r2 =
vfloat4(_mm_unpacklo_ps (h02, h13));
7781 r3 =
vfloat4(_mm_unpackhi_ps (h02, h13));
7783 r0.
load (a[0], b[0], c[0], d[0]);
7784 r1.
load (a[1], b[1], c[1], d[1]);
7785 r2.
load (a[2], b[2], c[2], d[2]);
7786 r3.
load (a[3], b[3], c[3], d[3]);
7794 __m128
A = _mm_castsi128_ps (a);
7795 __m128
B = _mm_castsi128_ps (b);
7796 __m128 C = _mm_castsi128_ps (c);
7797 __m128 D = _mm_castsi128_ps (d);
7798 _MM_TRANSPOSE4_PS (A, B, C, D);
7799 a = _mm_castps_si128 (A);
7800 b = _mm_castps_si128 (B);
7801 c = _mm_castps_si128 (C);
7802 d = _mm_castps_si128 (D);
7804 vint4 A (a[0], b[0], c[0], d[0]);
7805 vint4 B (a[1], b[1], c[1], d[1]);
7806 vint4 C (a[2], b[2], c[2], d[2]);
7807 vint4 D (a[3], b[3], c[3], d[3]);
7808 a =
A; b =
B; c = C; d = D;
7817 __m128
A = _mm_castsi128_ps (a);
7818 __m128
B = _mm_castsi128_ps (b);
7819 __m128 C = _mm_castsi128_ps (c);
7820 __m128 D = _mm_castsi128_ps (d);
7821 _MM_TRANSPOSE4_PS (A, B, C, D);
7822 r0 = _mm_castps_si128 (A);
7823 r1 = _mm_castps_si128 (B);
7824 r2 = _mm_castps_si128 (C);
7825 r3 = _mm_castps_si128 (D);
7827 r0.
load (a[0], b[0], c[0], d[0]);
7828 r1.
load (a[1], b[1], c[1], d[1]);
7829 r2.
load (a[2], b[2], c[2], d[2]);
7830 r3.
load (a[3], b[3], c[3], d[3]);
7839 vfloat4 l02 = _mm_unpacklo_ps (a, c);
7840 vfloat4 l13 = _mm_unpacklo_ps (b, d);
7841 return _mm_unpacklo_ps (l02, l13);
7843 return vfloat4 (a[0], b[0], c[0], d[0]);
7852 vint4 l02 = _mm_unpacklo_epi32 (a, c);
7853 vint4 l13 = _mm_unpacklo_epi32 (b, d);
7854 return _mm_unpacklo_epi32 (l02, l13);
7856 return vint4 (a[0], b[0], c[0], d[0]);
7866 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
7874 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
7887 return vfloat3 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step);
7900 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
7904 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
7908 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
7912 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
7915 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
7917 vfloat4::load (
float(values[0]),
float(values[1]),
float(values[2]));
7929 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
7936 store ((
float *)&vec);
7944 *
this = *
this +
a;
return *
this;
7956 *
this = *
this -
a;
return *
this;
7972 *
this = *
this *
a;
return *
this;
7976 *
this = *
this *
a;
return *
this;
7984 *
this = *
this /
a;
return *
this;
7988 *
this = *
this /
a;
return *
this;
7994 for (
int i = 1; i < val.elements; ++i)
7995 cout <<
' ' << val[i];
8004 return vfloat3(_mm_and_ps (a.
simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
8005 #elif OIIO_SIMD_NEON
8022 #if OIIO_SIMD_SSE >= 4
8023 return vfloat3(_mm_ceil_ps (a));
8031 #if OIIO_SIMD_SSE >= 4
8032 return vfloat3(_mm_floor_ps (a));
8040 #if OIIO_SIMD_SSE >= 4
8041 return vfloat3(_mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)));
8052 return vfloat3 (v[0] + v[1] + v[2]);
8058 #if OIIO_SIMD_SSE >= 4
8067 #if OIIO_SIMD_SSE >= 4
8068 return _mm_cvtss_f32 (_mm_dp_ps (a.
simd(), b.
simd(), 0x77));
8072 return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
8078 #if OIIO_SIMD_SSE >= 4
8088 return dot(*
this, *
this);
8094 return sqrtf(
dot(*
this, *
this));
8104 float len2 =
dot (*
this, *
this);
8105 return len2 > 0.0f ? (*this) / sqrtf(len2) :
vfloat3::Zero();
8115 return vfloat3 ((*
this) * invlen);
8117 float len2 =
dot (*
this, *
this);
8118 return len2 > 0.0f ? (*this) / sqrtf(len2) :
vfloat3::Zero();
8146 T.m_row[0], T.m_row[1], T.m_row[2], T.m_row[3]);
8148 T.m_mat = m_mat.transposed();
8155 vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8156 shuffle<2>(V) * m_row[2] + m_row[3];
8157 R = R / shuffle<3>(R);
8168 vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8169 shuffle<2>(V) * m_row[2];
8181 vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8182 shuffle<2>(V) * T[2];
8186 m_mat.transposed().multDirMatrix (*(
Imath::V3f *)&V, R);
8194 return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8195 shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8203 #if OIIO_SIMD_SSE >= 3
8208 vfloat4 s01 = _mm_hadd_ps(m0v, m1v);
8210 vfloat4 s23 = _mm_hadd_ps(m2v, m3v);
8226 vbool4 b0 = (m_row[0] == m[0]);
8227 vbool4 b1 = (m_row[1] == m[1]);
8228 vbool4 b2 = (m_row[2] == m[2]);
8229 vbool4 b3 = (m_row[3] == m[3]);
8232 return memcmp(
this, &m, 16*
sizeof(
float)) == 0;
8237 return memcmp(
this, &m, 16*
sizeof(
float)) == 0;
8246 vbool4 b0 = (m_row[0] != m[0]);
8247 vbool4 b1 = (m_row[1] != m[1]);
8248 vbool4 b2 = (m_row[2] != m[2]);
8249 vbool4 b3 = (m_row[3] != m[3]);
8252 return memcmp(
this, &m, 16*
sizeof(
float)) != 0;
8257 return memcmp(
this, &m, 16*
sizeof(
float)) != 0;
8268 vfloat4 minor0, minor1, minor2, minor3;
8269 vfloat4 row0, row1, row2, row3;
8271 const float *
src = (
const float *)
this;
8273 tmp1 =
vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
8274 row1 =
vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
8275 row0 =
vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
8276 row1 =
vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
8277 tmp1 =
vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
8278 row3 =
vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
8279 row2 =
vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
8280 row3 =
vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
8283 tmp1 = shuffle<1,0,3,2>(tmp1);
8284 minor0 = row1 * tmp1;
8285 minor1 = row0 * tmp1;
8286 tmp1 = shuffle<2,3,0,1>(tmp1);
8287 minor0 = (row1 * tmp1) - minor0;
8288 minor1 = (row0 * tmp1) - minor1;
8289 minor1 = shuffle<2,3,0,1>(minor1);
8292 tmp1 = shuffle<1,0,3,2>(tmp1);
8293 minor0 = (row3 * tmp1) + minor0;
8294 minor3 = row0 * tmp1;
8295 tmp1 = shuffle<2,3,0,1>(tmp1);
8296 minor0 = minor0 - (row3 * tmp1);
8297 minor3 = (row0 * tmp1) - minor3;
8298 minor3 = shuffle<2,3,0,1>(minor3);
8300 tmp1 = shuffle<2,3,0,1>(row1) * row3;
8301 tmp1 = shuffle<1,0,3,2>(tmp1);
8302 row2 = shuffle<2,3,0,1>(row2);
8303 minor0 = (row2 * tmp1) + minor0;
8304 minor2 = row0 * tmp1;
8305 tmp1 = shuffle<2,3,0,1>(tmp1);
8306 minor0 = minor0 - (row2 * tmp1);
8307 minor2 = (row0 * tmp1) - minor2;
8308 minor2 = shuffle<2,3,0,1>(minor2);
8311 tmp1 = shuffle<1,0,3,2>(tmp1);
8312 minor2 = (row3 * tmp1) + minor2;
8313 minor3 = (row2 * tmp1) - minor3;
8314 tmp1 = shuffle<2,3,0,1>(tmp1);
8315 minor2 = (row3 * tmp1) - minor2;
8316 minor3 = minor3 - (row2 * tmp1);
8319 tmp1 = shuffle<1,0,3,2>(tmp1);
8320 minor1 = minor1 - (row2 * tmp1);
8321 minor2 = (row1 * tmp1) + minor2;
8322 tmp1 = shuffle<2,3,0,1>(tmp1);
8323 minor1 = (row2 * tmp1) + minor1;
8324 minor2 = minor2 - (row1 * tmp1);
8327 tmp1 = shuffle<1,0,3,2>(tmp1);
8328 minor1 = (row3 * tmp1) + minor1;
8329 minor3 = minor3 - (row1 * tmp1);
8330 tmp1 = shuffle<2,3,0,1>(tmp1);
8331 minor1 = minor1 - (row3 * tmp1);
8332 minor3 = (row1 * tmp1) + minor3;
8334 det = row0 * minor0;
8335 det = shuffle<2,3,0,1>(det) + det;
8336 det =
vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
8337 tmp1 =
vfloat4(_mm_rcp_ss(det));
8338 det =
vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
8339 det = shuffle<0>(det);
8340 return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
8348 const float *m = (
const float *)&M;
8350 for (
int i = 1; i < 16; ++i)
8351 cout <<
' ' << m[i];
8367 M.multVecMatrix (*(
const Imath::V3f *)&V, R);
8383 M.multDirMatrix (*(
const Imath::V3f *)&V, R);
8420 for (
int i = 1; i < val.elements; ++i)
8421 cout <<
' ' << val[i];
8428 return _mm256_castps256_ps128 (
simd());
8436 return _mm256_extractf128_ps (
simd(), 1);
8445 __m256
r = _mm256_castps128_ps256 (lo);
8446 m_simd = _mm256_insertf128_ps (r, hi, 1);
8458 m_simd = _mm256_cvtepi32_ps (ival);
8467 return _mm256_setzero_ps();
8478 return vfloat8 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step, start+3.0
f*step,
8479 start+4.0
f*step, start+5.0
f*step, start+6.0
f*step, start+7.0
f*step);
8485 m_simd = _mm256_setzero_ps();
8495 m_simd = _mm256_set1_ps (val);
8496 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8505 float e,
float f,
float g,
float h) {
8507 m_simd = _mm256_set_ps (h, g, f, e, d, c, b, a);
8508 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8526 m_simd = _mm256_loadu_ps (values);
8527 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8538 #if 0 && OIIO_AVX512VL_ENABLED
8542 m_simd = _mm256_maskz_loadu_ps ((~(0xff << n)), values);
8543 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8547 hi.
load (values+4, n-4);
8552 lo.
load (values, n);
8558 for (
int i = 0; i <
n; ++i)
8559 m_val[i] = values[i];
8570 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8583 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8595 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8607 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8615 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8617 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8619 vint4 a ((
const int *)values);
8620 m_simd = _mm256_cvtph_ps (a);
8621 #elif OIIO_SIMD_SSE >= 2
8636 _mm256_storeu_ps (values,
m_simd);
8637 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8648 #if 0 && OIIO_AVX512VL_ENABLED
8652 _mm256_mask_storeu_ps (values, __mmask8(~(0xff << n)),
m_simd);
8653 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8656 }
else if (n <= 8) {
8661 for (
int i = 0; i <
n; ++i)
8662 values[i] =
m_val[i];
8666 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
8668 #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
8669 __m128i h = _mm256_cvtps_ph (
m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
8670 _mm_storeu_si128 ((__m128i *)values, h);
8671 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8682 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8683 m_simd = _mm256_maskz_loadu_ps (__mmask8(mask), (
const simd_t *)values);
8693 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8696 m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(mask));
8704 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8705 _mm256_mask_storeu_ps (values, __mmask8(mask),
m_simd);
8709 SIMD_DO (
if ((mask>>i) & 1) values[i] = (*
this)[i]);
8715 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8716 _mm256_mask_storeu_ps (values, __mmask8(mask.
bitmask()),
m_simd);
8718 _mm256_maskstore_ps (values, _mm256_castps_si256(mask.
simd()),
m_simd);
8720 SIMD_DO (
if (mask[i]) values[i] = (*
this)[i]);
8725 template <
int scale>
8729 #if OIIO_SIMD_AVX >= 2
8730 m_simd = _mm256_i32gather_ps (baseptr, vindex,
scale);
8740 #if OIIO_SIMD_AVX >= 2
8751 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8752 _mm256_i32scatter_ps (baseptr, vindex,
m_simd,
scale);
8761 const vint_t& vindex)
const
8763 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8774 return _mm256_add_ps (a, b);
8786 return _mm256_sub_ps (_mm256_setzero_ps(), a);
8794 return _mm256_sub_ps (a, b);
8806 return _mm256_mul_ps (a.
m_simd, _mm256_set1_ps(b));
8818 return _mm256_mul_ps (a, b);
8830 return _mm256_div_ps (a, b);
8842 return _mm256_cmp_ps (a, b, _CMP_EQ_OQ);
8850 return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ);
8858 return _mm256_cmp_ps (a, b, _CMP_LT_OQ);
8866 return _mm256_cmp_ps (a, b, _CMP_GT_OQ);
8874 return _mm256_cmp_ps (a, b, _CMP_GE_OQ);
8882 return _mm256_cmp_ps (a, b, _CMP_LE_OQ);
8893 m_simd = _mm256_cvttps_epi32(f);
8894 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8902 template<
int i0,
int i1,
int i2,
int i3,
int i4,
int i5,
int i6,
int i7>
8904 #if OIIO_SIMD_AVX >= 2
8906 return _mm256_permutevar8x32_ps (a, index);
8908 return vfloat8 (a[i0], a[
i1], a[
i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
8913 #if OIIO_SIMD_AVX >= 2
8914 return _mm256_permutevar8x32_ps (a,
vint8(i));
8916 return shuffle<i,i,i,i,i,i,i,i>(
a);
8923 #if OIIO_SIMD_AVX_NO_FIXME
8926 _m128 f4 = _mm256_extractf128_ps (i >> 2);
8928 return _mm_cvtss_f32(shuffle_sse<j,j,j,j>(a.simd()));
8937 #if OIIO_SIMD_AVX_NO_FIXME
8938 return _mm256_insert_epi32 (a, val, i);
8960 return _mm256_castps_si256 (x.
simd());
8962 return *(
vint8 *)&x;
8969 return _mm256_castsi256_ps (x.
simd());
8979 vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.
simd(), _mm256_setzero_ps());
8980 vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
8982 vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
8983 vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
8984 return shuffle<0>(final_sum);
8993 #if OIIO_SIMD_AVX >= 2
9004 return _mm256_blendv_ps (a, b, mask);
9005 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9017 return _mm256_and_ps(mask, a);
9018 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9030 return _mm256_andnot_ps(mask, a);
9031 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9042 return blend (b, a, mask);
9059 return _mm256_and_ps (a.
simd(), _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
9060 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9078 return _mm256_ceil_ps (a);
9087 return _mm256_floor_ps (a);
9096 return _mm256_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9124 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
9139 return _mm256_sqrt_ps (a.
simd());
9150 return _mm256_div_ps (_mm256_set1_ps(1.0
f), _mm256_sqrt_ps (a.
simd()));
9160 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9162 return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
9163 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9165 return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
9167 return _mm256_rsqrt_ps (a.
simd());
9180 return _mm256_min_ps (a, b);
9189 return _mm256_max_ps (a, b);
9198 return _mm256_andnot_ps (a.
simd(), b.
simd());
9200 const int *ai = (
const int *)&a;
9201 const int *bi = (
const int *)&b;
9217 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9219 return _mm256_fmadd_ps (a, b, c);
9220 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9233 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9235 return _mm256_fmsub_ps (a, b, c);
9236 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9250 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9252 return _mm256_fnmadd_ps (a, b, c);
9253 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9267 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9269 return _mm256_fnmsub_ps (a, b, c);
9270 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9275 return -(a *
b) - c;
9298 for (
int i = 1; i < val.elements; ++i)
9299 cout <<
' ' << val[i];
9305 #if OIIO_SIMD_AVX >= 512
9306 return _mm512_castps512_ps256 (
simd());
9313 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512DQ_ENABLED
9314 return _mm512_extractf32x8_ps (
simd(), 1);
9322 float v4,
float v5,
float v6,
float v7,
9323 float v8,
float v9,
float v10,
float v11,
9324 float v12,
float v13,
float v14,
float v15) {
9325 load (v0, v1, v2, v3, v4, v5, v6, v7,
9326 v8, v9, v10, v11, v12, v13, v14, v15);
9330 #if OIIO_SIMD_AVX >= 512
9331 __m512
r = _mm512_castps256_ps512 (lo);
9332 m_simd = _mm512_insertf32x8 (r, hi, 1);
9340 #if OIIO_SIMD_AVX >= 512
9341 m_simd = _mm512_broadcast_f32x4(a);
9353 #if OIIO_SIMD_AVX >= 512
9354 m_simd = _mm512_cvtepi32_ps (ival);
9362 #if OIIO_SIMD_AVX >= 512
9363 return _mm512_setzero_ps();
9374 return vfloat16 (start+0.0
f*step, start+1.0
f*step, start+2.0
f*step, start+3.0
f*step,
9375 start+4.0
f*step, start+5.0
f*step, start+6.0
f*step, start+7.0
f*step,
9376 start+8.0
f*step, start+9.0
f*step, start+10.0
f*step, start+11.0
f*step,
9377 start+12.0
f*step, start+13.0
f*step, start+14.0
f*step, start+15.0
f*step);
9382 #if OIIO_SIMD_AVX >= 512
9383 m_simd = _mm512_setzero_ps();
9391 #if OIIO_SIMD_AVX >= 512
9392 m_simd = _mm512_set1_ps (a);
9401 float v4,
float v5,
float v6,
float v7,
9402 float v8,
float v9,
float v10,
float v11,
9403 float v12,
float v13,
float v14,
float v15) {
9404 #if OIIO_SIMD_AVX >= 512
9405 m_simd = _mm512_setr_ps (v0, v1, v2, v3, v4, v5, v6, v7,
9406 v8, v9, v10, v11, v12, v13, v14, v15);
9429 #if OIIO_SIMD_AVX >= 512
9430 m_simd = _mm512_loadu_ps (values);
9441 #if OIIO_SIMD_AVX >= 512
9442 m_simd = _mm512_maskz_loadu_ps (__mmask16(~(0xffff << n)), values);
9456 #if OIIO_SIMD_AVX >= 512
9467 #if OIIO_SIMD_AVX >= 512
9478 #if OIIO_SIMD_AVX >= 512
9488 #if OIIO_SIMD_AVX >= 512
9497 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
9499 #if OIIO_SIMD_AVX >= 512
9501 vint8 a ((
const int *)values);
9502 m_simd = _mm512_cvtph_ps (a);
9513 #if OIIO_SIMD_AVX >= 512
9517 _mm512_storeu_ps (values,
m_simd);
9528 #if 0 && OIIO_SIMD_AVX >= 512
9532 _mm512_mask_storeu_ps (values, __mmask16(~(0xffff << n)),
m_simd);
9536 }
else if (n < 16) {
9545 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
9547 #if OIIO_SIMD_AVX >= 512
9548 __m256i h = _mm512_cvtps_ph (
m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9549 _mm256_storeu_si256 ((__m256i *)values, h);
9559 #if OIIO_SIMD_AVX >= 512
9560 m_simd = _mm512_maskz_loadu_ps (mask, (
const simd_t *)values);
9569 #if OIIO_SIMD_AVX >= 512
9579 template <
int scale>
9583 #if OIIO_SIMD_AVX >= 512
9584 m_simd = _mm512_i32gather_ps (vindex, baseptr,
scale);
9595 #if OIIO_SIMD_AVX >= 512
9607 #if OIIO_SIMD_AVX >= 512
9608 _mm512_i32scatter_ps (baseptr, vindex,
m_simd,
scale);
9618 const vint_t& vindex)
const
9620 #if OIIO_SIMD_AVX >= 512
9621 _mm512_mask_i32scatter_ps (baseptr, mask, vindex,
m_simd,
scale);
9631 #if OIIO_SIMD_AVX >= 512
9643 #if OIIO_SIMD_AVX >= 512
9644 return _mm512_sub_ps (_mm512_setzero_ps(), a.
simd());
9651 #if OIIO_SIMD_AVX >= 512
9664 #if OIIO_SIMD_AVX >= 512
9665 return _mm512_mul_ps (a.
m_simd, _mm512_set1_ps(b));
9676 #if OIIO_SIMD_AVX >= 512
9688 #if OIIO_SIMD_AVX >= 512
9701 #if OIIO_SIMD_AVX >= 512
9702 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_EQ_OQ);
9710 #if OIIO_SIMD_AVX >= 512
9711 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_NEQ_OQ);
9719 #if OIIO_SIMD_AVX >= 512
9720 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_LT_OQ);
9728 #if OIIO_SIMD_AVX >= 512
9729 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_GT_OQ);
9737 #if OIIO_SIMD_AVX >= 512
9738 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_GE_OQ);
9746 #if OIIO_SIMD_AVX >= 512
9747 return _mm512_cmp_ps_mask (a.
simd(), b.
simd(), _CMP_LE_OQ);
9757 #if OIIO_SIMD_AVX >= 512
9758 m_simd = _mm512_cvttps_epi32(f);
9767 template<
int i0,
int i1,
int i2,
int i3>
9769 #if OIIO_SIMD_AVX >= 512
9770 return _mm512_shuffle_f32x4(a,a,_MM_SHUFFLE(i3,
i2,
i1,i0));
9773 a.
store ((
float *)x);
9779 return shuffle4<i,i,i,i> (
a);
9782 template<
int i0,
int i1,
int i2,
int i3>
9784 #if OIIO_SIMD_AVX >= 512
9785 return _mm512_permute_ps(a,_MM_SHUFFLE(i3,
i2,
i1,i0));
9788 a.
store ((
float *)x);
9789 return vfloat16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
9790 shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
9795 return shuffle<i,i,i,i> (
a);
9814 #if OIIO_SIMD_AVX >= 512
9815 return _mm_cvtss_f32(_mm512_castps512_ps128(
m_simd));
9832 #if OIIO_SIMD_AVX >= 512
9833 return _mm512_castps_si512 (x.
simd());
9841 #if OIIO_SIMD_AVX >= 512
9842 return _mm512_castsi512_ps (x.
simd());
9850 #if OIIO_SIMD_AVX >= 512
9853 vfloat16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(
v);
9854 vfloat16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
9856 vfloat16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(
w);
9857 return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
9866 #if OIIO_SIMD_AVX >= 512
9876 #if OIIO_SIMD_AVX >= 512
9877 return _mm512_mask_blend_ps (mask, a, b);
9887 #if OIIO_SIMD_AVX >= 512
9888 return _mm512_maskz_mov_ps (mask, a);
9898 #if OIIO_SIMD_AVX >= 512
9899 return _mm512_maskz_mov_ps (!mask, a);
9909 return blend (b, a, mask);
9924 #if OIIO_SIMD_AVX >= 512
9927 return _mm512_castsi512_ps (_mm512_and_epi32 (_mm512_castps_si512(a.
simd()),
9928 _mm512_set1_epi32(0x7fffffff)));
9944 #if OIIO_SIMD_AVX >= 512
9945 return _mm512_ceil_ps (a);
9953 #if OIIO_SIMD_AVX >= 512
9954 return _mm512_floor_ps (a);
9963 #if OIIO_SIMD_AVX >= 512
9964 return _mm512_roundscale_ps (a, (1<<4) | 3);
9972 #if OIIO_SIMD_AVX >= 512
9973 return _mm512_cvt_roundps_epi32 (a, (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC));
9988 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9989 return _mm512_rcp28_ps(a);
9990 #elif OIIO_SIMD_AVX >= 512
10001 #if OIIO_SIMD_AVX >= 512
10002 return _mm512_sqrt_ps (a);
10011 #if OIIO_SIMD_AVX >= 512
10012 return _mm512_div_ps (_mm512_set1_ps(1.0
f), _mm512_sqrt_ps (a));
10021 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
10022 return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
10023 #elif OIIO_SIMD_AVX >= 512
10024 return _mm512_rsqrt14_ps (a);
10033 #if OIIO_SIMD_AVX >= 512
10034 return _mm512_min_ps (a, b);
10042 #if OIIO_SIMD_AVX >= 512
10043 return _mm512_max_ps (a, b);
10051 #if OIIO_SIMD_AVX >= 512 && defined(__AVX512DQ__)
10052 return _mm512_andnot_ps (a, b);
10062 #if OIIO_SIMD_AVX >= 512
10063 return _mm512_fmadd_ps (a, b, c);
10074 #if OIIO_SIMD_AVX >= 512
10075 return _mm512_fmsub_ps (a, b, c);
10087 #if OIIO_SIMD_AVX >= 512
10088 return _mm512_fnmadd_ps (a, b, c);
10100 #if OIIO_SIMD_AVX >= 512
10101 return _mm512_fnmsub_ps (a, b, c);
10117 #undef SIMD_CONSTRUCT
10118 #undef SIMD_CONSTRUCT_PAD
10120 #undef SIMD_RETURN_REDUCE
friend const vfloat8 & operator/=(vfloat8 &a, const vfloat8 &b)
friend vfloat8 operator+(const vfloat8 &a, const vfloat8 &b)
static const char * type_name()
static const char * type_name()
static const vbool4 True()
Return a vbool4 the is 'true' for all values.
vint16()
Default constructor (contents undefined)
friend vint4 operator|(const vint4 &a, const vint4 &b)
vfloat16(float a)
Construct from a single value (store it in all slots)
friend const vint8 & operator%=(vint8 &a, const vint8 &b)
static const vint4 NegOne()
Return an vint4 with all components set to -1 (aka 0xffffffff)
friend vbool8 operator!(const vbool8 &a)
Logical/bitwise operators, component-by-component.
friend const vint4 & operator%=(vint4 &a, const vint4 &b)
int operator[](int i) const
Component access (get)
friend const vfloat16 & operator*=(vfloat16 &a, const vfloat16 &b)
vint4 max(const vint4 &a, const vint4 &b)
friend vfloat3 operator*(const vfloat3 &a, const vfloat3 &b)
static const char * type_name()
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
vfloat4(float a)
Construct from a single value (store it in all slots)
static vbool4 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool4.
bool none(const vbool4 &v)
void clear()
Set all components to 0.0.
vbool4(bool a)
Construct from a single value (store it in all slots)
friend const vbool16 & operator|=(vbool16 &a, const vbool16 &b)
vfloat3 operator-() const
friend vbool8 operator!=(const vint8 &a, const vint8 &b)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
const vfloat3 & operator-=(const vfloat3 &a)
vfloat4(const Imath::V3f &v)
Construct from a Imath::V3f.
void store(float *values) const
static const vfloat8 One()
Return a vfloat8 with all components set to 1.0.
friend vfloat4 operator*(const vfloat4 &a, const vfloat4 &b)
vint16 shuffle4(const vint16 &a)
Shuffle groups of 4.
friend vbool16 operator<=(const vfloat16 &a, const vfloat16 &b)
OIIO_FORCEINLINE const vint4 & operator/=(vint4 &a, const vint4 &b)
friend vint8 operator~(const vint8 &a)
void store_mask(int mask, value_t *values) const
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vfloat16 operator-(const vfloat16 &a)
vfloat8(float a)
Construct from a single value (store it in all slots)
OIIO_FORCEINLINE vbool4 shuffle(const vbool4 &a)
friend vfloat3 operator+(const vfloat3 &a, const vfloat3 &b)
friend vbool16 operator!=(const vint16 &a, const vint16 &b)
friend const vbool8 & operator|=(vbool8 &a, const vbool8 &b)
static const char * name()
SYS_API float expf(float x)
Matrix44< float > M44f
4x4 matrix of float
friend vfloat16 operator%(const vfloat16 &a, const vfloat16 &b)
friend std::ostream & operator<<(std::ostream &cout, const vfloat4 &val)
Stream output.
friend vint8 operator/(const vint8 &a, const vint8 &b)
vfloat4 bitcast_to_float(const vint4 &x)
static const vint4 Giota()
Return an vint4 with "geometric" iota: (1, 2, 4, 8).
OIIO_FORCEINLINE const vint4 & operator>>=(vint4 &a, const unsigned int bits)
friend const vint8 & operator>>=(vint8 &a, unsigned int bits)
void load_mask(const vbool_t &mask, const value_t *values)
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
friend vint4 operator&(const vint4 &a, const vint4 &b)
int operator[](int i) const
Component access (get)
friend vbool8 operator>(const vint8 &a, const vint8 &b)
static const vfloat3 Zero()
Return a vfloat3 with all components set to 0.0.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
Vec4< float > V4f
Vec4 of float.
static const char * type_name()
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
void store_mask(int mask, value_t *values) const
vint4 srl(const vint4 &val, const unsigned int bits)
OIIO_FORCEINLINE vint4 operator%(const vint4 &a, const vint4 &b)
vint4 bitcast_to_int4(const vfloat4 &x)
friend const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
friend const vint4 & operator>>=(vint4 &a, unsigned int bits)
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
friend vint8 operator|(const vint8 &a, const vint8 &b)
vfloat4 vfloat_t
SIMD int type.
friend vfloat16 operator*(const vfloat16 &a, const vfloat16 &b)
int operator[](int i) const
Component access (get)
friend const vfloat8 & operator+=(vfloat8 &a, const vfloat8 &b)
int bitmask() const
Extract the bitmask.
vfloat16 vfloat_t
float type of the same length
vfloat3(const float *f)
Construct from a pointer to 4 values.
friend vfloat4 operator/(const vfloat4 &a, const vfloat4 &b)
vfloat16(const unsigned short *vals)
Construct from a pointer to unsigned short values.
int value_t
Underlying equivalent scalar value type.
OIIO_FORCEINLINE const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
void clear()
Sset all components to 0.
vint8()
Default constructor (contents undefined)
friend vbool16 operator==(const vbool16 &a, const vbool16 &b)
Comparison operators, component by component.
friend vint4 operator/(const vint4 &a, const vint4 &b)
vfloat8(const short *vals)
Construct from a pointer to short values.
vfloat3 transformv(const vfloat3 &V) const
Transform 3-vector V by 4x4 matrix M.
int bitmask() const
Extract the bitmask.
void setcomp(int i, bool value)
Component access (set).
void clear()
Set all components to false.
static vbool16 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool16.
void load(float val)
Helper: load a single value into all components.
vfloat4 sqrt(const vfloat4 &a)
GLdouble GLdouble GLdouble z
static const vfloat16 One()
Return a vfloat16 with all components set to 1.0.
static const char * name()
vfloat8()
Default constructor (contents undefined)
vbool8 vbool_t
bool type of the same length
OIIO_FORCEINLINE vbool4 operator!(const vbool4 &a)
const vfloat4 & operator=(float a)
Assign a single value to all components.
const vfloat4 & operator/=(const vfloat4 &a)
friend vint16 operator+(const vint16 &a, const vint16 &b)
friend const vint8 & operator*=(vint8 &a, const vint8 &b)
friend const vint4 & operator*=(vint4 &a, const vint4 &b)
friend vint8 operator<<(const vint8 &a, unsigned int bits)
static const char * name()
vfloat8(const float *f)
Construct from a pointer to 8 values.
vfloat4(const vfloat4 &other)
Copy construct from another vfloat4.
friend vbool8 operator!=(const vfloat8 &a, const vfloat8 &b)
friend vbool8 operator>=(const vfloat8 &a, const vfloat8 &b)
void clear()
Set all components to false.
friend const vint16 & operator&=(vint16 &a, const vint16 &b)
vfloat4 rsqrt_fast(const vfloat4 &a)
Fast, approximate 1/sqrt.
GLboolean GLboolean GLboolean GLboolean a
vint16(const simd_t &m)
Construct from the underlying SIMD type.
friend const vbool16 & operator&=(vbool16 &a, const vbool16 &b)
static const vfloat4 Zero()
Return a vfloat4 with all components set to 0.0.
vbool4(const simd_t &m)
Construct from the underlying SIMD type.
float operator[](int i) const
Component access (get)
vfloat16 min(const vfloat16 &a, const vfloat16 &b)
Per-element min.
float value_t
Underlying equivalent scalar value type.
vfloat3 transformvT(const vfloat3 &V) const
Transform 3-vector V by the transpose of 4x4 matrix M.
int operator[](int i) const
Component access (get)
void load(float val)
Helper: load a single value into all components.
#define SIMD_CONSTRUCT_PAD(x)
friend const vint16 & operator<<=(vint16 &a, unsigned int bits)
OIIO_FORCEINLINE const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
friend vint16 operator*(const vint16 &a, const vint16 &b)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 max(const vfloat16 &a, const vfloat16 &b)
Per-element max.
vfloat3()
Default constructor (contents undefined)
bool value_t
Underlying equivalent scalar value type.
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
static const vint8 Zero()
Return an vint8 with all components set to 0.
vbool8 vbool_t
SIMD bool type.
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
vint16 vint_t
SIMD int type.
vfloat4(float a, float b, float c, float d=0.0f)
Construct from 3 or 4 values.
#define OIIO_SIMD_UINT4_CONST(name, val)
static const vint8 NegOne()
Return an vint8 with all components set to -1 (aka 0xffffffff)
static const char * type_name()
bool reduce_or(const vbool4 &v)
friend vfloat8 operator-(const vfloat8 &a)
friend vint16 operator/(const vint16 &a, const vint16 &b)
**But if you need a result
static const vint16 Giota()
Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
vfloat4()
Default constructor (contents undefined)
void load_mask(int mask, const value_t *values)
GLfloat GLfloat GLfloat v2
friend const vint16 & operator^=(vint16 &a, const vint16 &b)
const vfloat8 & operator=(float a)
Assign a single value to all components.
Integer 8-vector, accelerated by SIMD instructions when available.
void clear()
Set all components to 0.0.
vfloat4(const char *vals)
Construct from a pointer to 4 char values.
static const char * name()
friend const vint4 & operator<<=(vint4 &a, unsigned int bits)
GLfloat GLfloat GLfloat GLfloat v3
OIIO_FORCEINLINE vbool4 insert(const vbool4 &a, bool val)
Helper: substitute val for a[i].
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
vfloat3 hdiv(const vfloat4 &a)
Homogeneous divide to turn a vfloat4 into a vfloat3.
vfloat3 transformv(const matrix44 &M, const vfloat3 &V)
Transform 3-vector V by 4x4 matrix M.
const vbool8 & operator=(bool a)
Assign one value to all components.
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
vbool8 lo() const
Extract the lower precision vbool8.
void load_bitmask(int a)
Helper: load all components from a bitmask in an int.
friend vbool4 operator!=(const vbool4 &a, const vbool4 &b)
vint8(const simd_t &m)
Construct from the underlying SIMD type.
void clear()
Sset all components to 0.
friend vbool4 operator<(const vint4 &a, const vint4 &b)
vbool8(bool a)
Construct from a single value (store it in all slots)
const value_t * data() const
Return a pointer to the underlying scalar type.
friend const vint4 & operator^=(vint4 &a, const vint4 &b)
OIIO_FORCEINLINE vbool4 operator>=(const vint4 &a, const vint4 &b)
int operator[](int i) const
Component access (get)
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vfloat16 operator/(const vfloat16 &a, const vfloat16 &b)
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
OIIO_FORCEINLINE vbool4 operator~(const vbool4 &a)
float dot3(const vfloat4 &a, const vfloat4 &b)
Return the float 3-component dot (inner) product of a and b.
value_t m_val[paddedelements]
void load_mask(int mask, const value_t *values)
friend const vfloat16 & operator+=(vfloat16 &a, const vfloat16 &b)
vint8 vint_t
SIMD int type.
friend vbool4 operator|(const vbool4 &a, const vbool4 &b)
friend const vint16 & operator%=(vint16 &a, const vint16 &b)
simd_bool_t< 8 >::type simd_t
the native SIMD type used
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
static const char * type_name()
Template giving a printable name for each type.
vint4 abs(const vint4 &a)
vfloat4 safe_div(const vfloat4 &a, const vfloat4 &b)
void store(int *values) const
Store the values into memory.
friend vbool8 operator<=(const vfloat8 &a, const vfloat8 &b)
friend std::ostream & operator<<(std::ostream &cout, const vfloat8 &val)
Stream output.
vbool16(const simd_t &m)
Construct from the underlying SIMD type.
vfloat8 lo() const
Extract the lower precision vfloat8.
vbool4(const vbool4 &other)
Copy construct from another vbool4.
static const vbool4 False()
Return a vbool4 the is 'false' for all values.
vbool4 lo() const
Extract the lower precision vbool4.
static const vbool16 False()
Return a vbool16 the is 'false' for all values.
void setcomp(int i, bool value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vint4 operator~(const vint4 &a)
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
friend vbool4 operator==(const vfloat4 &a, const vfloat4 &b)
OIIO_FORCEINLINE bool extract(const vbool4 &a)
vfloat3 normalized() const
Return a normalized version of the vector.
vfloat4 floor(const vfloat4 &a)
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
OIIO_FORCEINLINE matrix44(const float *f)
Construct from a float array.
static const vint4 One()
Return an vint4 with all components set to 1.
vint4 blend(const vint4 &a, const vint4 &b, const vbool4 &mask)
int value_t
Underlying equivalent scalar value type.
vbool16(const vbool16 &other)
Copy construct from another vbool16.
vfloat8(const vfloat8 &other)
Copy construct from another vfloat8.
vint4 blend0not(const vint4 &a, const vbool4 &mask)
vint4 vint_t
SIMD int type.
GA_API const UT_StringHolder scale
friend const vbool16 & operator^=(vbool16 &a, const vbool16 &b)
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
vbool4 hi() const
Extract the higher precision vbool4.
OIIO_FORCEINLINE vbool4 operator>(const vint4 &a, const vint4 &b)
friend vbool16 operator!=(const vfloat16 &a, const vfloat16 &b)
vfloat16(const float *f)
Construct from a pointer to 16 values.
vbool8(const simd_t &m)
Construct from the underlying SIMD type.
bool operator!=(const matrix44 &m) const
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
vfloat8 hi() const
Extract the higher precision vfloat8.
OIIO_FORCEINLINE vbool4 operator<=(const vint4 &a, const vint4 &b)
friend const vint8 & operator+=(vint8 &a, const vint8 &b)
friend vfloat8 operator/(const vfloat8 &a, const vfloat8 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
vfloat4 vdot3(const vfloat4 &a, const vfloat4 &b)
friend const vint8 & operator-=(vint8 &a, const vint8 &b)
float value_t
Underlying equivalent scalar value type.
vint8(const vint8 &other)
Copy construct from another vint8.
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Integer 4-vector, accelerated by SIMD instructions when available.
#define SIMD_RETURN_REDUCE(T, init, op)
friend vbool8 operator<(const vint8 &a, const vint8 &b)
vint8 vint_t
int type of the same length
OIIO_FORCEINLINE vbool4 operator==(const vbool4 &a, const vbool4 &b)
friend vfloat3 operator/(const vfloat3 &a, const vfloat3 &b)
OIIO_DEPRECATED("use bitcast_to_int() (1.8)") inline vint4 bitcast_to_int4(const vbool4 &x)
vfloat4 lo() const
Extract the lower precision vfloat4.
vint16(const vint16 &other)
Copy construct from another vint16.
simd_raw_t< float, 8 >::type simd_t
the native SIMD type used
IMATH_NAMESPACE::V2f float
int value_t
Underlying equivalent scalar value type.
float length() const
Length of the vector.
bool any(const vbool4 &v)
const vfloat4 & operator-=(const vfloat4 &a)
void load(bool a)
Helper: load a single value into all components.
bool reduce_and(const vbool4 &v)
Logical reduction across all components.
vint16 vint_t
int type of the same length
vfloat8(const unsigned short *vals)
Construct from a pointer to unsigned short values.
friend vbool16 operator<=(const vint16 &a, const vint16 &b)
vfloat3 transformp(const matrix44 &M, const vfloat3 &V)
Transform 3-point V by 4x4 matrix M.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend const vint4 & operator+=(vint4 &a, const vint4 &b)
vfloat8 vfloat_t
float type of the same length
OIIO_FORCEINLINE const vint4 & operator+=(vint4 &a, const vint4 &b)
void transpose(vint4 &a, vint4 &b, vint4 &c, vint4 &d)
vint4 select(const vbool4 &mask, const vint4 &a, const vint4 &b)
const Imath::V3f & V3f() const
Cast to a Imath::V3f.
void load(bool a)
Helper: load a single value into all components.
vfloat4(const short *vals)
Construct from a pointer to 4 short values.
static vbool8 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool8.
vint4 rotl(const vint4 &x, const int s)
Circular bit rotate by s bits, for N values at once.
friend vbool16 operator>=(const vint16 &a, const vint16 &b)
const vfloat4 & operator*=(const vfloat4 &a)
vfloat4 msub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
vbool16 vbool_t
bool type of the same length
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
friend vint8 operator&(const vint8 &a, const vint8 &b)
friend vbool16 operator^(const vbool16 &a, const vbool16 &b)
friend vbool8 operator<(const vfloat8 &a, const vfloat8 &b)
void store_mask(int mask, value_t *values) const
float operator[](int i) const
Component access (get)
OIIO_FORCEINLINE matrix44(const Imath::M44f &M)
Construct from a reference to an Imath::M44f.
vfloat3 normalized_fast() const
Return a fast, approximate normalized version of the vector.
void load_mask(int mask, const value_t *values)
const value_t * data() const
Return a pointer to the underlying scalar type.
friend vbool4 operator<(const vfloat4 &a, const vfloat4 &b)
static const char * type_name()
void setcomp(int i, int value)
Component access (set).
OIIO_FORCEINLINE T exp(const T &v)
void load_mask(int mask, const value_t *values)
static const vfloat3 One()
Return a vfloat3 with all components set to 1.0.
vfloat16(const vfloat16 &other)
Copy construct from another vfloat16.
void load(int a)
Helper: load a single int into all components.
bool set_denorms_zero_mode(bool on)
static const char * name()
vbool16()
Default constructor (contents undefined)
simd_bool_t< 16 >::type simd_t
the native SIMD type used
friend vint16 operator<<(const vint16 &a, unsigned int bits)
void store_mask(int mask, value_t *values) const
vbool4(bool a, bool b, bool c, bool d)
Construct from 4 bool values.
OIIO_FORCEINLINE std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
static const vfloat16 Zero()
Return a vfloat16 with all components set to 0.0.
vfloat4 operator-() const
value_t m_val[paddedelements]
vfloat4(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
friend vint4 operator%(const vint4 &a, const vint4 &b)
friend vbool8 operator!=(const vbool8 &a, const vbool8 &b)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 vfloat_t
SIMD int type.
friend const vfloat16 & operator/=(vfloat16 &a, const vfloat16 &b)
static const vbool16 True()
Return a vbool16 the is 'true' for all values.
const vbool16 & operator=(bool a)
Assign one value to all components.
vfloat4 nmsub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
vbool4()
Default constructor (contents undefined)
static const vbool8 True()
Return a vbool8 the is 'true' for all values.
static const char * name()
friend vbool8 operator==(const vbool8 &a, const vbool8 &b)
Comparison operators, component by component.
vbool8(const vbool8 &other)
Copy construct from another vbool8.
friend vint4 operator^(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
vint4 blend0(const vint4 &a, const vbool4 &mask)
vfloat3(const short *vals)
Construct from a pointer to 4 short values.
void store(float *values) const
friend vint8 operator*(const vint8 &a, const vint8 &b)
friend vfloat16 operator+(const vfloat16 &a, const vfloat16 &b)
vfloat4(const float *f)
Construct from a pointer to 4 values.
const vint4 & operator=(int a)
Assign one value to all components.
void store(bool *values) const
Helper: store the values into memory as bools.
static const vint4 Zero()
Return an vint4 with all components set to 0.
friend vbool16 operator<(const vfloat16 &a, const vfloat16 &b)
friend const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
vfloat3(float a)
Construct from a single value (store it in all slots)
friend vint8 operator-(const vint8 &a)
static const char * name()
friend const vint16 & operator+=(vint16 &a, const vint16 &b)
bool set_flush_zero_mode(bool on)
friend vint8 operator^(const vint8 &a, const vint8 &b)
void clear()
Set all components to 0.0.
SYS_API float logf(float x)
float length2() const
Square of the length of the vector.
friend vbool4 operator!=(const vfloat4 &a, const vfloat4 &b)
friend vbool16 operator<(const vint16 &a, const vint16 &b)
vbool4(int a, int b, int c, int d)
Construct from 4 int values.
vfloat3(float a, float b, float c)
Construct from 3 or 4 values.
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
void store(bool *values) const
Helper: store the values into memory as bools.
friend std::ostream & operator<<(std::ostream &cout, const matrix44 &M)
Stream output.
matrix44 transposed() const
Return the transposed matrix.
OIIO_FORCEINLINE vint4 operator>>(const vint4 &a, const unsigned int bits)
vfloat4 round(const vfloat4 &a)
GLboolean GLboolean GLboolean b
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
static const vfloat4 One()
Return a vfloat4 with all components set to 1.0.
void store_mask(int mask, value_t *values) const
#define SIMD_RETURN(T, x)
friend vbool16 operator>(const vint16 &a, const vint16 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
const vint16 & operator=(int a)
Assign one value to all components.
vfloat8(const char *vals)
Construct from a pointer to char values.
bool get_denorms_zero_mode()
friend vbool8 operator~(const vbool8 &a)
friend vbool4 operator!=(const vint4 &a, const vint4 &b)
vfloat3 transformvT(const matrix44 &M, const vfloat3 &V)
const vfloat3 & operator/=(const vfloat3 &a)
friend vbool4 operator>(const vint4 &a, const vint4 &b)
friend const vint4 & operator-=(vint4 &a, const vint4 &b)
bool get_flush_zero_mode()
vfloat4 operator[](int i) const
Return one row.
float operator[](int i) const
Component access (get)
const Imath::M44f & M44f() const
Present as an Imath::M44f.
void load(float val)
Helper: load a single value into all components.
static const vfloat8 Iota(float start=0.0f, float step=1.0f)
friend const vint4 & operator&=(vint4 &a, const vint4 &b)
simd_raw_t< float, 4 >::type simd_t
the native SIMD type used
vbool8()
Default constructor (contents undefined)
friend vbool16 operator|(const vbool16 &a, const vbool16 &b)
vfloat16()
Default constructor (contents undefined)
vint8 lo() const
Extract the lower precision vint8.
vfloat4 vdot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b in every component.
static const vfloat8 Zero()
Return a vfloat8 with all components set to 0.0.
friend vint16 operator|(const vint16 &a, const vint16 &b)
vfloat4 rcp_fast(const vfloat4 &a)
Fast, approximate 1/a.
friend vint4 operator+(const vint4 &a, const vint4 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
friend vint16 operator>>(const vint16 &a, unsigned int bits)
Integer 16-vector, accelerated by SIMD instructions when available.
vfloat16(const unsigned char *vals)
Construct from a pointer to unsigned char values.
static const char * type_name()
friend vbool4 operator^(const vbool4 &a, const vbool4 &b)
const vfloat3 & operator*=(const vfloat3 &a)
static const char * type_name()
OIIO_FORCEINLINE vint4 operator/(const vint4 &a, const vint4 &b)
vfloat16(const simd_t &m)
Construct from the underlying SIMD type.
vfloat3(const Imath::V3f &v)
Construct from a Imath::V3f.
OIIO_FORCEINLINE vbool4 operator^(const vbool4 &a, const vbool4 &b)
static const vint16 One()
Return an vint16 with all components set to 1.
vfloat16(const char *vals)
Construct from a pointer to char values.
vbool16 vbool_t
SIMD bool type.
vint4 hi() const
Extract the higher precision vint4.
vfloat3 transformp(const vfloat3 &V) const
Transform 3-point V by 4x4 matrix M.
OIIO_FORCEINLINE matrix44(const float *a, const float *b, const float *c, const float *d)
Construct from 4 float[4] rows.
friend vbool8 operator>(const vfloat8 &a, const vfloat8 &b)
OIIO_FORCEINLINE vbool4 operator|(const vbool4 &a, const vbool4 &b)
Vec3< float > V3f
Vec3 of float.
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
GLfloat GLfloat GLfloat GLfloat h
void store_mask(const vbool_t &mask, value_t *values) const
static const vfloat3 Iota(float start=0.0f, float step=1.0f)
vint4 safe_mod(const vint4 &a, const vint4 &b)
friend const vint8 & operator/=(vint8 &a, const vint8 &b)
static const vfloat4 Iota(float start=0.0f, float step=1.0f)
friend const vfloat8 & operator-=(vfloat8 &a, const vfloat8 &b)
void load(int a)
Helper: load a single int into all components.
void store(bool *values) const
Helper: store the values into memory as bools.
vint4 vreduce_add(const vint4 &v)
The sum of all components, returned in all components.
OIIO_FORCEINLINE const vint4 & operator<<=(vint4 &a, const unsigned int bits)
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
float value_t
Underlying equivalent scalar value type.
OIIO_FORCEINLINE vbool4 operator<(const vint4 &a, const vint4 &b)
friend vbool8 operator|(const vbool8 &a, const vbool8 &b)
friend const vint16 & operator*=(vint16 &a, const vint16 &b)
OIIO_FORCEINLINE vbool4 operator!=(const vbool4 &a, const vbool4 &b)
vfloat4 ceil(const vfloat4 &a)
friend vbool16 operator==(const vfloat16 &a, const vfloat16 &b)
vbool4 vbool_t
SIMD bool type.
void store(int *values) const
Store the values into memory.
void clear()
Sset all components to 0.
vfloat4 vfloat_t
float type of the same length
GLenum GLsizei GLsizei GLint * values
friend vint16 operator%(const vint16 &a, const vint16 &b)
OIIO_FORCEINLINE vbool4 operator&(const vbool4 &a, const vbool4 &b)
#define SIMD_CONSTRUCT(x)
vfloat4(const simd_t &m)
Construct from the underlying SIMD type.
OIIO_FORCEINLINE matrix44()
friend const vint16 & operator-=(vint16 &a, const vint16 &b)
vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
Construct from 8 values.
static const vint4 Iota(int start=0, int step=1)
void load_mask(int mask, const value_t *values)
friend vfloat4 operator+(const vfloat4 &a, const vfloat4 &b)
void clear()
Set all components to false.
int m_val[paddedelements]
friend vbool8 operator&(const vbool8 &a, const vbool8 &b)
static const vint16 NegOne()
Return an vint16 with all components set to -1 (aka 0xffffffff)
vfloat4 bitcast_to_float4(const vint4 &x)
vint4 lo() const
Extract the lower precision vint4.
friend vbool8 operator^(const vbool8 &a, const vbool8 &b)
friend vint4 operator-(const vint4 &a)
matrix44 inverse() const
Return the inverse of the matrix.
vfloat4 nmadd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
vfloat3(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
void load_mask(int mask, const value_t *values)
friend vbool16 operator~(const vbool16 &a)
friend const vint16 & operator/=(vint16 &a, const vint16 &b)
vint4(const simd_t &m)
Construct from the underlying SIMD type.
friend vbool8 operator>=(const vint8 &a, const vint8 &b)
vfloat8 vfloat_t
SIMD int type.
friend vint16 operator^(const vint16 &a, const vint16 &b)
friend vint8 operator>>(const vint8 &a, unsigned int bits)
int m_val[paddedelements]
value_t m_val[paddedelements]
vint4()
Default constructor (contents undefined)
const vbool4 & operator=(bool a)
Assign one value to all components.
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vint4 operator<<(const vint4 &a, unsigned int bits)
vfloat4(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
GA_API const UT_StringHolder N
friend vint16 operator&(const vint16 &a, const vint16 &b)
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
const vint8 & operator=(int a)
Assign one value to all components.
void store_mask(int mask, value_t *values) const
static const vbool8 False()
Return a vbool8 the is 'false' for all values.
friend vint16 operator-(const vint16 &a)
friend vbool4 operator&(const vbool4 &a, const vbool4 &b)
friend vbool16 operator&(const vbool16 &a, const vbool16 &b)
vbool4 vbool_t
bool type of the same length
friend vfloat8 operator%(const vfloat8 &a, const vfloat8 &b)
vbool8 hi() const
Extract the higher precision vbool8.
friend vint8 operator+(const vint8 &a, const vint8 &b)
friend const vbool8 & operator^=(vbool8 &a, const vbool8 &b)
static const char * name()
const Imath::V3f & V3f() const
Cast to a Imath::V3f.
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
friend const vfloat16 & operator-=(vfloat16 &a, const vfloat16 &b)
friend vbool16 operator!=(const vbool16 &a, const vbool16 &b)
const vfloat16 & operator=(float a)
Assign a single value to all components.
vint4 rotl32(const vint4 &x, const unsigned int k)
friend const vbool8 & operator&=(vbool8 &a, const vbool8 &b)
vint4 floori(const vfloat4 &a)
OIIO_FORCEINLINE const vint4 & operator-=(vint4 &a, const vint4 &b)
friend const vint4 & operator|=(vint4 &a, const vint4 &b)
vint4(const vint4 &other)
Copy construct from another vint4.
friend vbool16 operator>=(const vfloat16 &a, const vfloat16 &b)
friend const vint16 & operator>>=(vint16 &a, unsigned int bits)
void setcomp(int i, int value)
Component access (set).
static const vint8 Iota(int start=0, int step=1)
friend vint4 operator>>(const vint4 &a, unsigned int bits)
OIIO_FORCEINLINE vint4 operator*(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
const value_t * data() const
Return a pointer to the underlying scalar type.
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
GLubyte GLubyte GLubyte GLubyte w
friend vbool8 operator==(const vfloat8 &a, const vfloat8 &b)
static const vint8 Giota()
Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER IMATH_HOSTDEVICE constexpr T abs(T a) IMATH_NOEXCEPT
vfloat4(const Imath::V4f &v)
Construct from a Imath::V4f.
friend const vfloat8 & operator*=(vfloat8 &a, const vfloat8 &b)
friend vfloat4 operator*(const vfloat4 &V, const matrix44 &M)
friend const vint8 & operator^=(vint8 &a, const vint8 &b)
vfloat3(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
const vfloat4 & operator+=(const vfloat4 &a)
friend vint4 operator*(const vint4 &a, const vint4 &b)
void load(bool a)
Helper: load a single value into all components.
friend const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
static const char * name()
friend vbool4 operator~(const vbool4 &a)
const vfloat3 & operator+=(const vfloat3 &a)
void store_mask(const vbool_t &mask, value_t *values) const
friend vbool4 operator<=(const vint4 &a, const vint4 &b)
#define OIIO_NAMESPACE_END
friend const vint16 & operator|=(vint16 &a, const vint16 &b)
void load(float val)
Helper: load a single value into all components.
vint4 min(const vint4 &a, const vint4 &b)
friend const vint8 & operator&=(vint8 &a, const vint8 &b)
friend std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
Stream output.
static const char * name()
bool value_t
Underlying equivalent scalar value type.
bool value_t
Underlying equivalent scalar value type.
vfloat4 hi() const
Extract the higher precision vfloat4.
void load_mask(const vbool_t &mask, const value_t *values)
OIIO_FORCEINLINE T log(const T &v)
friend vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
friend vfloat8 operator*(const vfloat8 &a, const vfloat8 &b)
friend vbool8 operator<=(const vint8 &a, const vint8 &b)
OIIO_FORCEINLINE vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
vfloat4 rsqrt(const vfloat4 &a)
Fully accurate 1/sqrt.
vbool16(bool a)
Construct from a single value (store it in all slots)
friend vbool4 operator<=(const vfloat4 &a, const vfloat4 &b)
static const vint8 One()
Return an vint8 with all components set to 1.
void load(int a)
Helper: load a single int into all components.
vint4 vint_t
int type of the same length
vfloat16(const short *vals)
Construct from a pointer to short values.
OIIO_FORCEINLINE matrix44(float f00, float f01, float f02, float f03, float f10, float f11, float f12, float f13, float f20, float f21, float f22, float f23, float f30, float f31, float f32, float f33)
Construct from 16 floats.
ImageBuf OIIO_API zero(ROI roi, int nthreads=0)
vint4 rint(const vfloat4 &a)
const Imath::V4f & V4f() const
Cast to a Imath::V4f.
void setcomp(int i, float value)
Component access (set).
simd_raw_t< float, 16 >::type simd_t
the native SIMD type used
vfloat4 sign(const vfloat4 &a)
1.0 when value >= 0, -1 when negative
static const vint16 Zero()
Return an vint16 with all components set to 0.
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vbool16 operator>(const vfloat16 &a, const vfloat16 &b)
OIIO_FORCEINLINE const vint4 & operator*=(vint4 &a, const vint4 &b)
bool all(const vbool4 &v)
void store(float *values) const
friend const vint4 & operator/=(vint4 &a, const vint4 &b)
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
vfloat8(const simd_t &m)
Construct from the underlying SIMD type.
vint4 andnot(const vint4 &a, const vint4 &b)
andnot(a,b) returns ((~a) & b)
vfloat3(const char *vals)
Construct from a pointer to 4 char values.
void store(int *values) const
Store the values into memory.
OIIO_FORCEINLINE matrix44(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d)
Construct from 4 vfloat4 rows.
void normalize()
Normalize in place.
static const vint16 Iota(int start=0, int step=1)
friend vbool4 operator>=(const vint4 &a, const vint4 &b)
bool operator==(const matrix44 &m) const
vfloat4 xyz1() const
Return xyz components, plus 1 for w.
const vfloat3 & operator=(float a)
Assign a single value to all components.
static const vfloat16 Iota(float start=0.0f, float step=1.0f)
OIIO_FORCEINLINE vint4 operator-(const vint4 &a)
friend vbool4 operator==(const vint4 &a, const vint4 &b)
friend std::ostream & operator<<(std::ostream &cout, const vfloat16 &val)
Stream output.
friend std::ostream & operator<<(std::ostream &cout, const vbool8 &a)
Stream output.
friend std::ostream & operator<<(std::ostream &cout, const vbool16 &a)
Stream output.
vfloat8(const unsigned char *vals)
Construct from a pointer to unsigned char values.
friend vbool16 operator==(const vint16 &a, const vint16 &b)
friend vint16 operator~(const vint16 &a)
int operator[](int i) const
Component access (get)
vint4 ifloor(const vfloat4 &a)
(int)floor
friend vbool4 operator==(const vbool4 &a, const vbool4 &b)
Comparison operators, component by component.
float dot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b.
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vbool8 operator==(const vint8 &a, const vint8 &b)
OIIO_FORCEINLINE vint4 operator+(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE const vint4 & operator%=(vint4 &a, const vint4 &b)
friend vbool4 operator>(const vfloat4 &a, const vfloat4 &b)
vint4 AxBxCxDx(const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d)
vint8 hi() const
Extract the higher precision vint8.
friend vint8 operator%(const vint8 &a, const vint8 &b)
void setcomp(int i, bool value)
Component access (set).
vfloat4 xyz0() const
Return xyz components, plus 0 for w.
simd_bool_t< 4 >::type simd_t
the native SIMD type used
void setcomp(int i, int value)
Component access (set).
int reduce_add(const vint4 &v)
#define OIIO_NAMESPACE_BEGIN
friend std::ostream & operator<<(std::ostream &cout, const vfloat3 &val)
Stream output.
void store(float *values) const
friend const vint8 & operator<<=(vint8 &a, unsigned int bits)
static const char * type_name()
friend vbool4 operator>=(const vfloat4 &a, const vfloat4 &b)
friend const vint8 & operator|=(vint8 &a, const vint8 &b)