HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
simd.h
Go to the documentation of this file.
1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio
4 
5 /// @file simd.h
6 ///
7 /// @brief Classes for SIMD processing.
8 ///
9 /// Nice references for all the Intel intrinsics (SSE*, AVX*, etc.):
10 /// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
11 ///
12 /// Similar guide for ARM intrinsics:
13 /// https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
14 ///
15 /// It helped me a lot to peruse the source of these packages:
16 /// Syrah: https://github.com/boulos/syrah
17 /// Embree: https://github.com/embree
18 /// Vectorial: https://github.com/scoopr/vectorial
19 ///
20 /// To find out which CPU features you have:
21 /// Linux: cat /proc/cpuinfo
22 /// OSX: sysctl machdep.cpu.features
23 ///
24 /// Additional web resources:
25 /// http://www.codersnotes.com/notes/maths-lib-2016/
26 
27 // clang-format off
28 
29 #pragma once
30 
31 #include <algorithm>
32 #include <cstring>
33 
34 #include <OpenImageIO/Imath.h>
35 #include <OpenImageIO/dassert.h>
36 #include <OpenImageIO/platform.h>
37 
38 
39 //////////////////////////////////////////////////////////////////////////
40 // Sort out which SIMD capabilities we have and set definitions
41 // appropriately. This is mostly for internal (within this file) use,
42 // but client applications using this header may find a few of the macros
43 // we define to be useful:
44 //
45 // OIIO_SIMD : Will be 0 if no hardware SIMD support is specified. If SIMD
46 // hardware is available, this will hold the width in number of
47 // float SIMD "lanes" of widest SIMD registers available. For
48 // example, OIIO_SIMD will be 4 if vfloat4/vint4/vbool4 are
49 // hardware accelerated, 8 if vfloat8/vint8/vbool8 are accelerated,
50 // etc. Using SIMD classes wider than this should work (will be
51 // emulated with narrower SIMD or scalar operations), but is not
52 // expected to have high performance.
53 // OIIO_SIMD_SSE : if Intel SSE is supported, this will be nonzero,
54 // specifically 2 for SSE2, 3 for SSSE3, 4 for SSE4.1 or
55 // higher (including AVX).
56 // OIIO_SIMD_AVX : If Intel AVX is supported, this will be nonzero, and
57 // specifically 1 for AVX (1.0), 2 for AVX2, 512 for AVX512f.
58 // OIIO_SIMD_NEON : If ARM NEON is supported, this will be nonzero.
59 // OIIO_SIMD_MAX_SIZE : holds the width in bytes of the widest SIMD
60 // available (generally will be OIIO_SIMD*4).
61 // OIIO_SIMD4_ALIGN : macro for best alignment of 4-wide SIMD values in mem.
62 // OIIO_SIMD8_ALIGN : macro for best alignment of 8-wide SIMD values in mem.
63 // OIIO_SIMD16_ALIGN : macro for best alignment of 16-wide SIMD values in mem.
64 // OIIO_SIMD_HAS_MATRIX4 : nonzero if matrix44 is defined
65 // OIIO_SIMD_HAS_SIMD8 : nonzero if vfloat8, vint8, vbool8 are defined
66 // OIIO_SIMD_HAS_SIMD16 : nonzero if vfloat16, vint16, vbool16 are defined
67 
68 #if defined(__CUDA_ARCH__)
69  // Cuda -- don't include any of these headers
70 #elif defined(_WIN32)
71 # include <intrin.h>
72 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) || defined(__e2k__)
73 # include <x86intrin.h>
74 #elif defined(__GNUC__) && defined(__ARM_NEON__)
75 # include <arm_neon.h>
76 #endif
77 
78 // Disable SSE for 32 bit Windows platforms, it's unreliable and hard for us
79 // to test thoroughly. We presume that anybody needing high performance
80 // badly enough to want SIMD also is on a 64 bit CPU.
81 #if defined(_WIN32) && defined(__i386__) && !defined(__x86_64__) && !defined(OIIO_NO_SSE)
82 #define OIIO_NO_SSE 1
83 #endif
84 
85 // Make sure to disable SSE intrinsics when compiling for Cuda.
86 #if defined(__CUDA_ARCH__) && !defined(OIIO_NO_SSE)
87 #define OIIO_NO_SSE 1
88 #endif
89 
90 #if (defined(__SSE2__) || (_MSC_VER >= 1300 && !_M_CEE_PURE)) && !defined(OIIO_NO_SSE)
91 # if (defined(__SSE4_1__) || defined(__SSE4_2__))
92 # define OIIO_SIMD_SSE 4
93  /* N.B. We consider both SSE4.1 and SSE4.2 to be "4". There are a few
94  * instructions specific to 4.2, but they are all related to string
95  * comparisons and CRCs, which don't currently seem relevant to OIIO,
96  * so for simplicity, we sweep this difference under the rug.
97  */
98 # elif defined(__SSSE3__)
99 # define OIIO_SIMD_SSE 3
100  /* N.B. We only use OIIO_SIMD_SSE = 3 when fully at SSSE3. In theory,
101  * there are a few older architectures that are SSE3 but not SSSE3,
102  * and this simplification means that these particular old platforms
103  * will only get SSE2 goodness out of our code. So be it. Anybody who
104  * cares about performance is probably using a 64 bit machine that's
105  * SSE 4.x or AVX by now.
106  */
107 # else
108 # define OIIO_SIMD_SSE 2
109 # endif
110 # define OIIO_SIMD 4
111 # define OIIO_SIMD_MAX_SIZE_BYTES 16
112 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
113 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
114 #else
115 # define OIIO_SIMD_SSE 0
116 #endif
117 
118 #if defined(__AVX__) && !defined(OIIO_NO_AVX)
119  // N.B. Any machine with AVX will also have SSE
120 # if defined(__AVX2__) && !defined(OIIO_NO_AVX2)
121 # define OIIO_SIMD_AVX 2
122 # else
123 # define OIIO_SIMD_AVX 1
124 # endif
125 # undef OIIO_SIMD
126 # define OIIO_SIMD 8
127 # undef OIIO_SIMD_MAX_SIZE_BYTES
128 # define OIIO_SIMD_MAX_SIZE_BYTES 32
129 # define OIIO_SIMD8_ALIGN OIIO_ALIGN(32)
130 # define OIIO_AVX_ALIGN OIIO_ALIGN(32)
131 # if defined(__AVX512F__)
132 # undef OIIO_SIMD_AVX
133 # define OIIO_SIMD_AVX 512
134 # undef OIIO_SIMD_MAX_SIZE_BYTES
135 # define OIIO_SIMD_MAX_SIZE_BYTES 64
136 # undef OIIO_SIMD
137 # define OIIO_SIMD 16
138 # define OIIO_SIMD16_ALIGN OIIO_ALIGN(64)
139 # define OIIO_AVX512_ALIGN OIIO_ALIGN(64)
140 # define OIIO_AVX512F_ENABLED 1
141 # endif
142 # if defined(__AVX512DQ__)
143 # define OIIO_AVX512DQ_ENABLED 1 /* Doubleword and quadword */
144 # else
145 # define OIIO_AVX512DQ_ENABLED 0
146 # endif
147 # if defined(__AVX512PF__)
148 # define OIIO_AVX512PF_ENABLED 1 /* Prefetch */
149 # else
150 # define OIIO_AVX512PF_ENABLED 0
151 # endif
152 # if defined(__AVX512ER__)
153 # define OIIO_AVX512ER_ENABLED 1 /* Exponential & reciprocal */
154 # else
155 # define OIIO_AVX512ER_ENABLED 0
156 # endif
157 # if defined(__AVX512CD__)
158 # define OIIO_AVX512CD_ENABLED 1 /* Conflict detection */
159 # else
160 # define OIIO_AVX512CD_ENABLED 0
161 # endif
162 # if defined(__AVX512BW__)
163 # define OIIO_AVX512BW_ENABLED 1 /* Byte and word */
164 # else
165 # define OIIO_AVX512BW_ENABLED 0
166 # endif
167 # if defined(__AVX512VL__)
168 # define OIIO_AVX512VL_ENABLED 1 /* Vector length extensions */
169 # else
170 # define OIIO_AVX512VL_ENABLED 0
171 # endif
172 #else
173 # define OIIO_SIMD_AVX 0
174 # define OIIO_AVX512VL_ENABLED 0
175 # define OIIO_AVX512DQ_ENABLED 0
176 # define OIIO_AVX512PF_ENABLED 0
177 # define OIIO_AVX512ER_ENABLED 0
178 # define OIIO_AVX512CD_ENABLED 0
179 # define OIIO_AVX512BW_ENABLED 0
180 #endif
181 
182 #if defined(__FMA__)
183 # define OIIO_FMA_ENABLED 1
184 #else
185 # define OIIO_FMA_ENABLED 0
186 #endif
187 #if defined(__AVX512IFMA__)
188 # define OIIO_AVX512IFMA_ENABLED 1
189 #else
190 # define OIIO_AVX512IFMA_ENABLED 0
191 #endif
192 
193 #if defined(__F16C__)
194 # define OIIO_F16C_ENABLED 1
195 #else
196 # define OIIO_F16C_ENABLED 0
197 #endif
198 
199 // FIXME Future: support ARM Neon
200 // Uncomment this when somebody with Neon can verify it works
201 #if defined(__ARM_NEON__) && !defined(OIIO_NO_NEON)
202 # define OIIO_SIMD 4
203 # define OIIO_SIMD_NEON 1
204 # define OIIO_SIMD_MAX_SIZE_BYTES 16
205 # define OIIO_SIMD4_ALIGN OIIO_ALIGN(16)
206 # define OIIO_SSE_ALIGN OIIO_ALIGN(16)
207 #else
208 # define OIIO_SIMD_NEON 0
209 #endif
210 
211 #ifndef OIIO_SIMD
212  // No SIMD available
213 # define OIIO_SIMD 0
214 # define OIIO_SIMD4_ALIGN
215 # define OIIO_SIMD_MAX_SIZE_BYTES 16
216 #endif
217 
218 #ifndef OIIO_SIMD8_ALIGN
219 # define OIIO_SIMD8_ALIGN OIIO_SIMD4_ALIGN
220 #endif
221 #ifndef OIIO_SIMD16_ALIGN
222 # define OIIO_SIMD16_ALIGN OIIO_SIMD8_ALIGN
223 #endif
224 
225 
226 // General features that client apps may want to test for, for conditional
227 // compilation. Will add to this over time as needed. Note that just
228 // because a feature is present doesn't mean it's fast -- HAS_SIMD8 means
229 // the vfloat8 class (and friends) are in this version of simd.h, but that's
230 // different from OIIO_SIMD >= 8, which means it's supported in hardware.
231 #define OIIO_SIMD_HAS_MATRIX4 1 /* matrix44 defined */
232 #define OIIO_SIMD_HAS_FLOAT8 1 /* DEPRECATED(1.8) */
233 #define OIIO_SIMD_HAS_SIMD8 1 /* vfloat8, vint8, vbool8 defined */
234 #define OIIO_SIMD_HAS_SIMD16 1 /* vfloat16, vint16, vbool16 defined */
235 
236 
237 // Embarrassing hack: Xlib.h #define's True and False!
238 #ifdef True
239 # undef True
240 #endif
241 #ifdef False
242 # undef False
243 #endif
244 
245 
246 
248 
249 namespace simd {
250 
251 //////////////////////////////////////////////////////////////////////////
252 // Forward declarations of our main SIMD classes
253 
254 class vbool4;
255 class vint4;
256 class vfloat4;
257 class vfloat3;
258 class matrix44;
259 class vbool8;
260 class vint8;
261 class vfloat8;
262 class vbool16;
263 class vint16;
264 class vfloat16;
265 
266 // Deprecated names -- remove these in 1.9
267 typedef vbool4 mask4; // old name
268 typedef vbool4 bool4;
269 typedef vbool8 bool8;
270 typedef vint4 int4;
271 typedef vint8 int8;
272 typedef vfloat3 float3;
273 typedef vfloat4 float4;
274 typedef vfloat8 float8;
275 
276 
277 
278 //////////////////////////////////////////////////////////////////////////
279 // Template magic to determine the raw SIMD types involved, and other
280 // things helpful for metaprogramming.
281 
282 template <typename T, int N> struct simd_raw_t { struct type { T val[N]; }; };
283 template <int N> struct simd_bool_t { struct type { int val[N]; }; };
284 
285 #if OIIO_SIMD_SSE
286 template<> struct simd_raw_t<int,4> { typedef __m128i type; };
287 template<> struct simd_raw_t<float,4> { typedef __m128 type; };
288 template<> struct simd_bool_t<4> { typedef __m128 type; };
289 #endif
290 
291 #if OIIO_SIMD_AVX
292 template<> struct simd_raw_t<int,8> { typedef __m256i type; };
293 template<> struct simd_raw_t<float,8> { typedef __m256 type; };
294 template<> struct simd_bool_t<8> { typedef __m256 type; };
295 #endif
296 
297 #if OIIO_SIMD_AVX >= 512
298 template<> struct simd_raw_t<int,16> { typedef __m512i type; };
299 template<> struct simd_raw_t<float,16> { typedef __m512 type; };
300 template<> struct simd_bool_t<16> { typedef __mmask16 type; };
301 #else
302 // Note: change in strategy for 16-wide SIMD: instead of int[16] for
303 // vbool16, it's just a plain old bitmask, and __mmask16 for actual HW.
304 template<> struct simd_bool_t<16> { typedef uint16_t type; };
305 #endif
306 
307 #if OIIO_SIMD_NEON
308 template<> struct simd_raw_t<int,4> { typedef int32x4_t type; };
309 template<> struct simd_raw_t<float,4> { typedef float32x4_t type; };
310 template<> struct simd_bool_t<4> { typedef uint32x4_t type; };
311 #endif
312 
313 
314 /// Template to retrieve the vector type from the scalar. For example,
315 /// simd::VecType<int,4> will be vint4.
316 template<typename T,int elements> struct VecType {};
317 template<> struct VecType<int,1> { typedef int type; };
318 template<> struct VecType<float,1> { typedef float type; };
319 template<> struct VecType<int,4> { typedef vint4 type; };
320 template<> struct VecType<float,4> { typedef vfloat4 type; };
321 template<> struct VecType<float,3> { typedef vfloat3 type; };
322 template<> struct VecType<bool,4> { typedef vbool4 type; };
323 template<> struct VecType<int,8> { typedef vint8 type; };
324 template<> struct VecType<float,8> { typedef vfloat8 type; };
325 template<> struct VecType<bool,8> { typedef vbool8 type; };
326 template<> struct VecType<int,16> { typedef vint16 type; };
327 template<> struct VecType<float,16> { typedef vfloat16 type; };
328 template<> struct VecType<bool,16> { typedef vbool16 type; };
329 
330 /// Template to retrieve the SIMD size of a SIMD type. Rigged to be 1 for
331 /// anything but our SIMD types.
332 template<typename T> struct SimdSize { static const int size = 1; };
333 template<> struct SimdSize<vint4> { static const int size = 4; };
334 template<> struct SimdSize<vfloat4> { static const int size = 4; };
335 template<> struct SimdSize<vfloat3> { static const int size = 4; };
336 template<> struct SimdSize<vbool4> { static const int size = 4; };
337 template<> struct SimdSize<vint8> { static const int size = 8; };
338 template<> struct SimdSize<vfloat8> { static const int size = 8; };
339 template<> struct SimdSize<vbool8> { static const int size = 8; };
340 template<> struct SimdSize<vint16> { static const int size = 16; };
341 template<> struct SimdSize<vfloat16> { static const int size = 16; };
342 template<> struct SimdSize<vbool16> { static const int size = 16; };
343 
344 /// Template to retrieve the number of elements of a SIMD type. Rigged
345 /// to be 1 for anything but our SIMD types.
346 template<typename T> struct SimdElements { static const int size = SimdSize<T>::size; };
347 template<> struct SimdElements<vfloat3> { static const int size = 3; };
348 
349 /// Template giving a printable name for each type
350 template<typename T> struct SimdTypeName { static const char *name() { return "unknown"; } };
351 template<> struct SimdTypeName<vfloat4> { static const char *name() { return "vfloat4"; } };
352 template<> struct SimdTypeName<vint4> { static const char *name() { return "vint4"; } };
353 template<> struct SimdTypeName<vbool4> { static const char *name() { return "vbool4"; } };
354 template<> struct SimdTypeName<vfloat8> { static const char *name() { return "vfloat8"; } };
355 template<> struct SimdTypeName<vint8> { static const char *name() { return "vint8"; } };
356 template<> struct SimdTypeName<vbool8> { static const char *name() { return "vbool8"; } };
357 template<> struct SimdTypeName<vfloat16> { static const char *name() { return "vfloat16"; } };
358 template<> struct SimdTypeName<vint16> { static const char *name() { return "vint16"; } };
359 template<> struct SimdTypeName<vbool16> { static const char *name() { return "vbool16"; } };
360 
361 
362 //////////////////////////////////////////////////////////////////////////
363 // Macros helpful for making static constants in code.
364 
365 # define OIIO_SIMD_FLOAT4_CONST(name,val) \
366  static const OIIO_SIMD4_ALIGN float name[4] = { (val), (val), (val), (val) }
367 # define OIIO_SIMD_FLOAT4_CONST4(name,v0,v1,v2,v3) \
368  static const OIIO_SIMD4_ALIGN float name[4] = { (v0), (v1), (v2), (v3) }
369 # define OIIO_SIMD_INT4_CONST(name,val) \
370  static const OIIO_SIMD4_ALIGN int name[4] = { (val), (val), (val), (val) }
371 # define OIIO_SIMD_INT4_CONST4(name,v0,v1,v2,v3) \
372  static const OIIO_SIMD4_ALIGN int name[4] = { (v0), (v1), (v2), (v3) }
373 # define OIIO_SIMD_UINT4_CONST(name,val) \
374  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (val), (val), (val), (val) }
375 # define OIIO_SIMD_UINT4_CONST4(name,v0,v1,v2,v3) \
376  static const OIIO_SIMD4_ALIGN uint32_t name[4] = { (v0), (v1), (v2), (v3) }
377 
378 # define OIIO_SIMD_FLOAT8_CONST(name,val) \
379  static const OIIO_SIMD8_ALIGN float name[8] = { (val), (val), (val), (val), \
380  (val), (val), (val), (val) }
381 # define OIIO_SIMD_FLOAT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
382  static const OIIO_SIMD8_ALIGN float name[8] = { (v0), (v1), (v2), (v3), \
383  (v4), (v5), (v6), (v7) }
384 # define OIIO_SIMD_INT8_CONST(name,val) \
385  static const OIIO_SIMD8_ALIGN int name[8] = { (val), (val), (val), (val), \
386  (val), (val), (val), (val) }
387 # define OIIO_SIMD_INT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
388  static const OIIO_SIMD8_ALIGN int name[8] = { (v0), (v1), (v2), (v3), \
389  (v4), (v5), (v6), (v7) }
390 # define OIIO_SIMD_UINT8_CONST(name,val) \
391  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (val), (val), (val), (val), \
392  (val), (val), (val), (val) }
393 # define OIIO_SIMD_UINT8_CONST8(name,v0,v1,v2,v3,v4,v5,v6,v7) \
394  static const OIIO_SIMD8_ALIGN uint32_t name[8] = { (v0), (v1), (v2), (v3), \
395  (v4), (v5), (v6), (v7) }
396 
397 # define OIIO_SIMD_VFLOAT16_CONST(name,val) \
398  static const OIIO_SIMD16_ALIGN float name[16] = { \
399  (val), (val), (val), (val), (val), (val), (val), (val), \
400  (val), (val), (val), (val), (val), (val), (val), (val) }
401 # define OIIO_SIMD_VFLOAT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
402  static const OIIO_SIMD16_ALIGN float name[16] = { \
403  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
404  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
405 # define OIIO_SIMD_INT16_CONST(name,val) \
406  static const OIIO_SIMD16_ALIGN int name[16] = { \
407  (val), (val), (val), (val), (val), (val), (val), (val), \
408  (val), (val), (val), (val), (val), (val), (val), (val) }
409 # define OIIO_SIMD_INT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
410  static const OIIO_SIMD16_ALIGN int name[16] = { \
411  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
412  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) }
413 # define OIIO_SIMD_UINT16_CONST(name,val) \
414  static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
415  (val), (val), (val), (val), (val), (val), (val), (val), \
416  (val), (val), (val), (val), (val), (val), (val), (val) }
417 # define OIIO_SIMD_UINT16_CONST16(name,v0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15) \
418  static const OIIO_SIMD16_ALIGN uint32_t name[16] = { \
419  (v0), (v1), (v2), (v3), (v4), (v5), (v6), (v7), \
420  (v8), (v9), (v10), (v11), (v12), (v13), (v14), (v15) } /* BUGFIX: was expanding (val) 16x, ignoring v0..v15; now mirrors OIIO_SIMD_INT16_CONST16 */
421 
422 
423 //////////////////////////////////////////////////////////////////////////
424 // Some macros just for use in this file (#undef-ed at the end) making
425 // it more succinct to express per-element operations.
426 
427 #define SIMD_DO(x) for (int i = 0; i < elements; ++i) x
428 #define SIMD_CONSTRUCT(x) for (int i = 0; i < elements; ++i) m_val[i] = (x)
429 #define SIMD_CONSTRUCT_PAD(x) for (int i = 0; i < elements; ++i) m_val[i] = (x); \
430  for (int i = elements; i < paddedelements; ++i) m_val[i] = 0
431 #define SIMD_RETURN(T,x) T r; for (int i = 0; i < r.elements; ++i) r[i] = (x); return r
432 #define SIMD_RETURN_REDUCE(T,init,op) T r = init; for (int i = 0; i < v.elements; ++i) op; return r
433 
434 
435 
436 //////////////////////////////////////////////////////////////////////////
437 //////////////////////////////////////////////////////////////////////////
438 // The public declarations of the main SIMD classes follow: boolN, intN,
439 // floatN, matrix44.
440 //
441 // These class declarations are intended to be brief and self-documenting,
442 // and give all the information that users or client applications need to
443 // know to use these classes.
444 //
445 // No implementations are given inline except for the briefest, completely
446 // generic methods that don't have any architecture-specific overloads.
447 // After the class defintions, there will be an immense pile of full
448 // implementation definitions, which casual users are not expected to
449 // understand.
450 //////////////////////////////////////////////////////////////////////////
451 //////////////////////////////////////////////////////////////////////////
452 
453 
454 /// vbool4: A 4-vector whose elements act mostly like bools, accelerated by
455 /// SIMD instructions when available. This is what is naturally produced by
456 /// SIMD comparison operators on the vfloat4 and vint4 types.
457 class vbool4 {
458 public:
459  static const char* type_name() { return "vbool4"; }
460  typedef bool value_t; ///< Underlying equivalent scalar value type
461  enum { elements = 4 }; ///< Number of scalar elements
462  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
463  enum { bits = elements*32 }; ///< Total number of bits
464  typedef simd_bool_t<4>::type simd_t; ///< the native SIMD type used
465 
466  /// Default constructor (contents undefined)
467  vbool4 () { }
468 
469  /// Construct from a single value (store it in all slots)
470  vbool4 (bool a) { load(a); }
471 
472  explicit vbool4 (const bool *a);
473 
474  /// Construct from 4 bool values
475  vbool4 (bool a, bool b, bool c, bool d) { load (a, b, c, d); }
476 
477  /// Copy construct from another vbool4
478  vbool4 (const vbool4 &other) { m_simd = other.m_simd; }
479 
480  /// Construct from 4 int values
481  vbool4 (int a, int b, int c, int d) {
482  load (bool(a), bool(b), bool(c), bool(d));
483  }
484 
485  /// Construct from a SIMD int (is each element nonzero?)
486  vbool4 (const vint4 &i);
487 
488  /// Construct from the underlying SIMD type
489  vbool4 (const simd_t& m) : m_simd(m) { }
490 
491  /// Return the raw SIMD type
492  operator simd_t () const { return m_simd; }
493  simd_t simd () const { return m_simd; }
494  simd_t& simd () { return m_simd; }
495 
496  /// Extract the bitmask
497  int bitmask () const;
498 
499  /// Convert from integer bitmask to a true vbool4
500  static vbool4 from_bitmask (int bitmask);
501 
502  /// Set all components to false
503  void clear ();
504 
505  /// Return a vbool4 that is 'false' for all values
506  static const vbool4 False ();
507 
508  /// Return a vbool4 that is 'true' for all values
509  static const vbool4 True ();
510 
511  /// Assign one value to all components
512  const vbool4 & operator= (bool a) { load(a); return *this; }
513 
514  /// Assignment of another vbool4
515  const vbool4 & operator= (const vbool4 & other);
516 
517  /// Component access (get)
518  int operator[] (int i) const;
519 
520  /// Component access (set).
521  void setcomp (int i, bool value);
522 
523  /// Component access (set).
524  /// NOTE: avoid this unsafe construct. It will go away some day.
525  int& operator[] (int i);
526 
527  /// Helper: load a single value into all components.
528  void load (bool a);
529 
530  /// Helper: load separate values into each component.
531  void load (bool a, bool b, bool c, bool d);
532 
533  /// Helper: store the values into memory as bools.
534  void store (bool *values) const;
535 
536  /// Store the first n values into memory.
537  void store (bool *values, int n) const;
538 
539  /// Logical/bitwise operators, component-by-component
540  friend vbool4 operator! (const vbool4& a);
541  friend vbool4 operator& (const vbool4& a, const vbool4& b);
542  friend vbool4 operator| (const vbool4& a, const vbool4& b);
543  friend vbool4 operator^ (const vbool4& a, const vbool4& b);
544  friend vbool4 operator~ (const vbool4& a);
545  friend const vbool4& operator&= (vbool4& a, const vbool4& b);
546  friend const vbool4& operator|= (vbool4& a, const vbool4& b);
547  friend const vbool4& operator^= (vbool4& a, const vbool4& b);
548 
549  /// Comparison operators, component by component
550  friend vbool4 operator== (const vbool4& a, const vbool4& b);
551  friend vbool4 operator!= (const vbool4& a, const vbool4& b);
552 
553  /// Stream output
554  friend std::ostream& operator<< (std::ostream& cout, const vbool4 & a);
555 
556 private:
557  // The actual data representation
558  union {
561  };
562 };
563 
564 
565 
566 /// Helper: shuffle/swizzle with constant (templated) indices.
567 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
568 template<int i0, int i1, int i2, int i3>
569 OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
570 
571 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
572 template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a);
573 
574 /// Helper: as rapid as possible extraction of one component, when the
575 /// index is fixed.
576 template<int i> OIIO_FORCEINLINE bool extract (const vbool4& a);
577 
578 /// Helper: substitute val for a[i]
579 template<int i> OIIO_FORCEINLINE vbool4 insert (const vbool4& a, bool val);
580 
581 /// Logical reduction across all components.
582 bool reduce_and (const vbool4& v);
583 bool reduce_or (const vbool4& v);
584 
585 // Are all/any/no components true?
586 bool all (const vbool4& v);
587 bool any (const vbool4& v);
588 bool none (const vbool4& v);
589 
590 // It's handy to have this defined for regular bool as well
591 inline bool all (bool v) { return v; }
592 
593 
594 
595 /// vbool8: An 8-vector whose elements act mostly like bools, accelerated by
596 /// SIMD instructions when available. This is what is naturally produced by
597 /// SIMD comparison operators on the vfloat8 and vint8 types.
598 class vbool8 {
599 public:
600  static const char* type_name() { return "vbool8"; }
601  typedef bool value_t; ///< Underlying equivalent scalar value type
602  enum { elements = 8 }; ///< Number of scalar elements
603  enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
604  enum { bits = elements*32 }; ///< Total number of bits
605  typedef simd_bool_t<8>::type simd_t; ///< the native SIMD type used
606 
607  /// Default constructor (contents undefined)
608  vbool8 () { }
609 
610  /// Construct from a single value (store it in all slots)
611  vbool8 (bool a) { load (a); }
612 
613  explicit vbool8 (const bool *values);
614 
615  /// Construct from 8 bool values
616  vbool8 (bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h);
617 
618  /// Copy construct from another vbool8
619  vbool8 (const vbool8 &other) { m_simd = other.m_simd; }
620 
621  /// Construct from 8 int values
622  vbool8 (int a, int b, int c, int d, int e, int f, int g, int h);
623 
624  /// Construct from a SIMD int (is each element nonzero?)
625  vbool8 (const vint8 &i);
626 
627  /// Construct from two vbool4's
628  vbool8 (const vbool4 &lo, const vbool4 &hi);
629 
630  /// Construct from the underlying SIMD type
631  vbool8 (const simd_t& m) : m_simd(m) { }
632 
633  /// Return the raw SIMD type
634  operator simd_t () const { return m_simd; }
635  simd_t simd () const { return m_simd; }
636  simd_t& simd () { return m_simd; }
637 
638  /// Extract the bitmask
639  int bitmask () const;
640 
641  /// Convert from integer bitmask to a true vbool8
642  static vbool8 from_bitmask (int bitmask);
643 
644  /// Set all components to false
645  void clear ();
646 
647  /// Return a vbool8 that is 'false' for all values
648  static const vbool8 False ();
649 
650  /// Return a vbool8 that is 'true' for all values
651  static const vbool8 True ();
652 
653  /// Assign one value to all components
654  const vbool8 & operator= (bool a);
655 
656  /// Assignment of another vbool8
657  const vbool8 & operator= (const vbool8 & other);
658 
659  /// Component access (get)
660  int operator[] (int i) const;
661 
662  /// Component access (set).
663  void setcomp (int i, bool value);
664 
665  /// Component access (set).
666  /// NOTE: avoid this unsafe construct. It will go away some day.
667  int& operator[] (int i);
668 
669  /// Extract the lower precision vbool4
670  vbool4 lo () const;
671 
672  /// Extract the higher precision vbool4
673  vbool4 hi () const;
674 
675  /// Helper: load a single value into all components.
676  void load (bool a);
677 
678  /// Helper: load separate values into each component.
679  void load (bool a, bool b, bool c, bool d,
680  bool e, bool f, bool g, bool h);
681 
682  /// Helper: store the values into memory as bools.
683  void store (bool *values) const;
684 
685  /// Store the first n values into memory.
686  void store (bool *values, int n) const;
687 
688  /// Logical/bitwise operators, component-by-component
689  friend vbool8 operator! (const vbool8& a);
690  friend vbool8 operator& (const vbool8& a, const vbool8& b);
691  friend vbool8 operator| (const vbool8& a, const vbool8& b);
692  friend vbool8 operator^ (const vbool8& a, const vbool8& b);
693  friend vbool8 operator~ (const vbool8& a);
694  friend const vbool8& operator&= (vbool8& a, const vbool8& b);
695  friend const vbool8& operator|= (vbool8& a, const vbool8& b);
696  friend const vbool8& operator^= (vbool8& a, const vbool8& b);
697 
698  /// Comparison operators, component by component
699  friend vbool8 operator== (const vbool8& a, const vbool8& b);
700  friend vbool8 operator!= (const vbool8& a, const vbool8& b);
701 
702  /// Stream output
703  friend std::ostream& operator<< (std::ostream& cout, const vbool8 & a);
704 
705 private:
706  // The actual data representation
707  union {
711  };
712 };
713 
714 
715 
716 /// Helper: shuffle/swizzle with constant (templated) indices.
717 /// Example: shuffle<1,1,2,2,5,5,6,6>(vbool8(a,b,c,d,e,f,g,h)) returns (b,b,c,c,f,f,g,g)
718 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
719 OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
720 
721 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
722 template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a);
723 
724 /// Helper: as rapid as possible extraction of one component, when the
725 /// index is fixed.
726 template<int i> OIIO_FORCEINLINE bool extract (const vbool8& a);
727 
728 /// Helper: substitute val for a[i]
729 template<int i> OIIO_FORCEINLINE vbool8 insert (const vbool8& a, bool val);
730 
731 /// Logical reduction across all components.
732 bool reduce_and (const vbool8& v);
733 bool reduce_or (const vbool8& v);
734 
735 // Are all/any/no components true?
736 bool all (const vbool8& v);
737 bool any (const vbool8& v);
738 bool none (const vbool8& v);
739 
740 
741 
742 
743 /// vbool16: A 16-vector whose elements act mostly like bools, accelerated
744 /// by SIMD instructions when available. This is what is naturally produced
745 /// by SIMD comparison operators on the vfloat16 and vint16 types.
746 class vbool16 {
747 public:
748  static const char* type_name() { return "vbool16"; }
749  typedef bool value_t; ///< Underlying equivalent scalar value type
750  enum { elements = 16 }; ///< Number of scalar elements
751  enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
752  enum { bits = 16 }; ///< Total number of bits
753  typedef simd_bool_t<16>::type simd_t; ///< the native SIMD type used
754 
755  /// Default constructor (contents undefined)
756  vbool16 () { }
757 
758  /// Construct from a single value (store it in all slots)
759  vbool16 (bool a) { load (a); }
760 
761  explicit vbool16 (int bitmask) { load_bitmask (bitmask); }
762 
763  explicit vbool16 (const bool *values);
764 
765  /// Construct from 16 bool values
766  vbool16 (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
767  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
768 
769  /// Copy construct from another vbool16
770  vbool16 (const vbool16 &other) { m_simd = other.m_simd; }
771 
772  /// Construct from 16 int values
773  vbool16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
774  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
775 
776  /// Construct from a SIMD int (is each element nonzero?)
777  vbool16 (const vint16 &i);
778 
779  /// Construct from two vbool8's
780  vbool16 (const vbool8 &lo, const vbool8 &hi);
781 
782  /// Construct from four vbool4's
783  vbool16 (const vbool4 &b4a, const vbool4 &b4b, const vbool4 &b4c, const vbool4 &b4d);
784 
785  /// Construct from the underlying SIMD type
786  vbool16 (const simd_t& m) : m_simd(m) { }
787 
788  /// Return the raw SIMD type
789  operator simd_t () const { return m_simd; }
790  simd_t simd () const { return m_simd; }
791  simd_t& simd () { return m_simd; }
792 
793  int bitmask () const;
794 
795  /// Convert from integer bitmask to a true vbool16
796  static vbool16 from_bitmask (int bitmask) { return vbool16(bitmask); }
797 
798  /// Set all components to false
799  void clear ();
800 
/// Return a vbool16 that is 'false' for all values
802  static const vbool16 False ();
803 
/// Return a vbool16 that is 'true' for all values
805  static const vbool16 True ();
806 
807  /// Assign one value to all components
808  const vbool16 & operator= (bool a);
809 
810  /// Assignment of another vbool16
811  const vbool16 & operator= (const vbool16 & other);
812 
813  /// Component access (get)
814  int operator[] (int i) const;
815 
816  /// Component access (set).
817  void setcomp (int i, bool value);
818 
819  /// Extract the lower precision vbool8
820  vbool8 lo () const;
821 
822  /// Extract the higher precision vbool8
823  vbool8 hi () const;
824 
825  /// Helper: load a single value into all components.
826  void load (bool a);
827 
828  /// Helper: load separate values into each component.
829  void load (bool v0, bool v1, bool v2, bool v3, bool v4, bool v5, bool v6, bool v7,
830  bool v8, bool v9, bool v10, bool v11, bool v12, bool v13, bool v14, bool v15);
831 
832  /// Helper: load all components from a bitmask in an int.
833  void load_bitmask (int a);
834 
835  /// Helper: store the values into memory as bools.
836  void store (bool *values) const;
837 
838  /// Store the first n values into memory.
839  void store (bool *values, int n) const;
840 
841  /// Logical/bitwise operators, component-by-component
842  friend vbool4 operator! (const vbool4& a);
843  friend vbool16 operator! (const vbool16& a);
844  friend vbool16 operator& (const vbool16& a, const vbool16& b);
845  friend vbool16 operator| (const vbool16& a, const vbool16& b);
846  friend vbool16 operator^ (const vbool16& a, const vbool16& b);
847  friend vbool16 operator~ (const vbool16& a);
848  friend const vbool16& operator&= (vbool16& a, const vbool16& b);
849  friend const vbool16& operator|= (vbool16& a, const vbool16& b);
850  friend const vbool16& operator^= (vbool16& a, const vbool16& b);
851 
852  /// Comparison operators, component by component
853  friend vbool16 operator== (const vbool16& a, const vbool16& b);
854  friend vbool16 operator!= (const vbool16& a, const vbool16& b);
855 
856  /// Stream output
857  friend std::ostream& operator<< (std::ostream& cout, const vbool16 & a);
858 
859 private:
860  // The actual data representation
861  union {
863  uint16_t m_bits;
864  };
865 };
866 
867 
868 
/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed (a compile-time template parameter).
template<int i> OIIO_FORCEINLINE bool extract (const vbool16& a);

/// Helper: return a copy of `a` in which component i is replaced by val
/// (the input is not modified).
template<int i> OIIO_FORCEINLINE vbool16 insert (const vbool16& a, bool val);

/// Logical reduction across all components: reduce_and is the AND of all
/// 16 components, reduce_or is the OR of all 16 components.
bool reduce_and (const vbool16& v);
bool reduce_or (const vbool16& v);

// Are all/any/no components true?
bool all (const vbool16& v);
bool any (const vbool16& v);
bool none (const vbool16& v);
884 
885 
886 
887 
888 
889 /// Integer 4-vector, accelerated by SIMD instructions when available.
890 class vint4 {
891 public:
892  static const char* type_name() { return "vint4"; }
893  typedef int value_t; ///< Underlying equivalent scalar value type
894  enum { elements = 4 }; ///< Number of scalar elements
895  enum { paddedelements =4 }; ///< Number of scalar elements for full pad
896  enum { bits = 128 }; ///< Total number of bits
897  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
898  typedef vbool4 vbool_t; ///< bool type of the same length
899  typedef vfloat4 vfloat_t; ///< float type of the same length
900  typedef vint4 vint_t; ///< int type of the same length
901  OIIO_DEPRECATED("use vbool_t (1.8)")
902  typedef vbool4 bool_t; // old name (deprecated 1.8)
903  OIIO_DEPRECATED("use vfloat_t (1.8)")
904  typedef vfloat4 float_t; // old name (deprecated 1.8)
905 
906  /// Default constructor (contents undefined)
907  vint4 () { }
908 
909  /// Construct from a single value (store it in all slots)
910  vint4 (int a);
911 
912  /// Construct from 2 values -- (a,a,b,b)
913  vint4 (int a, int b);
914 
915  /// Construct from 4 values
916  vint4 (int a, int b, int c, int d);
917 
918  /// Construct from a pointer to values
919  vint4 (const int *vals);
920 
921  /// Construct from a pointer to unsigned short values
922  explicit vint4 (const unsigned short *vals);
923 
924  /// Construct from a pointer to signed short values
925  explicit vint4 (const short *vals);
926 
927  /// Construct from a pointer to unsigned char values (0 - 255)
928  explicit vint4 (const unsigned char *vals);
929 
930  /// Construct from a pointer to signed char values (-128 - 127)
931  explicit vint4 (const char *vals);
932 
933  /// Copy construct from another vint4
934  vint4 (const vint4 & other) { m_simd = other.m_simd; }
935 
936  /// Convert a vfloat to an vint. Equivalent to i = (int)f;
937  explicit vint4 (const vfloat4& f); // implementation below
938 
939  /// Construct from the underlying SIMD type
940  vint4 (const simd_t& m) : m_simd(m) { }
941 
942  /// Return the raw SIMD type
943  operator simd_t () const { return m_simd; }
944  simd_t simd () const { return m_simd; }
945  simd_t& simd () { return m_simd; }
946 
947  /// Return a pointer to the underlying scalar type
948  const value_t* data () const { return (const value_t*)this; }
949  value_t* data () { return (value_t*)this; }
950 
951  /// Sset all components to 0
952  void clear () ;
953 
954  /// Return an vint4 with all components set to 0
955  static const vint4 Zero ();
956 
957  /// Return an vint4 with all components set to 1
958  static const vint4 One ();
959 
960  /// Return an vint4 with all components set to -1 (aka 0xffffffff)
961  static const vint4 NegOne ();
962 
963  /// Return an vint4 with incremented components (e.g., 0,1,2,3).
964  /// Optional arguments can give a non-zero starting point and step size.
965  static const vint4 Iota (int start=0, int step=1);
966 
967  /// Return an vint4 with "geometric" iota: (1, 2, 4, 8).
968  static const vint4 Giota ();
969 
970  /// Assign one value to all components.
971  const vint4 & operator= (int a);
972 
973  /// Assignment from another vint4
974  const vint4 & operator= (const vint4& other) ;
975 
976  /// Component access (get)
977  int operator[] (int i) const;
978 
979  /// Component access (set)
980  int& operator[] (int i);
981 
982  /// Component access (set).
983  void setcomp (int i, int value);
984 
985  value_t x () const;
986  value_t y () const;
987  value_t z () const;
988  value_t w () const;
989  void set_x (value_t val);
990  void set_y (value_t val);
991  void set_z (value_t val);
992  void set_w (value_t val);
993 
994  /// Helper: load a single int into all components
995  void load (int a);
996 
997  /// Helper: load separate values into each component.
998  void load (int a, int b, int c, int d);
999 
1000  /// Load from an array of 4 values
1001  void load (const int *values);
1002 
1003  void load (const int *values, int n) ;
1004 
1005  /// Load from an array of 4 unsigned short values, convert to vint4
1006  void load (const unsigned short *values) ;
1007 
1008  /// Load from an array of 4 unsigned short values, convert to vint4
1009  void load (const short *values);
1010 
1011  /// Load from an array of 4 unsigned char values, convert to vint4
1012  void load (const unsigned char *values);
1013 
1014  /// Load from an array of 4 unsigned char values, convert to vint4
1015  void load (const char *values);
1016 
1017  /// Store the values into memory
1018  void store (int *values) const;
1019 
1020  /// Store the first n values into memory
1021  void store (int *values, int n) const;
1022 
1023  /// Store the least significant 16 bits of each element into adjacent
1024  /// unsigned shorts.
1025  void store (unsigned short *values) const;
1026 
1027  /// Store the least significant 8 bits of each element into adjacent
1028  /// unsigned chars.
1029  void store (unsigned char *values) const;
1030 
1031  /// Masked load -- read from values[] where mask is 1, load zero where
1032  /// mask is 0.
1033  void load_mask (int mask, const value_t *values);
1034  void load_mask (const vbool_t& mask, const value_t *values);
1035 
1036  /// Masked store -- write to values[] where mask is enabled, don't
1037  /// touch values[] where it's not.
1038  void store_mask (int mask, value_t *values) const;
1039  void store_mask (const vbool_t& mask, value_t *values) const;
1040 
1041  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1042  template<int scale=4>
1043  void gather (const value_t *baseptr, const vint_t& vindex);
1044  /// Gather elements defined by the mask, leave others unchanged.
1045  template<int scale=4>
1046  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1047  template<int scale=4>
1048  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1049 
1050  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1051  template<int scale=4>
1052  void scatter (value_t *baseptr, const vint_t& vindex) const;
1053  /// Scatter elements defined by the mask
1054  template<int scale=4>
1055  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1056  template<int scale=4>
1057  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1058 
1059  // Arithmetic operators (component-by-component)
1060  friend vint4 operator+ (const vint4& a, const vint4& b);
1061  friend vint4 operator- (const vint4& a);
1062  friend vint4 operator- (const vint4& a, const vint4& b);
1063  friend vint4 operator* (const vint4& a, const vint4& b);
1064  friend vint4 operator/ (const vint4& a, const vint4& b);
1065  friend vint4 operator% (const vint4& a, const vint4& b);
1066  friend const vint4 & operator+= (vint4& a, const vint4& b);
1067  friend const vint4 & operator-= (vint4& a, const vint4& b);
1068  friend const vint4 & operator*= (vint4& a, const vint4& b);
1069  friend const vint4 & operator/= (vint4& a, const vint4& b);
1070  friend const vint4 & operator%= (vint4& a, const vint4& b);
1071  // Bitwise operators (component-by-component)
1072  friend vint4 operator& (const vint4& a, const vint4& b);
1073  friend vint4 operator| (const vint4& a, const vint4& b);
1074  friend vint4 operator^ (const vint4& a, const vint4& b);
1075  friend const vint4& operator&= (vint4& a, const vint4& b);
1076  friend const vint4& operator|= (vint4& a, const vint4& b);
1077  friend const vint4& operator^= (vint4& a, const vint4& b);
1078  friend vint4 operator~ (const vint4& a);
1079  friend vint4 operator<< (const vint4& a, unsigned int bits);
1080  friend vint4 operator>> (const vint4& a, unsigned int bits);
1081  friend const vint4& operator<<= (vint4& a, unsigned int bits);
1082  friend const vint4& operator>>= (vint4& a, unsigned int bits);
1083  // Comparison operators (component-by-component)
1084  friend vbool4 operator== (const vint4& a, const vint4& b);
1085  friend vbool4 operator!= (const vint4& a, const vint4& b);
1086  friend vbool4 operator< (const vint4& a, const vint4& b);
1087  friend vbool4 operator> (const vint4& a, const vint4& b);
1088  friend vbool4 operator>= (const vint4& a, const vint4& b);
1089  friend vbool4 operator<= (const vint4& a, const vint4& b);
1090 
1091  /// Stream output
1092  friend std::ostream& operator<< (std::ostream& cout, const vint4 & a);
1093 
1094 private:
1095  // The actual data representation
1096  union {
1099  };
1100 };
1101 
1102 
1103 
// Shift right logical -- unsigned shift. This differs from operator>>
// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
// srl((1<<31),1) == 1<<30.
vint4 srl (const vint4& val, const unsigned int bits);

/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2>(vint4(a,b,c,d)) returns (b,b,c,c)
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vint4 shuffle (const vint4& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed (a compile-time template parameter).
template<int i> OIIO_FORCEINLINE int extract (const vint4& v);

/// The sum of all components, returned in all components.
vint4 vreduce_add (const vint4& v);

// Reduction across all components, returned as a scalar:
// horizontal sum, bitwise AND, and bitwise OR, respectively.
int reduce_add (const vint4& v);
int reduce_and (const vint4& v);
int reduce_or (const vint4& v);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vint4 blend (const vint4& a, const vint4& b, const vbool4& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vint4 blend0 (const vint4& a, const vbool4& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vint4 blend0not (const vint4& a, const vbool4& mask);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vint4 select (const vbool4& mask, const vint4& a, const vint4& b);

// Per-element math: absolute value, minimum, maximum.
vint4 abs (const vint4& a);
vint4 min (const vint4& a, const vint4& b);
vint4 max (const vint4& a, const vint4& b);

/// Circular bit rotate by s bits, for N values at once.
vint4 rotl (const vint4& x, const int s);
// DEPRECATED(2.1) -- older name for the same operation.
vint4 rotl32 (const vint4& x, const unsigned int k);

/// andnot(a,b) returns ((~a) & b)
vint4 andnot (const vint4& a, const vint4& b);

/// Bitcast back and forth to intN (not a convert -- move the bits!)
vint4 bitcast_to_int (const vbool4& x);
vint4 bitcast_to_int (const vfloat4& x);
vfloat4 bitcast_to_float (const vint4& x);

// Transpose a 4x4 matrix of ints whose rows are a,b,c,d: the first form
// transposes in place, the second writes the result to r0..r3.
void transpose (vint4 &a, vint4 &b, vint4 &c, vint4 &d);
void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
                vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3);

// Assemble a vint4 from one component of each argument -- presumably
// (a[0], b[1], c[2], d[3]) per the name; definition not visible here, confirm.
vint4 AxBxCxDx (const vint4& a, const vint4& b, const vint4& c, const vint4& d);

// safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
vint4 safe_mod (const vint4& a, const vint4& b);
vint4 safe_mod (const vint4& a, int b);
1175 
1176 
1177 
1178 
1179 /// Integer 8-vector, accelerated by SIMD instructions when available.
1180 class vint8 {
1181 public:
1182  static const char* type_name() { return "vint8"; }
1183  typedef int value_t; ///< Underlying equivalent scalar value type
1184  enum { elements = 8 }; ///< Number of scalar elements
1185  enum { paddedelements =8 }; ///< Number of scalar elements for full pad
1186  enum { bits = elements*32 }; ///< Total number of bits
1187  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1188  typedef vbool8 vbool_t; ///< bool type of the same length
1189  typedef vfloat8 vfloat_t; ///< float type of the same length
1190  typedef vint8 vint_t; ///< int type of the same length
1191  OIIO_DEPRECATED("use vbool_t (1.8)")
1192  typedef vbool8 bool_t; // old name (deprecated 1.8)
1193  OIIO_DEPRECATED("use vfloat_t (1.8)")
1194  typedef vfloat8 float_t; // old name (deprecated 1.8)
1195 
1196  /// Default constructor (contents undefined)
1197  vint8 () { }
1198 
1199  /// Construct from a single value (store it in all slots)
1200  vint8 (int a);
1201 
1202  /// Construct from 2 values -- (a,a,b,b)
1203  vint8 (int a, int b);
1204 
1205  /// Construct from 8 values (won't work for vint8)
1206  vint8 (int a, int b, int c, int d, int e, int f, int g, int h);
1207 
1208  /// Construct from a pointer to values
1209  vint8 (const int *vals);
1210 
1211  /// Construct from a pointer to unsigned short values
1212  explicit vint8 (const unsigned short *vals);
1213 
1214  /// Construct from a pointer to signed short values
1215  explicit vint8 (const short *vals);
1216 
1217  /// Construct from a pointer to unsigned char values (0 - 255)
1218  explicit vint8 (const unsigned char *vals);
1219 
1220  /// Construct from a pointer to signed char values (-128 - 127)
1221  explicit vint8 (const char *vals);
1222 
1223  /// Copy construct from another vint8
1224  vint8 (const vint8 & other) { m_simd = other.m_simd; }
1225 
1226  /// Convert a vfloat8 to an vint8. Equivalent to i = (int)f;
1227  explicit vint8 (const vfloat8& f); // implementation below
1228 
1229  /// Construct from two vint4's
1230  vint8 (const vint4 &lo, const vint4 &hi);
1231 
1232  /// Construct from the underlying SIMD type
1233  vint8 (const simd_t& m) : m_simd(m) { }
1234 
1235  /// Return the raw SIMD type
1236  operator simd_t () const { return m_simd; }
1237  simd_t simd () const { return m_simd; }
1238  simd_t& simd () { return m_simd; }
1239 
1240  /// Return a pointer to the underlying scalar type
1241  const value_t* data () const { return (const value_t*)this; }
1242  value_t* data () { return (value_t*)this; }
1243 
1244  /// Sset all components to 0
1245  void clear () ;
1246 
1247  /// Return an vint8 with all components set to 0
1248  static const vint8 Zero ();
1249 
1250  /// Return an vint8 with all components set to 1
1251  static const vint8 One ();
1252 
1253  /// Return an vint8 with all components set to -1 (aka 0xffffffff)
1254  static const vint8 NegOne ();
1255 
1256  /// Return an vint8 with incremented components (e.g., 0,1,2,3).
1257  /// Optional arguments can give a non-zero starting point and step size.
1258  static const vint8 Iota (int start=0, int step=1);
1259 
1260  /// Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
1261  static const vint8 Giota ();
1262 
1263  /// Assign one value to all components.
1264  const vint8 & operator= (int a);
1265 
1266  /// Assignment from another vint8
1267  const vint8 & operator= (const vint8& other) ;
1268 
1269  /// Component access (get)
1270  int operator[] (int i) const;
1271 
1272  /// Component access (set)
1273  int& operator[] (int i);
1274 
1275  /// Component access (set).
1276  void setcomp (int i, int value);
1277 
1278  value_t x () const;
1279  value_t y () const;
1280  value_t z () const;
1281  value_t w () const;
1282  void set_x (value_t val);
1283  void set_y (value_t val);
1284  void set_z (value_t val);
1285  void set_w (value_t val);
1286 
1287  /// Extract the lower precision vint4
1288  vint4 lo () const;
1289 
1290  /// Extract the higher precision vint4
1291  vint4 hi () const;
1292 
1293  /// Helper: load a single int into all components
1294  void load (int a);
1295 
1296  /// Load separate values into each component.
1297  void load (int a, int b, int c, int d, int e, int f, int g, int h);
1298 
1299  /// Load from an array of 8 values
1300  void load (const int *values);
1301 
1302  void load (const int *values, int n) ;
1303 
1304  /// Load from an array of 8 unsigned short values, convert to vint8
1305  void load (const unsigned short *values) ;
1306 
1307  /// Load from an array of 8 unsigned short values, convert to vint8
1308  void load (const short *values);
1309 
1310  /// Load from an array of 8 unsigned char values, convert to vint8
1311  void load (const unsigned char *values);
1312 
1313  /// Load from an array of 8 unsigned char values, convert to vint8
1314  void load (const char *values);
1315 
1316  /// Store the values into memory
1317  void store (int *values) const;
1318 
1319  /// Store the first n values into memory
1320  void store (int *values, int n) const;
1321 
1322  /// Store the least significant 16 bits of each element into adjacent
1323  /// unsigned shorts.
1324  void store (unsigned short *values) const;
1325 
1326  /// Store the least significant 8 bits of each element into adjacent
1327  /// unsigned chars.
1328  void store (unsigned char *values) const;
1329 
1330  /// Masked load -- read from values[] where mask is 1, load zero where
1331  /// mask is 0.
1332  void load_mask (int mask, const value_t *values);
1333  void load_mask (const vbool_t& mask, const value_t *values);
1334 
1335  /// Masked store -- write to values[] where mask is enabled, don't
1336  /// touch values[] where it's not.
1337  void store_mask (int mask, value_t *values) const;
1338  void store_mask (const vbool_t& mask, value_t *values) const;
1339 
1340  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1341  template<int scale=4>
1342  void gather (const value_t *baseptr, const vint_t& vindex);
1343  /// Gather elements defined by the mask, leave others unchanged.
1344  template<int scale=4>
1345  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1346  template<int scale=4>
1347  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1348 
1349  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1350  template<int scale=4>
1351  void scatter (value_t *baseptr, const vint_t& vindex) const;
1352  /// Scatter elements defined by the mask
1353  template<int scale=4>
1354  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1355  template<int scale=4>
1356  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1357 
1358  // Arithmetic operators (component-by-component)
1359  friend vint8 operator+ (const vint8& a, const vint8& b);
1360  friend vint8 operator- (const vint8& a);
1361  friend vint8 operator- (const vint8& a, const vint8& b);
1362  friend vint8 operator* (const vint8& a, const vint8& b);
1363  friend vint8 operator/ (const vint8& a, const vint8& b);
1364  friend vint8 operator% (const vint8& a, const vint8& b);
1365  friend const vint8 & operator+= (vint8& a, const vint8& b);
1366  friend const vint8 & operator-= (vint8& a, const vint8& b);
1367  friend const vint8 & operator*= (vint8& a, const vint8& b);
1368  friend const vint8 & operator/= (vint8& a, const vint8& b);
1369  friend const vint8 & operator%= (vint8& a, const vint8& b);
1370  // Bitwise operators (component-by-component)
1371  friend vint8 operator& (const vint8& a, const vint8& b);
1372  friend vint8 operator| (const vint8& a, const vint8& b);
1373  friend vint8 operator^ (const vint8& a, const vint8& b);
1374  friend const vint8& operator&= (vint8& a, const vint8& b);
1375  friend const vint8& operator|= (vint8& a, const vint8& b);
1376  friend const vint8& operator^= (vint8& a, const vint8& b);
1377  friend vint8 operator~ (const vint8& a);
1378  friend vint8 operator<< (const vint8& a, unsigned int bits);
1379  friend vint8 operator>> (const vint8& a, unsigned int bits);
1380  friend const vint8& operator<<= (vint8& a, unsigned int bits);
1381  friend const vint8& operator>>= (vint8& a, unsigned int bits);
1382  // Comparison operators (component-by-component)
1383  friend vbool8 operator== (const vint8& a, const vint8& b);
1384  friend vbool8 operator!= (const vint8& a, const vint8& b);
1385  friend vbool8 operator< (const vint8& a, const vint8& b);
1386  friend vbool8 operator> (const vint8& a, const vint8& b);
1387  friend vbool8 operator>= (const vint8& a, const vint8& b);
1388  friend vbool8 operator<= (const vint8& a, const vint8& b);
1389 
1390  /// Stream output
1391  friend std::ostream& operator<< (std::ostream& cout, const vint8& a);
1392 
1393 private:
1394  // The actual data representation
1395  union {
1399  };
1400 };
1401 
1402 
1403 
// Shift right logical -- unsigned shift. This differs from operator>>
// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
// srl((1<<31),1) == 1<<30.
vint8 srl (const vint8& val, const unsigned int bits);

/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2,5,5,6,6>(vint8(a,b,c,d,e,f,g,h))
/// returns (b,b,c,c,f,f,g,g)
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vint8 shuffle (const vint8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,...>(a) -- broadcast one lane
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed (a compile-time template parameter).
template<int i> OIIO_FORCEINLINE int extract (const vint8& v);

/// Helper: return a copy of `a` in which component i is replaced by val
template<int i> OIIO_FORCEINLINE vint8 insert (const vint8& a, int val);

/// The sum of all components, returned in all components.
vint8 vreduce_add (const vint8& v);

// Reduction across all components, returned as a scalar:
// horizontal sum, bitwise AND, and bitwise OR, respectively.
int reduce_add (const vint8& v);
int reduce_and (const vint8& v);
int reduce_or (const vint8& v);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vint8 blend (const vint8& a, const vint8& b, const vbool8& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vint8 blend0 (const vint8& a, const vbool8& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vint8 blend0not (const vint8& a, const vbool8& mask);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vint8 select (const vbool8& mask, const vint8& a, const vint8& b);

// Per-element math: absolute value, minimum, maximum.
vint8 abs (const vint8& a);
vint8 min (const vint8& a, const vint8& b);
vint8 max (const vint8& a, const vint8& b);

/// Circular bit rotate by s bits, for N values at once.
vint8 rotl (const vint8& x, const int s);
// DEPRECATED(2.1) -- older name for the same operation.
vint8 rotl32 (const vint8& x, const unsigned int k);

/// andnot(a,b) returns ((~a) & b)
vint8 andnot (const vint8& a, const vint8& b);

/// Bitcast back and forth to intN (not a convert -- move the bits!)
vint8 bitcast_to_int (const vbool8& x);
vint8 bitcast_to_int (const vfloat8& x);
vfloat8 bitcast_to_float (const vint8& x);

// safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
vint8 safe_mod (const vint8& a, const vint8& b);
vint8 safe_mod (const vint8& a, int b);
1472 
1473 
1474 
1475 
1476 
1477 /// Integer 16-vector, accelerated by SIMD instructions when available.
1478 class vint16 {
1479 public:
1480  static const char* type_name() { return "vint16"; }
1481  typedef int value_t; ///< Underlying equivalent scalar value type
1482  enum { elements = 16 }; ///< Number of scalar elements
1483  enum { paddedelements =16 }; ///< Number of scalar elements for full pad
1484  enum { bits = 128 }; ///< Total number of bits
1485  typedef simd_raw_t<int,elements>::type simd_t; ///< the native SIMD type used
1486  typedef vbool16 vbool_t; ///< bool type of the same length
1487  typedef vfloat16 vfloat_t; ///< float type of the same length
1488  typedef vint16 vint_t; ///< int type of the same length
1489  OIIO_DEPRECATED("use vbool_t (1.8)")
1490  typedef vbool16 bool_t; // old name (deprecated 1.8)
1491  OIIO_DEPRECATED("use vfloat_t (1.8)")
1492  typedef vfloat16 float_t; // old name (deprecated 1.8)
1493 
1494  /// Default constructor (contents undefined)
1495  vint16 () { }
1496 
1497  /// Construct from a single value (store it in all slots)
1498  vint16 (int a);
1499 
/// Construct from 16 values
1501  vint16 (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1502  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1503 
1504  /// Construct from a pointer to values
1505  vint16 (const int *vals);
1506 
1507  /// Construct from a pointer to unsigned short values
1508  explicit vint16 (const unsigned short *vals);
1509 
1510  /// Construct from a pointer to signed short values
1511  explicit vint16 (const short *vals);
1512 
1513  /// Construct from a pointer to unsigned char values (0 - 255)
1514  explicit vint16 (const unsigned char *vals);
1515 
1516  /// Construct from a pointer to signed char values (-128 - 127)
1517  explicit vint16 (const char *vals);
1518 
1519  /// Copy construct from another vint16
1520  vint16 (const vint16 & other) { m_simd = other.m_simd; }
1521 
1522  /// Convert a vfloat16 to an vint16. Equivalent to i = (int)f;
1523  explicit vint16 (const vfloat16& f); // implementation below
1524 
1525  /// Construct from two vint8's
1526  vint16 (const vint8 &lo, const vint8 &hi);
1527 
1528  /// Construct from four vint4's
1529  vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d);
1530 
1531  /// Construct from the underlying SIMD type
1532  vint16 (const simd_t& m) : m_simd(m) { }
1533 
1534  /// Return the raw SIMD type
1535  operator simd_t () const { return m_simd; }
1536  simd_t simd () const { return m_simd; }
1537  simd_t& simd () { return m_simd; }
1538 
1539  /// Return a pointer to the underlying scalar type
1540  const value_t* data () const { return (const value_t*)this; }
1541  value_t* data () { return (value_t*)this; }
1542 
/// Set all components to 0
1544  void clear () ;
1545 
1546  /// Return an vint16 with all components set to 0
1547  static const vint16 Zero ();
1548 
1549  /// Return an vint16 with all components set to 1
1550  static const vint16 One ();
1551 
1552  /// Return an vint16 with all components set to -1 (aka 0xffffffff)
1553  static const vint16 NegOne ();
1554 
1555  /// Return an vint16 with incremented components (e.g., 0,1,2,3).
1556  /// Optional arguments can give a non-zero starting point and step size.
1557  static const vint16 Iota (int start=0, int step=1);
1558 
1559  /// Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
1560  static const vint16 Giota ();
1561 
1562  /// Assign one value to all components.
1563  const vint16 & operator= (int a);
1564 
1565  /// Assignment from another vint16
1566  const vint16 & operator= (const vint16& other) ;
1567 
1568  /// Component access (get)
1569  int operator[] (int i) const;
1570 
1571  /// Component access (set)
1572  int& operator[] (int i);
1573 
1574  /// Component access (set).
1575  void setcomp (int i, int value);
1576 
1577  value_t x () const;
1578  value_t y () const;
1579  value_t z () const;
1580  value_t w () const;
1581  void set_x (value_t val);
1582  void set_y (value_t val);
1583  void set_z (value_t val);
1584  void set_w (value_t val);
1585 
1586  /// Extract the lower precision vint8
1587  vint8 lo () const;
1588 
1589  /// Extract the higher precision vint8
1590  vint8 hi () const;
1591 
1592  /// Helper: load a single int into all components
1593  void load (int a);
1594 
1595  /// Load separate values into each component.
1596  void load (int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7,
1597  int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15);
1598 
1599  /// Load from an array of 16 values
1600  void load (const int *values);
1601 
1602  void load (const int *values, int n) ;
1603 
1604  /// Load from an array of 16 unsigned short values, convert to vint16
1605  void load (const unsigned short *values) ;
1606 
1607  /// Load from an array of 16 unsigned short values, convert to vint16
1608  void load (const short *values);
1609 
1610  /// Load from an array of 16 unsigned char values, convert to vint16
1611  void load (const unsigned char *values);
1612 
 /// Load from an array of 16 char values, convert to vint16
1614  void load (const char *values);
1615 
1616  /// Store the values into memory
1617  void store (int *values) const;
1618 
1619  /// Store the first n values into memory
1620  void store (int *values, int n) const;
1621 
1622  /// Store the least significant 16 bits of each element into adjacent
1623  /// unsigned shorts.
1624  void store (unsigned short *values) const;
1625 
1626  /// Store the least significant 8 bits of each element into adjacent
1627  /// unsigned chars.
1628  void store (unsigned char *values) const;
1629 
1630  /// Masked load -- read from values[] where mask is 1, load zero where
1631  /// mask is 0.
1632  void load_mask (const vbool_t &mask, const value_t *values);
1633  void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }
1634 
1635  /// Masked store -- write to values[] where mask is enabled, don't
1636  /// touch values[] where it's not.
1637  void store_mask (const vbool_t &mask, value_t *values) const;
1638  void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }
1639 
 /// Load values from addresses (char*)baseptr + vindex[i]*scale
1641  template<int scale=4>
1642  void gather (const value_t *baseptr, const vint_t& vindex);
1643  /// Gather elements defined by the mask, leave others unchanged.
1644  template<int scale=4>
1645  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1646  template<int scale=4>
1647  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
1648  gather_mask<scale> (vbool_t(mask), baseptr, vindex);
1649  }
1650 
 /// Store values at addresses (char*)baseptr + vindex[i]*scale
1652  template<int scale=4>
1653  void scatter (value_t *baseptr, const vint_t& vindex) const;
1654  /// Scatter elements defined by the mask
1655  template<int scale=4>
1656  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1657  template<int scale=4>
1658  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
1659  scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
1660  }
1661 
1662  // Arithmetic operators (component-by-component)
1663  friend vint16 operator+ (const vint16& a, const vint16& b);
1664  friend vint16 operator- (const vint16& a);
1665  friend vint16 operator- (const vint16& a, const vint16& b);
1666  friend vint16 operator* (const vint16& a, const vint16& b);
1667  friend vint16 operator/ (const vint16& a, const vint16& b);
1668  friend vint16 operator% (const vint16& a, const vint16& b);
1669  friend const vint16 & operator+= (vint16& a, const vint16& b);
1670  friend const vint16 & operator-= (vint16& a, const vint16& b);
1671  friend const vint16 & operator*= (vint16& a, const vint16& b);
1672  friend const vint16 & operator/= (vint16& a, const vint16& b);
1673  friend const vint16 & operator%= (vint16& a, const vint16& b);
1674  // Bitwise operators (component-by-component)
1675  friend vint16 operator& (const vint16& a, const vint16& b);
1676  friend vint16 operator| (const vint16& a, const vint16& b);
1677  friend vint16 operator^ (const vint16& a, const vint16& b);
1678  friend const vint16& operator&= (vint16& a, const vint16& b);
1679  friend const vint16& operator|= (vint16& a, const vint16& b);
1680  friend const vint16& operator^= (vint16& a, const vint16& b);
1681  friend vint16 operator~ (const vint16& a);
1682  friend vint16 operator<< (const vint16& a, unsigned int bits);
1683  friend vint16 operator>> (const vint16& a, unsigned int bits);
1684  friend const vint16& operator<<= (vint16& a, unsigned int bits);
1685  friend const vint16& operator>>= (vint16& a, unsigned int bits);
1686  // Comparison operators (component-by-component)
1687  friend vbool16 operator== (const vint16& a, const vint16& b);
1688  friend vbool16 operator!= (const vint16& a, const vint16& b);
1689  friend vbool16 operator< (const vint16& a, const vint16& b);
1690  friend vbool16 operator> (const vint16& a, const vint16& b);
1691  friend vbool16 operator>= (const vint16& a, const vint16& b);
1692  friend vbool16 operator<= (const vint16& a, const vint16& b);
1693 
1694  /// Stream output
1695  friend std::ostream& operator<< (std::ostream& cout, const vint16& a);
1696 
1697 private:
1698  // The actual data representation
1699  union {
1703  };
1704 };
1705 
1706 
1707 
1708 /// Shift right logical -- unsigned shift. This differs from operator>>
1709 /// in how it handles the sign bit. (1<<31) >> 1 == (1<<31), but
1710 /// srl((1<<31),1) == 1<<30.
1711 vint16 srl (const vint16& val, const unsigned int bits);
1712 
1713 /// Shuffle groups of 4
1714 template<int i0, int i1, int i2, int i3>
1715 vint16 shuffle4 (const vint16& a);
1716 
1717 /// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
1718 template<int i> vint16 shuffle4 (const vint16& a);
1719 
1720 /// Shuffle within each group of 4
1721 template<int i0, int i1, int i2, int i3>
1722 vint16 shuffle (const vint16& a);
1723 
1724 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
1725 template<int i> vint16 shuffle (const vint16& a);
1726 
1727 /// Helper: as rapid as possible extraction of one component, when the
1728 /// index is fixed.
1729 template<int i> OIIO_FORCEINLINE int extract (const vint16& v);
1730 
1731 /// Helper: substitute val for a[i]
1732 template<int i> OIIO_FORCEINLINE vint16 insert (const vint16& a, int val);
1733 
1734 /// The sum of all components, returned in all components.
1735 vint16 vreduce_add (const vint16& v);
1736 
1737 // Reduction across all components
1738 int reduce_add (const vint16& v);
1739 int reduce_and (const vint16& v);
1740 int reduce_or (const vint16& v);
1741 
1742 /// Use a bool mask to select between components of a (if mask[i] is false)
1743 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
1744 vint16 blend (const vint16& a, const vint16& b, const vbool16& mask);
1745 
1746 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
1747 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
1748 /// blend(0,a,mask).
1749 vint16 blend0 (const vint16& a, const vbool16& mask);
1750 
1751 /// Use a bool mask to select between components of a (if mask[i] is false)
1752 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
1753 /// blend(0,a,!mask), or blend(a,0,mask).
1754 vint16 blend0not (const vint16& a, const vbool16& mask);
1755 
1756 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
1757 /// synonym for blend with arguments rearranged, but this is more clear
1758 /// because the arguments are symmetric to scalar (cond ? a : b).
1759 vint16 select (const vbool16& mask, const vint16& a, const vint16& b);
1760 
1761 // Per-element math
1762 vint16 abs (const vint16& a);
1763 vint16 min (const vint16& a, const vint16& b);
1764 vint16 max (const vint16& a, const vint16& b);
1765 
1766 /// Circular bit rotate by s bits, for N values at once.
1767 vint16 rotl (const vint16& x, const int s);
1768 // DEPRECATED(2.1)
1769 vint16 rotl32 (const vint16& x, const unsigned int k);
1770 
1771 /// andnot(a,b) returns ((~a) & b)
1772 vint16 andnot (const vint16& a, const vint16& b);
1773 
1774 /// Bitcast back and forth to intN (not a convert -- move the bits!)
1775 vint16 bitcast_to_int (const vbool16& x);
1776 vint16 bitcast_to_int (const vfloat16& x);
1777 vfloat16 bitcast_to_float (const vint16& x);
1778 
1779 // safe_mod(a,b) is like a%b, but safely returns 0 when b==0.
1780 vint16 safe_mod (const vint16& a, const vint16& b);
1781 vint16 safe_mod (const vint16& a, int b);
1782 
1783 
1784 
1785 
1786 
1787 /// Floating point 4-vector, accelerated by SIMD instructions when
1788 /// available.
1789 class vfloat4 {
1790 public:
1791  static const char* type_name() { return "vfloat4"; }
1792  typedef float value_t; ///< Underlying equivalent scalar value type
1793  enum { elements = 4 }; ///< Number of scalar elements
1794  enum { paddedelements = 4 }; ///< Number of scalar elements for full pad
1795  enum { bits = elements*32 }; ///< Total number of bits
1796  typedef simd_raw_t<float,4>::type simd_t; ///< the native SIMD type used
1797  typedef vfloat4 vfloat_t; ///< SIMD int type
1798  typedef vint4 vint_t; ///< SIMD int type
1799  typedef vbool4 vbool_t; ///< SIMD bool type
1800  OIIO_DEPRECATED("use vbool_t (1.8)")
1801  typedef vint4 int_t; // old name (deprecated 1.8)
1802  OIIO_DEPRECATED("use vfloat_t (1.8)")
1803  typedef vbool4 bool_t; // old name (deprecated 1.8)
1804 
1805  /// Default constructor (contents undefined)
1806  vfloat4 () { }
1807 
1808  /// Construct from a single value (store it in all slots)
1809  vfloat4 (float a) { load(a); }
1810 
1811  /// Construct from 3 or 4 values
1812  vfloat4 (float a, float b, float c, float d=0.0f) { load(a,b,c,d); }
1813 
1814  /// Construct from a pointer to 4 values
1815  vfloat4 (const float *f) { load (f); }
1816 
1817  /// Copy construct from another vfloat4
1818  vfloat4 (const vfloat4 &other) { m_simd = other.m_simd; }
1819 
1820  /// Construct from an vint4 (promoting all components to float)
1821  explicit vfloat4 (const vint4& ival);
1822 
1823  /// Construct from the underlying SIMD type
1824  vfloat4 (const simd_t& m) : m_simd(m) { }
1825 
1826  /// Return the raw SIMD type
1827  operator simd_t () const { return m_simd; }
1828  simd_t simd () const { return m_simd; }
1829  simd_t& simd () { return m_simd; }
1830 
1831  /// Return a pointer to the underlying scalar type
1832  const value_t* data () const { return (const value_t*)this; }
1833  value_t* data () { return (value_t*)this; }
1834 
1835  /// Construct from a Imath::V3f
1836  explicit vfloat4 (const Imath::V3f &v) { load (v[0], v[1], v[2]); }
1837 
1838  /// Cast to a Imath::V3f
1839  const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }
1840 
1841  /// Construct from a Imath::V4f
1842  explicit vfloat4 (const Imath::V4f &v) { load ((const float *)&v); }
1843 
1844  /// Cast to a Imath::V4f
1845  const Imath::V4f& V4f () const { return *(const Imath::V4f*)this; }
1846 
1847  /// Construct from a pointer to 4 unsigned short values
1848  explicit vfloat4 (const unsigned short *vals) { load(vals); }
1849 
1850  /// Construct from a pointer to 4 short values
1851  explicit vfloat4 (const short *vals) { load(vals); }
1852 
1853  /// Construct from a pointer to 4 unsigned char values
1854  explicit vfloat4 (const unsigned char *vals) { load(vals); }
1855 
1856  /// Construct from a pointer to 4 char values
1857  explicit vfloat4 (const char *vals) { load(vals); }
1858 
1859 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1860  /// Construct from a pointer to 4 half (16 bit float) values
1861  explicit vfloat4 (const half *vals) { load(vals); }
1862 #endif
1863 
1864  /// Assign a single value to all components
1865  const vfloat4 & operator= (float a) { load(a); return *this; }
1866 
1867  /// Assign a vfloat4
1868  const vfloat4 & operator= (vfloat4 other) {
1869  m_simd = other.m_simd;
1870  return *this;
1871  }
1872 
1873  /// Return a vfloat4 with all components set to 0.0
1874  static const vfloat4 Zero ();
1875 
1876  /// Return a vfloat4 with all components set to 1.0
1877  static const vfloat4 One ();
1878 
1879  /// Return a vfloat4 with incremented components (e.g., 0.0,1.0,2.0,3.0).
1880  /// Optional argument can give a non-zero starting point and non-1 step.
1881  static const vfloat4 Iota (float start=0.0f, float step=1.0f);
1882 
1883  /// Set all components to 0.0
1884  void clear ();
1885 
1886  /// Assign from a Imath::V4f
1887  const vfloat4 & operator= (const Imath::V4f &v);
1888 
1889  /// Assign from a Imath::V3f
1890  const vfloat4 & operator= (const Imath::V3f &v);
1891 
1892  /// Component access (get)
1893  float operator[] (int i) const;
1894  /// Component access (set)
1895  float& operator[] (int i);
1896 
1897  /// Component access (set).
1898  void setcomp (int i, float value);
1899 
1900  value_t x () const;
1901  value_t y () const;
1902  value_t z () const;
1903  value_t w () const;
1904  void set_x (value_t val);
1905  void set_y (value_t val);
1906  void set_z (value_t val);
1907  void set_w (value_t val);
1908 
1909  /// Helper: load a single value into all components
1910  void load (float val);
1911 
1912  /// Helper: load 3 or 4 values. (If 3 are supplied, the 4th will be 0.)
1913  void load (float a, float b, float c, float d=0.0f);
1914 
1915  /// Load from an array of 4 values
1916  void load (const float *values);
1917 
1918  /// Load from a partial array of <=4 values. Unassigned values are
1919  /// undefined.
1920  void load (const float *values, int n);
1921 
1922  /// Load from an array of 4 unsigned short values, convert to float
1923  void load (const unsigned short *values);
1924 
1925  /// Load from an array of 4 short values, convert to float
1926  void load (const short *values);
1927 
1928  /// Load from an array of 4 unsigned char values, convert to float
1929  void load (const unsigned char *values);
1930 
1931  /// Load from an array of 4 char values, convert to float
1932  void load (const char *values);
1933 
1934 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1935  /// Load from an array of 4 half values, convert to float
1936  void load (const half *values);
1937 #endif /* _HALF_H_ or _IMATH_H_ */
1938 
1939  void store (float *values) const;
1940 
1941  /// Store the first n values into memory
1942  void store (float *values, int n) const;
1943 
1944 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1945  void store (half *values) const;
1946 #endif
1947 
1948  /// Masked load -- read from values[] where mask is 1, load zero where
1949  /// mask is 0.
1950  void load_mask (int mask, const value_t *values);
1951  void load_mask (const vbool_t& mask, const value_t *values);
1952 
1953  /// Masked store -- write to values[] where mask is enabled, don't
1954  /// touch values[] where it's not.
1955  void store_mask (int mask, value_t *values) const;
1956  void store_mask (const vbool_t& mask, value_t *values) const;
1957 
1958  /// Load values from addresses (char*)basepatr + vindex[i]*scale
1959  template<int scale=4>
1960  void gather (const value_t *baseptr, const vint_t& vindex);
1961  /// Gather elements defined by the mask, leave others unchanged.
1962  template<int scale=4>
1963  void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
1964  template<int scale=4>
1965  void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);
1966 
1967  /// Store values at addresses (char*)basepatr + vindex[i]*scale
1968  template<int scale=4>
1969  void scatter (value_t *baseptr, const vint_t& vindex) const;
1970  /// Scatter elements defined by the mask
1971  template<int scale=4>
1972  void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
1973  template<int scale=4>
1974  void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;
1975 
1976  // Arithmetic operators
1977  friend vfloat4 operator+ (const vfloat4& a, const vfloat4& b);
1978  const vfloat4 & operator+= (const vfloat4& a);
1979  vfloat4 operator- () const;
1980  friend vfloat4 operator- (const vfloat4& a, const vfloat4& b);
1981  const vfloat4 & operator-= (const vfloat4& a);
1982  friend vfloat4 operator* (const vfloat4& a, const vfloat4& b);
1983  friend vfloat4 operator* (const vfloat4& a, float b);
1984  friend vfloat4 operator* (float a, const vfloat4& b);
1985  const vfloat4 & operator*= (const vfloat4& a);
1986  const vfloat4 & operator*= (float val);
1987  friend vfloat4 operator/ (const vfloat4& a, const vfloat4& b);
1988  const vfloat4 & operator/= (const vfloat4& a);
1989  const vfloat4 & operator/= (float val);
1990 
1991  // Comparison operations
1992  friend vbool4 operator== (const vfloat4& a, const vfloat4& b);
1993  friend vbool4 operator!= (const vfloat4& a, const vfloat4& b);
1994  friend vbool4 operator< (const vfloat4& a, const vfloat4& b);
1995  friend vbool4 operator> (const vfloat4& a, const vfloat4& b);
1996  friend vbool4 operator>= (const vfloat4& a, const vfloat4& b);
1997  friend vbool4 operator<= (const vfloat4& a, const vfloat4& b);
1998 
1999  // Some oddball items that are handy
2000 
2001  /// Combine the first two components of A with the first two components
2002  /// of B.
2003  friend vfloat4 AxyBxy (const vfloat4& a, const vfloat4& b);
2004 
2005  /// Combine the first two components of A with the first two components
2006  /// of B, but interleaved.
2007  friend vfloat4 AxBxAyBy (const vfloat4& a, const vfloat4& b);
2008 
2009  /// Return xyz components, plus 0 for w
2010  vfloat4 xyz0 () const;
2011 
2012  /// Return xyz components, plus 1 for w
2013  vfloat4 xyz1 () const;
2014 
2015  /// Stream output
2016  friend inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val);
2017 
2018 protected:
2019  // The actual data representation
2020  union {
2023  };
2024 };
2025 
2026 
2027 /// Helper: shuffle/swizzle with constant (templated) indices.
2028 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
2029 template<int i0, int i1, int i2, int i3>
2030 OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2031 
2032 /// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
2033 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a);
2034 
2035 /// Helper: as rapid as possible extraction of one component, when the
2036 /// index is fixed.
2037 template<int i> OIIO_FORCEINLINE float extract (const vfloat4& a);
2038 
2039 /// Helper: substitute val for a[i]
2040 template<int i> OIIO_FORCEINLINE vfloat4 insert (const vfloat4& a, float val);
2041 
2042 /// The sum of all components, returned in all components.
2043 vfloat4 vreduce_add (const vfloat4& v);
2044 
2045 /// The sum of all components, returned as a scalar.
2046 float reduce_add (const vfloat4& v);
2047 
2048 /// Return the float dot (inner) product of a and b in every component.
2049 vfloat4 vdot (const vfloat4 &a, const vfloat4 &b);
2050 
2051 /// Return the float dot (inner) product of a and b.
2052 float dot (const vfloat4 &a, const vfloat4 &b);
2053 
2054 /// Return the float 3-component dot (inner) product of a and b in
2055 /// all components.
2056 vfloat4 vdot3 (const vfloat4 &a, const vfloat4 &b);
2057 
2058 /// Return the float 3-component dot (inner) product of a and b.
2059 float dot3 (const vfloat4 &a, const vfloat4 &b);
2060 
2061 /// Use a bool mask to select between components of a (if mask[i] is false)
2062 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
2063 vfloat4 blend (const vfloat4& a, const vfloat4& b, const vbool4& mask);
2064 
2065 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
2066 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
2067 /// blend(0,a,mask).
2068 vfloat4 blend0 (const vfloat4& a, const vbool4& mask);
2069 
2070 /// Use a bool mask to select between components of a (if mask[i] is false)
2071 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
2072 /// blend(0,a,!mask), or blend(a,0,mask).
2073 vfloat4 blend0not (const vfloat4& a, const vbool4& mask);
2074 
2075 /// "Safe" divide of vfloat4/vfloat4 -- for any component of the divisor
2076 /// that is 0, return 0 rather than Inf.
2077 vfloat4 safe_div (const vfloat4 &a, const vfloat4 &b);
2078 
2079 /// Homogeneous divide to turn a vfloat4 into a vfloat3.
2080 vfloat3 hdiv (const vfloat4 &a);
2081 
2082 /// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
2083 /// synonym for blend with arguments rearranged, but this is more clear
2084 /// because the arguments are symmetric to scalar (cond ? a : b).
2085 vfloat4 select (const vbool4& mask, const vfloat4& a, const vfloat4& b);
2086 
2087 // Per-element math
2088 vfloat4 abs (const vfloat4& a); ///< absolute value (float)
2089 vfloat4 sign (const vfloat4& a); ///< 1.0 when value >= 0, -1 when negative
2090 vfloat4 ceil (const vfloat4& a);
2091 vfloat4 floor (const vfloat4& a);
2092 vint4 ifloor (const vfloat4& a); ///< (int)floor
2093 OIIO_DEPRECATED("use ifloor (1.8)")
2094 inline vint4 floori (const vfloat4& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2095 
2096 /// Per-element round to nearest integer.
2097 /// CAVEAT: the rounding when mid-way between integers may differ depending
 /// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
2099 /// integer) but std::round() says to round away from 0 regardless of
2100 /// current rounding mode (but that is multiple instructions on x64).
2101 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2102 /// match std::round().
2103 vfloat4 round (const vfloat4& a);
2104 
2105 /// Per-element round to nearest integer (equivalent to vint(round(a))).
2106 /// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
2107 /// C++ std::rint() which says to use the current rounding mode.
2108 /// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
2109 /// match std::rint().
2110 vint4 rint (const vfloat4& a);
2111 
2112 vfloat4 rcp_fast (const vfloat4 &a); ///< Fast, approximate 1/a
2113 vfloat4 sqrt (const vfloat4 &a);
2114 vfloat4 rsqrt (const vfloat4 &a); ///< Fully accurate 1/sqrt
2115 vfloat4 rsqrt_fast (const vfloat4 &a); ///< Fast, approximate 1/sqrt
2116 vfloat4 min (const vfloat4& a, const vfloat4& b); ///< Per-element min
2117 vfloat4 max (const vfloat4& a, const vfloat4& b); ///< Per-element max
2118 template <typename T> OIIO_FORCEINLINE T exp (const T& v); // template for all SIMD variants
2119 template <typename T> OIIO_FORCEINLINE T log (const T& v);
2120 
2121 /// andnot(a,b) returns ((~a) & b)
2122 vfloat4 andnot (const vfloat4& a, const vfloat4& b);
2123 
2124 // Fused multiply and add (or subtract):
2125 vfloat4 madd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b + c
2126 vfloat4 msub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // a*b - c
2127 vfloat4 nmadd (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b + c
2128 vfloat4 nmsub (const vfloat4& a, const vfloat4& b, const vfloat4& c); // -a*b - c
2129 
2130 /// Transpose the rows and columns of the 4x4 matrix [a b c d].
2131 /// In the end, a will have the original (a[0], b[0], c[0], d[0]),
2132 /// b will have the original (a[1], b[1], c[1], d[1]), and so on.
2133 void transpose (vfloat4 &a, vfloat4 &b, vfloat4 &c, vfloat4 &d);
2134 void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
2135  vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3);
2136 
2137 /// Make a vfloat4 consisting of the first element of each of 4 vfloat4's.
2138 vfloat4 AxBxCxDx (const vfloat4& a, const vfloat4& b,
2139  const vfloat4& c, const vfloat4& d);
2140 
2141 
2142 
/// Floating point 3-vector, aligned to be internally identical to a vfloat4.
/// The way it differs from vfloat4 is that all of the load functions only
/// load three values, and all the stores only store 3 values. The vast
/// majority of ops just fall back to the vfloat4 version, and so will
/// operate on the 4th component, but we won't care about that result.
class vfloat3 : public vfloat4 {
public:
    static const char* type_name() { return "vfloat3"; }
    enum { elements = 3 };    ///< Number of scalar elements
    enum { paddedelements = 4 }; ///< Number of scalar elements for full pad

    /// Default constructor (contents undefined)
    vfloat3 () { }

    /// Construct from a single value (store it in all slots)
    vfloat3 (float a) { load(a); }

    /// Construct from 3 values
    vfloat3 (float a, float b, float c) { vfloat4::load(a,b,c); }

    /// Construct from a pointer to 3 values
    vfloat3 (const float *f) { load (f); }

    /// Copy construct from another vfloat3
    vfloat3 (const vfloat3 &other);

    /// Construct from a vfloat4. Note: it will not zero out the internal
    /// 4th component, but rather accept on faith that the vfloat4 you are
    /// giving it is a valid vfloat3. Be careful!
    explicit vfloat3 (const vfloat4 &other);

#if OIIO_SIMD
    /// Construct from the underlying SIMD type. Note: it will not zero out
    /// the internal 4th component, but rather accept on faith that the
    /// vfloat4 you are giving it is a valid vfloat3. Be careful!
    explicit vfloat3 (const simd_t& m) : vfloat4(m) { }
#endif

    /// Construct from a Imath::V3f
    vfloat3 (const Imath::V3f &v) : vfloat4(v) { }

    /// Cast to a Imath::V3f
    const Imath::V3f& V3f () const { return *(const Imath::V3f*)this; }

    /// Construct from a pointer to 3 unsigned short values
    explicit vfloat3 (const unsigned short *vals) { load(vals); }

    /// Construct from a pointer to 3 short values
    explicit vfloat3 (const short *vals) { load(vals); }

    /// Construct from a pointer to 3 unsigned char values
    explicit vfloat3 (const unsigned char *vals) { load(vals); }

    /// Construct from a pointer to 3 char values
    explicit vfloat3 (const char *vals) { load(vals); }

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Construct from a pointer to 3 half (16 bit float) values
    explicit vfloat3 (const half *vals) { load(vals); }
#endif

    /// Assign a single value to all components
    const vfloat3 & operator= (float a) { load(a); return *this; }

    /// Return a vfloat3 with all components set to 0.0
    static const vfloat3 Zero ();

    /// Return a vfloat3 with all components set to 1.0
    static const vfloat3 One ();

    /// Return a vfloat3 with incremented components (e.g., 0.0,1.0,2.0).
    /// Optional argument can give a non-zero starting point and non-1 step.
    static const vfloat3 Iota (float start=0.0f, float step=1.0f);

    /// Helper: load a single value into all components
    void load (float val);

    /// Load from an array of 3 values
    void load (const float *values);

    /// Load from a partial array of <=3 values
    void load (const float *values, int n);

    /// Load from an array of 3 unsigned short values, convert to float
    void load (const unsigned short *values);

    /// Load from an array of 3 short values, convert to float
    void load (const short *values);

    /// Load from an array of 3 unsigned char values, convert to float
    void load (const unsigned char *values);

    /// Load from an array of 3 char values, convert to float
    void load (const char *values);

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Load from an array of 3 half values, convert to float
    void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

    /// Store the 3 values into memory
    void store (float *values) const;

    /// Store the first n (<=3) values into memory
    void store (float *values, int n) const;

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Store the 3 values into memory, converting to half
    void store (half *values) const;
#endif

    /// Store into an Imath::V3f reference.
    void store (Imath::V3f &vec) const;

    // Math operators -- define in terms of vfloat3.
    friend vfloat3 operator+ (const vfloat3& a, const vfloat3& b);
    const vfloat3 & operator+= (const vfloat3& a);
    vfloat3 operator- () const;
    friend vfloat3 operator- (const vfloat3& a, const vfloat3& b);
    const vfloat3 & operator-= (const vfloat3& a);
    friend vfloat3 operator* (const vfloat3& a, const vfloat3& b);
    friend vfloat3 operator* (const vfloat3& a, float b);
    friend vfloat3 operator* (float a, const vfloat3& b);
    const vfloat3 & operator*= (const vfloat3& a);
    const vfloat3 & operator*= (float a);
    friend vfloat3 operator/ (const vfloat3& a, const vfloat3& b);
    const vfloat3 & operator/= (const vfloat3& a);
    const vfloat3 & operator/= (float a);

    /// Square of the length of the vector
    float length2() const;
    /// Length of the vector
    float length() const;

    /// Return a normalized version of the vector.
    vfloat3 normalized () const;
    /// Return a fast, approximate normalized version of the vector.
    vfloat3 normalized_fast () const;
    /// Normalize in place.
    void normalize() { *this = normalized(); }

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val);
};
2284 
2285 
2286 
2287 // Per-element math on float3
2288 vfloat3 abs (const vfloat3& a);
2289 vfloat3 sign (const vfloat3& a);
2290 vfloat3 ceil (const vfloat3& a);
2291 vfloat3 floor (const vfloat3& a);
2292 vfloat3 round (const vfloat3& a);
2293 
2294 
2295 
2296 /// SIMD-based 4x4 matrix. This is guaranteed to have memory layout (when
2297 /// not in registers) isomorphic to Imath::M44f.
2298 class matrix44 {
2299 public:
2300  // Uninitialized
2302 #ifndef OIIO_SIMD_SSE
2303  : m_mat(Imath::UNINITIALIZED)
2304 #endif
2305  { }
2306 
2307  /// Construct from a reference to an Imath::M44f
2308  OIIO_FORCEINLINE explicit matrix44 (const Imath::M44f &M) {
2309 #if OIIO_SIMD_SSE
2310  m_row[0].load (M[0]);
2311  m_row[1].load (M[1]);
2312  m_row[2].load (M[2]);
2313  m_row[3].load (M[3]);
2314 #else
2315  m_mat = M;
2316 #endif
2317  }
2318 
2319  /// Construct from a float array
2320  OIIO_FORCEINLINE explicit matrix44 (const float *f) {
2321 #if OIIO_SIMD_SSE
2322  m_row[0].load (f+0);
2323  m_row[1].load (f+4);
2324  m_row[2].load (f+8);
2325  m_row[3].load (f+12);
2326 #else
2327  m_mat = *(const Imath::M44f*)f;
2328 #endif
2329  }
2330 
2331  /// Construct from 4 vfloat4 rows
2332  OIIO_FORCEINLINE explicit matrix44 (const vfloat4& a, const vfloat4& b,
2333  const vfloat4& c, const vfloat4& d) {
2334 #if OIIO_SIMD_SSE
2335  m_row[0] = a; m_row[1] = b; m_row[2] = c; m_row[3] = d;
2336 #else
2337  a.store (m_mat[0]);
2338  b.store (m_mat[1]);
2339  c.store (m_mat[2]);
2340  d.store (m_mat[3]);
2341 #endif
2342  }
2343  /// Construct from 4 float[4] rows
2344  OIIO_FORCEINLINE explicit matrix44 (const float *a, const float *b,
2345  const float *c, const float *d) {
2346 #if OIIO_SIMD_SSE
2347  m_row[0].load(a); m_row[1].load(b); m_row[2].load(c); m_row[3].load(d);
2348 #else
2349  memcpy (m_mat[0], a, 4*sizeof(float));
2350  memcpy (m_mat[1], b, 4*sizeof(float));
2351  memcpy (m_mat[2], c, 4*sizeof(float));
2352  memcpy (m_mat[3], d, 4*sizeof(float));
2353 #endif
2354  }
2355 
2356  /// Construct from 16 floats
2357  OIIO_FORCEINLINE matrix44 (float f00, float f01, float f02, float f03,
2358  float f10, float f11, float f12, float f13,
2359  float f20, float f21, float f22, float f23,
2360  float f30, float f31, float f32, float f33)
2361  {
2362 #if OIIO_SIMD_SSE
2363  m_row[0].load (f00, f01, f02, f03);
2364  m_row[1].load (f10, f11, f12, f13);
2365  m_row[2].load (f20, f21, f22, f23);
2366  m_row[3].load (f30, f31, f32, f33);
2367 #else
2368  m_mat[0][0] = f00; m_mat[0][1] = f01; m_mat[0][2] = f02; m_mat[0][3] = f03;
2369  m_mat[1][0] = f10; m_mat[1][1] = f11; m_mat[1][2] = f12; m_mat[1][3] = f13;
2370  m_mat[2][0] = f20; m_mat[2][1] = f21; m_mat[2][2] = f22; m_mat[2][3] = f23;
2371  m_mat[3][0] = f30; m_mat[3][1] = f31; m_mat[3][2] = f32; m_mat[3][3] = f33;
2372 #endif
2373  }
2374 
    /// Present as an Imath::M44f
    const Imath::M44f& M44f() const;

    /// Return one row of the matrix as a vfloat4.
    vfloat4 operator[] (int i) const;

    /// Return the transposed matrix
    matrix44 transposed () const;

    /// Transform 3-point V by 4x4 matrix M.
    vfloat3 transformp (const vfloat3 &V) const;

    /// Transform 3-vector V by 4x4 matrix M.
    vfloat3 transformv (const vfloat3 &V) const;

    /// Transform 3-vector V by the transpose of 4x4 matrix M.
    vfloat3 transformvT (const vfloat3 &V) const;

    // Row-vector (V*M) and column-vector (M*V) products.
    friend vfloat4 operator* (const vfloat4 &V, const matrix44& M);
    friend vfloat4 operator* (const matrix44& M, const vfloat4 &V);

    // Equality/inequality, both against another matrix44 and against a
    // plain Imath::M44f (with symmetric friend overloads).
    bool operator== (const matrix44& m) const;

    bool operator== (const Imath::M44f& m) const ;
    friend bool operator== (const Imath::M44f& a, const matrix44 &b);

    bool operator!= (const matrix44& m) const;

    bool operator!= (const Imath::M44f& m) const;
    friend bool operator!= (const Imath::M44f& a, const matrix44 &b);

    /// Return the inverse of the matrix.
    matrix44 inverse() const;

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M);

private:
#if OIIO_SIMD_SSE
    vfloat4 m_row[4];    // SIMD path: four 4-float rows
#else
    Imath::M44f m_mat;   // Scalar fallback: plain Imath matrix storage
#endif
};
2419 
/// Transform 3-point V by 4x4 matrix M.
vfloat3 transformp (const matrix44 &M, const vfloat3 &V);
vfloat3 transformp (const Imath::M44f &M, const vfloat3 &V);

/// Transform 3-vector V by 4x4 matrix M.
vfloat3 transformv (const matrix44 &M, const vfloat3 &V);
vfloat3 transformv (const Imath::M44f &M, const vfloat3 &V);

/// Transform 3-vector V by the transpose of 4x4 matrix M.
vfloat3 transformvT (const matrix44 &M, const vfloat3 &V);
vfloat3 transformvT (const Imath::M44f &M, const vfloat3 &V);
2431 
2432 
2433 
2434 
/// Floating point 8-vector, accelerated by SIMD instructions when
/// available.
class vfloat8 {
public:
    static const char* type_name() { return "vfloat8"; }
    typedef float value_t;       ///< Underlying equivalent scalar value type
    enum { elements = 8 };       ///< Number of scalar elements
    enum { paddedelements = 8 }; ///< Number of scalar elements for full pad
    enum { bits = elements*32 }; ///< Total number of bits
    typedef simd_raw_t<float,8>::type simd_t;  ///< the native SIMD type used
    typedef vfloat8 vfloat_t;    ///< SIMD float type
    typedef vint8 vint_t;        ///< SIMD int type
    typedef vbool8 vbool_t;      ///< SIMD bool type
    OIIO_DEPRECATED("use vint_t (1.8)")
    typedef vint8 int_t;         // old name (deprecated 1.8)
    OIIO_DEPRECATED("use vbool_t (1.8)")
    typedef vbool8 bool_t;       // old name (deprecated 1.8)

    /// Default constructor (contents undefined)
    vfloat8 () { }

    /// Construct from a single value (store it in all slots)
    vfloat8 (float a) { load(a); }

    /// Construct from 8 values
    vfloat8 (float a, float b, float c, float d,
             float e, float f, float g, float h) { load(a,b,c,d,e,f,g,h); }

    /// Construct from a pointer to 8 contiguous float values
    vfloat8 (const float *f) { load (f); }

    /// Copy construct from another vfloat8
    vfloat8 (const vfloat8 &other) { m_simd = other.m_simd; }

    /// Construct from an int vector (promoting all components to float)
    explicit vfloat8 (const vint8& ival);

    /// Construct from two vfloat4's (lo = components 0-3, hi = 4-7)
    vfloat8 (const vfloat4 &lo, const vfloat4 &hi);

    /// Construct from the underlying SIMD type
    vfloat8 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Return a pointer to the underlying scalar type
    const value_t* data () const { return (const value_t*)this; }
    value_t* data () { return (value_t*)this; }

    /// Construct from a pointer to 8 unsigned short values
    explicit vfloat8 (const unsigned short *vals) { load(vals); }

    /// Construct from a pointer to 8 short values
    explicit vfloat8 (const short *vals) { load(vals); }

    /// Construct from a pointer to 8 unsigned char values
    explicit vfloat8 (const unsigned char *vals) { load(vals); }

    /// Construct from a pointer to 8 char values
    explicit vfloat8 (const char *vals) { load(vals); }

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Construct from a pointer to half (16 bit float) values
    explicit vfloat8 (const half *vals) { load(vals); }
#endif

    /// Assign a single value to all components
    const vfloat8& operator= (float a) { load(a); return *this; }

    /// Assign a vfloat8
    const vfloat8& operator= (vfloat8 other) {
        m_simd = other.m_simd;
        return *this;
    }

    /// Return a vfloat8 with all components set to 0.0
    static const vfloat8 Zero ();

    /// Return a vfloat8 with all components set to 1.0
    static const vfloat8 One ();

    /// Return a vfloat8 with incremented components (e.g., 0,1,2,3,...)
    /// Optional argument can give a non-zero starting point and non-1 step.
    static const vfloat8 Iota (float start=0.0f, float step=1.0f);

    /// Set all components to 0.0
    void clear ();

    /// Component access (get)
    float operator[] (int i) const;
    /// Component access (set)
    float& operator[] (int i);

    /// Component access (set).
    void setcomp (int i, float value);

    // Named get/set access to the first four components.
    value_t x () const;
    value_t y () const;
    value_t z () const;
    value_t w () const;
    void set_x (value_t val);
    void set_y (value_t val);
    void set_z (value_t val);
    void set_w (value_t val);

    /// Extract the lower half (components 0-3) as a vfloat4
    vfloat4 lo () const;

    /// Extract the upper half (components 4-7) as a vfloat4
    vfloat4 hi () const;

    /// Helper: load a single value into all components
    void load (float val);

    /// Helper: load 8 values
    void load (float a, float b, float c, float d,
               float e, float f, float g, float h);

    /// Load from an array of values
    void load (const float *values);

    /// Load from a partial array of <=8 values. Unassigned values are
    /// undefined.
    void load (const float *values, int n);

    /// Load from an array of 8 unsigned short values, convert to float
    void load (const unsigned short *values);

    /// Load from an array of 8 short values, convert to float
    void load (const short *values);

    /// Load from an array of 8 unsigned char values, convert to float
    void load (const unsigned char *values);

    /// Load from an array of 8 char values, convert to float
    void load (const char *values);

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Load from an array of 8 half values, convert to float
    void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

    /// Store all 8 values to memory
    void store (float *values) const;

    /// Store the first n values into memory
    void store (float *values, int n) const;

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Store 8 values, converting to half
    void store (half *values) const;
#endif

    /// Masked load -- read from values[] where mask is 1, load zero where
    /// mask is 0.
    void load_mask (int mask, const value_t *values);
    void load_mask (const vbool_t& mask, const value_t *values);

    /// Masked store -- write to values[] where mask is enabled, don't
    /// touch values[] where it's not.
    void store_mask (int mask, value_t *values) const;
    void store_mask (const vbool_t& mask, value_t *values) const;

    /// Load values from addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void gather (const value_t *baseptr, const vint_t& vindex);
    /// Gather elements defined by the mask, leave others unchanged.
    template<int scale=4>
    void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
    template<int scale=4>
    void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex);

    /// Store values at addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void scatter (value_t *baseptr, const vint_t& vindex) const;
    /// Scatter elements defined by the mask
    template<int scale=4>
    void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
    template<int scale=4>
    void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const;

    // Arithmetic operators (component-by-component)
    friend vfloat8 operator+ (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator- (const vfloat8& a);
    friend vfloat8 operator- (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator* (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator* (const vfloat8& a, float b);
    friend vfloat8 operator* (float a, const vfloat8& b);
    friend vfloat8 operator/ (const vfloat8& a, const vfloat8& b);
    friend vfloat8 operator% (const vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator+= (vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator-= (vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator*= (vfloat8& a, const vfloat8& b);
    friend const vfloat8 & operator/= (vfloat8& a, const vfloat8& b);

    // Comparison operations
    friend vbool8 operator== (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator!= (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator< (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator> (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator>= (const vfloat8& a, const vfloat8& b);
    friend vbool8 operator<= (const vfloat8& a, const vfloat8& b);

    // Some oddball items that are handy

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val);

protected:
    // The actual data representation.
    // NOTE(review): the union's members are elided in this extract of the
    // header (presumably the native simd_t plus a scalar float array) --
    // confirm against the full source.
    union {
    };
};
2652 
2653 
/// Helper: shuffle/swizzle with constant (templated) indices.
/// Example: shuffle<1,1,2,2,5,5,6,6>(a) returns
/// (a[1],a[1],a[2],a[2],a[5],a[5],a[6],a[6]).
template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i,...>(a)
template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> OIIO_FORCEINLINE float extract (const vfloat8& a);

/// Helper: substitute val for a[i]
template<int i> OIIO_FORCEINLINE vfloat8 insert (const vfloat8& a, float val);

/// The sum of all components, returned in all components.
vfloat8 vreduce_add (const vfloat8& v);

/// The sum of all components, returned as a scalar.
float reduce_add (const vfloat8& v);

/// Return the float dot (inner) product of a and b in every component.
vfloat8 vdot (const vfloat8 &a, const vfloat8 &b);

/// Return the float dot (inner) product of a and b.
float dot (const vfloat8 &a, const vfloat8 &b);

/// Return the float 3-component dot (inner) product of a and b in
/// all components.
vfloat8 vdot3 (const vfloat8 &a, const vfloat8 &b);

/// Return the float 3-component dot (inner) product of a and b.
float dot3 (const vfloat8 &a, const vfloat8 &b);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
vfloat8 blend (const vfloat8& a, const vfloat8& b, const vbool8& mask);

/// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
/// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
/// blend(0,a,mask).
vfloat8 blend0 (const vfloat8& a, const vbool8& mask);

/// Use a bool mask to select between components of a (if mask[i] is false)
/// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
/// blend(0,a,!mask), or blend(a,0,mask).
vfloat8 blend0not (const vfloat8& a, const vbool8& mask);

/// "Safe" divide of vfloat8/vfloat8 -- for any component of the divisor
/// that is 0, return 0 rather than Inf.
vfloat8 safe_div (const vfloat8 &a, const vfloat8 &b);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vfloat8 select (const vbool8& mask, const vfloat8& a, const vfloat8& b);

// Per-element math
vfloat8 abs (const vfloat8& a);    ///< absolute value (float)
vfloat8 sign (const vfloat8& a);   ///< 1.0 when value >= 0, -1 when negative
vfloat8 ceil (const vfloat8& a);
vfloat8 floor (const vfloat8& a);
2716 vint8 ifloor (const vfloat8& a); ///< (int)floor
2717 inline vint8 floori (const vfloat8& a) { return ifloor(a); } // DEPRECATED(1.8) alias
2718 
/// Per-element round to nearest integer.
/// CAVEAT: the rounding when mid-way between integers may differ depending
/// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
/// integer) but std::round() says to round away from 0 regardless of
/// current rounding mode (but that is multiple instructions on x64).
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::round().
vfloat8 round (const vfloat8& a);

/// Per-element round to nearest integer (equivalent to vint(round(a))).
/// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
/// C++ std::rint() which says to use the current rounding mode.
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::rint().
vint8 rint (const vfloat8& a);

vfloat8 rcp_fast (const vfloat8 &a);   ///< Fast, approximate 1/a
vfloat8 sqrt (const vfloat8 &a);
vfloat8 rsqrt (const vfloat8 &a);      ///< Fully accurate 1/sqrt
vfloat8 rsqrt_fast (const vfloat8 &a); ///< Fast, approximate 1/sqrt
vfloat8 min (const vfloat8& a, const vfloat8& b); ///< Per-element min
vfloat8 max (const vfloat8& a, const vfloat8& b); ///< Per-element max
// vfloat8 exp (const vfloat8& v); // See template with vfloat4
// vfloat8 log (const vfloat8& v); // See template with vfloat4

/// andnot(a,b) returns ((~a) & b)
vfloat8 andnot (const vfloat8& a, const vfloat8& b);

// Fused multiply and add (or subtract):
vfloat8 madd (const vfloat8& a, const vfloat8& b, const vfloat8& c);  // a*b + c
vfloat8 msub (const vfloat8& a, const vfloat8& b, const vfloat8& c);  // a*b - c
vfloat8 nmadd (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b + c
vfloat8 nmsub (const vfloat8& a, const vfloat8& b, const vfloat8& c); // -a*b - c
2752 
2753 
2754 
/// Floating point 16-vector, accelerated by SIMD instructions when
/// available.
class vfloat16 {
public:
    static const char* type_name() { return "vfloat16"; }
    typedef float value_t;        ///< Underlying equivalent scalar value type
    enum { elements = 16 };       ///< Number of scalar elements
    enum { paddedelements = 16 }; ///< Number of scalar elements for full pad
    enum { bits = elements*32 };  ///< Total number of bits
    typedef simd_raw_t<float,16>::type simd_t;  ///< the native SIMD type used
    typedef vfloat16 vfloat_t;    ///< SIMD float type
    typedef vint16 vint_t;        ///< SIMD int type
    typedef vbool16 vbool_t;      ///< SIMD bool type
    OIIO_DEPRECATED("use vint_t (1.8)")
    typedef vint16 int_t;         // old name (deprecated 1.8)
    OIIO_DEPRECATED("use vbool_t (1.8)")
    typedef vbool16 bool_t;       // old name (deprecated 1.8)

    /// Default constructor (contents undefined)
    vfloat16 () { }

    /// Construct from a single value (store it in all slots)
    vfloat16 (float a) { load(a); }

    /// Construct from 16 values
    vfloat16 (float v0, float v1, float v2, float v3,
              float v4, float v5, float v6, float v7,
              float v8, float v9, float v10, float v11,
              float v12, float v13, float v14, float v15);

    /// Construct from a pointer to 16 contiguous float values
    vfloat16 (const float *f) { load (f); }

    /// Copy construct from another vfloat16
    vfloat16 (const vfloat16 &other) { m_simd = other.m_simd; }

    /// Construct from an int vector (promoting all components to float)
    explicit vfloat16 (const vint16& ival);

    /// Construct from two vfloat8's (lo = components 0-7, hi = 8-15)
    vfloat16 (const vfloat8 &lo, const vfloat8 &hi);

    /// Construct from four vfloat4's
    vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d);

    /// Construct from the underlying SIMD type
    vfloat16 (const simd_t& m) : m_simd(m) { }

    /// Return the raw SIMD type
    operator simd_t () const { return m_simd; }
    simd_t simd () const { return m_simd; }
    simd_t& simd () { return m_simd; }

    /// Return a pointer to the underlying scalar type
    const value_t* data () const { return (const value_t*)this; }
    value_t* data () { return (value_t*)this; }

    /// Construct from a pointer to 16 unsigned short values
    explicit vfloat16 (const unsigned short *vals) { load(vals); }

    /// Construct from a pointer to 16 short values
    explicit vfloat16 (const short *vals) { load(vals); }

    /// Construct from a pointer to 16 unsigned char values
    explicit vfloat16 (const unsigned char *vals) { load(vals); }

    /// Construct from a pointer to 16 char values
    explicit vfloat16 (const char *vals) { load(vals); }

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Construct from a pointer to half (16 bit float) values
    explicit vfloat16 (const half *vals) { load(vals); }
#endif

    /// Assign a single value to all components
    const vfloat16& operator= (float a) { load(a); return *this; }

    /// Assign a vfloat16
    const vfloat16& operator= (vfloat16 other) {
        m_simd = other.m_simd;
        return *this;
    }

    /// Return a vfloat16 with all components set to 0.0
    static const vfloat16 Zero ();

    /// Return a vfloat16 with all components set to 1.0
    static const vfloat16 One ();

    /// Return a vfloat16 with incremented components (e.g., 0,1,2,3,...)
    /// Optional argument can give a non-zero starting point and non-1 step.
    static const vfloat16 Iota (float start=0.0f, float step=1.0f);

    /// Set all components to 0.0
    void clear ();

    /// Component access (get)
    float operator[] (int i) const;
    /// Component access (set)
    float& operator[] (int i);

    /// Component access (set).
    void setcomp (int i, float value);

    // Named get/set access to the first four components.
    value_t x () const;
    value_t y () const;
    value_t z () const;
    value_t w () const;
    void set_x (value_t val);
    void set_y (value_t val);
    void set_z (value_t val);
    void set_w (value_t val);

    /// Extract the lower half (components 0-7) as a vfloat8
    vfloat8 lo () const;

    /// Extract the upper half (components 8-15) as a vfloat8
    vfloat8 hi () const;

    /// Helper: load a single value into all components
    void load (float val);

    /// Load separate values into each component.
    void load (float v0, float v1, float v2, float v3,
               float v4, float v5, float v6, float v7,
               float v8, float v9, float v10, float v11,
               float v12, float v13, float v14, float v15);

    /// Load from an array of values
    void load (const float *values);

    /// Load from a partial array of <=16 values. Unassigned values are
    /// undefined.
    void load (const float *values, int n);

    /// Load from an array of 16 unsigned short values, convert to float
    void load (const unsigned short *values);

    /// Load from an array of 16 short values, convert to float
    void load (const short *values);

    /// Load from an array of 16 unsigned char values, convert to float
    void load (const unsigned char *values);

    /// Load from an array of 16 char values, convert to float
    void load (const char *values);

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Load from an array of 16 half values, convert to float
    void load (const half *values);
#endif /* _HALF_H_ or _IMATH_H_ */

    /// Store all 16 values to memory
    void store (float *values) const;

    /// Store the first n values into memory
    void store (float *values, int n) const;

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
    /// Store 16 values, converting to half
    void store (half *values) const;
#endif

    /// Masked load -- read from values[] where mask is 1, load zero where
    /// mask is 0.
    void load_mask (const vbool_t &mask, const value_t *values);
    void load_mask (int mask, const value_t *values) { load_mask(vbool_t(mask), values); }

    /// Masked store -- write to values[] where mask is enabled, don't
    /// touch values[] where it's not.
    void store_mask (const vbool_t &mask, value_t *values) const;
    void store_mask (int mask, value_t *values) const { store_mask(vbool_t(mask), values); }

    /// Load values from addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void gather (const value_t *baseptr, const vint_t& vindex);
    /// Gather elements defined by the mask, leave others unchanged.
    template<int scale=4>
    void gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex);
    template<int scale=4>
    void gather_mask (int mask, const value_t *baseptr, const vint_t& vindex) {
        gather_mask<scale> (vbool_t(mask), baseptr, vindex);
    }

    /// Store values at addresses (char*)baseptr + vindex[i]*scale
    template<int scale=4>
    void scatter (value_t *baseptr, const vint_t& vindex) const;
    /// Scatter elements defined by the mask
    template<int scale=4>
    void scatter_mask (const vbool_t& mask, value_t *baseptr, const vint_t& vindex) const;
    template<int scale=4>
    void scatter_mask (int mask, value_t *baseptr, const vint_t& vindex) const {
        scatter_mask<scale> (vbool_t(mask), baseptr, vindex);
    }

    // Arithmetic operators (component-by-component)
    friend vfloat16 operator+ (const vfloat16& a, const vfloat16& b);
    friend vfloat16 operator- (const vfloat16& a);
    friend vfloat16 operator- (const vfloat16& a, const vfloat16& b);
    friend vfloat16 operator* (const vfloat16& a, const vfloat16& b);
    friend vfloat16 operator* (const vfloat16& a, float b);
    friend vfloat16 operator* (float a, const vfloat16& b);
    friend vfloat16 operator/ (const vfloat16& a, const vfloat16& b);
    friend vfloat16 operator% (const vfloat16& a, const vfloat16& b);
    friend const vfloat16 & operator+= (vfloat16& a, const vfloat16& b);
    friend const vfloat16 & operator-= (vfloat16& a, const vfloat16& b);
    friend const vfloat16 & operator*= (vfloat16& a, const vfloat16& b);
    friend const vfloat16 & operator/= (vfloat16& a, const vfloat16& b);

    // Comparison operations
    friend vbool16 operator== (const vfloat16& a, const vfloat16& b);
    friend vbool16 operator!= (const vfloat16& a, const vfloat16& b);
    friend vbool16 operator< (const vfloat16& a, const vfloat16& b);
    friend vbool16 operator> (const vfloat16& a, const vfloat16& b);
    friend vbool16 operator>= (const vfloat16& a, const vfloat16& b);
    friend vbool16 operator<= (const vfloat16& a, const vfloat16& b);

    // Some oddball items that are handy

    /// Stream output
    friend inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val);

protected:
    // The actual data representation.
    // NOTE(review): the union's members are elided in this extract of the
    // header (presumably the native simd_t plus a scalar float array) --
    // confirm against the full source.
    union {
    };
};
2983 
2984 
/// Shuffle groups of 4: output group g is input group i&lt;g&gt;.
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);

/// shuffle4<i>(a) is the same as shuffle4<i,i,i,i>(a)
template<int i> OIIO_FORCEINLINE vfloat16 shuffle4 (const vfloat16& a);

/// Shuffle within each group of 4
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE vfloat16 shuffle (const vfloat16& a);

/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a)
template<int i> vfloat16 shuffle (const vfloat16& a);

/// Helper: as rapid as possible extraction of one component, when the
/// index is fixed.
template<int i> OIIO_FORCEINLINE float extract (const vfloat16& a);

/// Helper: substitute val for a[i]
template<int i> OIIO_FORCEINLINE vfloat16 insert (const vfloat16& a, float val);

/// The sum of all components, returned in all components.
vfloat16 vreduce_add (const vfloat16& v);

/// The sum of all components, returned as a scalar.
float reduce_add (const vfloat16& v);
3011 
3012 /// Use a bool mask to select between components of a (if mask[i] is false)
3013 /// and b (if mask[i] is true), i.e., mask[i] ? b[i] : a[i].
3014 vfloat16 blend (const vfloat16& a, const vfloat16& b, const vbool4& mask);
3015 
3016 /// Use a bool mask to select between `a` (if mask[i] is true) or 0 if
3017 /// mask[i] is false), i.e., mask[i] ? a[i] : 0. Equivalent to
3018 /// blend(0,a,mask).
3019 vfloat16 blend0 (const vfloat16& a, const vbool4& mask);
3020 
3021 /// Use a bool mask to select between components of a (if mask[i] is false)
3022 /// or 0 (if mask[i] is true), i.e., mask[i] ? 0 : a[i]. Equivalent to
3023 /// blend(0,a,!mask), or blend(a,0,mask).
3024 vfloat16 blend0not (const vfloat16& a, const vbool4& mask);
3025 
/// "Safe" divide of vfloat16/vfloat16 -- for any component of the divisor
/// that is 0, return 0 rather than Inf.
vfloat16 safe_div (const vfloat16 &a, const vfloat16 &b);

/// Select 'a' where mask is true, 'b' where mask is false. Sure, it's a
/// synonym for blend with arguments rearranged, but this is more clear
/// because the arguments are symmetric to scalar (cond ? a : b).
vfloat16 select (const vbool16& mask, const vfloat16& a, const vfloat16& b);

// Per-element math
vfloat16 abs (const vfloat16& a);   ///< absolute value (float)
vfloat16 sign (const vfloat16& a);  ///< 1.0 when value >= 0, -1 when negative
vfloat16 ceil (const vfloat16& a);
vfloat16 floor (const vfloat16& a);
vint16 ifloor (const vfloat16& a);  ///< (int)floor
OIIO_DEPRECATED("use ifloor (1.8)")
inline vint16 floori (const vfloat16& a) { return ifloor(a); } // DEPRECATED(1.8) alias

/// Per-element round to nearest integer.
/// CAVEAT: the rounding when mid-way between integers may differ depending
/// on hardware. Intel SSE/AVX does "banker's rounding" (to nearest even
/// integer) but std::round() says to round away from 0 regardless of
/// current rounding mode (but that is multiple instructions on x64).
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::round().
vfloat16 round (const vfloat16& a);

/// Per-element round to nearest integer (equivalent to vint(round(a))).
/// CAVEAT: On SSE/AVX this uses banker's rounding, which may differ from
/// C++ std::rint() which says to use the current rounding mode.
/// USE WITH CAUTION, and maybe avoid this if it is critical to exactly
/// match std::rint().
vint16 rint (const vfloat16& a);

vfloat16 rcp_fast (const vfloat16 &a);   ///< Fast, approximate 1/a
vfloat16 sqrt (const vfloat16 &a);
vfloat16 rsqrt (const vfloat16 &a);      ///< Fully accurate 1/sqrt
vfloat16 rsqrt_fast (const vfloat16 &a); ///< Fast, approximate 1/sqrt
vfloat16 min (const vfloat16& a, const vfloat16& b); ///< Per-element min
vfloat16 max (const vfloat16& a, const vfloat16& b); ///< Per-element max
// vfloat16 exp (const vfloat16& v); // See template with vfloat4
// vfloat16 log (const vfloat16& v); // See template with vfloat4

/// andnot(a,b) returns ((~a) & b)
vfloat16 andnot (const vfloat16& a, const vfloat16& b);

// Fused multiply and add (or subtract):
vfloat16 madd (const vfloat16& a, const vfloat16& b, const vfloat16& c);  // a*b + c
vfloat16 msub (const vfloat16& a, const vfloat16& b, const vfloat16& c);  // a*b - c
vfloat16 nmadd (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b + c
vfloat16 nmsub (const vfloat16& a, const vfloat16& b, const vfloat16& c); // -a*b - c
3077 
3078 
3079 
3080 // Odds and ends, other CPU hardware tricks
3081 
// Try to set the flush_zero_mode CPU flag on x86. Return true if we are
// able, otherwise false (because it's not available on that platform).
inline bool set_flush_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    _MM_SET_FLUSH_ZERO_MODE (on ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF);
    return true;
#else
    // Non-x86 (or CUDA device) path: the flag is unavailable. Explicitly
    // consume `on` so -Wunused-parameter stays quiet.
    (void)on;
    return false;
#endif
}
3091 
// Try to set the denorms_zero_mode CPU flag on x86. Return true if we are
// able, otherwise false (because it's not available on that platform).
inline bool set_denorms_zero_mode (bool on) {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    _MM_SET_DENORMALS_ZERO_MODE (on ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF);
    return true;
#else
    // Non-x86 (or CUDA device) path: the flag is unavailable. Explicitly
    // consume `on` so -Wunused-parameter stays quiet.
    (void)on;
    return false;
#endif
}
3101 
// Query the x86 flush-to-zero MXCSR flag; on platforms without it (or in
// CUDA device code), report false.
inline bool get_flush_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
#else
    return false;
#endif
}
3109 
// Query the x86 denormals-are-zero MXCSR flag; on platforms without it (or
// in CUDA device code), report false.
inline bool get_denorms_zero_mode () {
#if (defined(__x86_64__) || defined(__i386__)) && !defined(__CUDA_ARCH__)
    return _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
#else
    return false;
#endif
}
3117 
3118 
3119 
3120 
3121 
3122 
3123 //////////////////////////////////////////////////////////////////////////
3124 //////////////////////////////////////////////////////////////////////////
3125 //
3126 // Gory implementation details follow.
3127 //
// ^^^ All declarations and documentation are above ^^^
3129 //
3130 // vvv Below is the implementation, often considerably cluttered with
// #if's for each architecture, and unapologetic use of intrinsics and
3132 // every manner of dirty trick we can think of to make things fast.
3133 // Some of this isn't pretty. We won't recapitulate comments or
3134 // documentation of what the functions are supposed to do, please
3135 // consult the declarations above for that.
3136 //
3137 // Here be dragons.
3138 //
3139 //////////////////////////////////////////////////////////////////////////
3140 //////////////////////////////////////////////////////////////////////////
3141 
3142 
3143 
3144 //////////////////////////////////////////////////////////////////////
3145 // vbool4 implementation
3146 
3147 
3149  OIIO_DASSERT(i >= 0 && i < elements);
3150 #if OIIO_SIMD_SSE
3151  return ((_mm_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3152 #else
3153  return m_val[i];
3154 #endif
3155 }
3156 
3158  OIIO_DASSERT(i >= 0 && i < elements);
3159  return m_val[i];
3160 }
3161 
3162 
3164  OIIO_DASSERT(i >= 0 && i < elements);
3165  m_val[i] = value ? -1 : 0;
3166 }
3167 
3168 
3169 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool4& a) {
3170  cout << a[0];
3171  for (int i = 1; i < a.elements; ++i)
3172  cout << ' ' << a[i];
3173  return cout;
3174 }
3175 
3176 
3178 #if OIIO_SIMD_SSE
3179  m_simd = _mm_castsi128_ps(_mm_set1_epi32(-int(a)));
3180 #elif OIIO_SIMD_NEON
3181  m_simd = vdupq_n_u32(a ? 0xffffffff : 0);
3182 #else
3183  int val = -int(a);
3184  SIMD_CONSTRUCT (val);
3185 #endif
3186 }
3187 
3188 
/// Set the four lanes from four bools: lane i holds all-1 bits when the
/// corresponding argument is true, all-0 bits when false.
OIIO_FORCEINLINE void vbool4::load (bool a, bool b, bool c, bool d) {
#if OIIO_SIMD_SSE
    // N.B. -- we need to reverse the order because of our convention
    // of storing a,b,c,d in the same order in memory
    // (_mm_set_epi32 takes its arguments high-lane-first).
    m_simd = _mm_castsi128_ps(_mm_set_epi32(-int(d), -int(c), -int(b), -int(a)));
// #elif OIIO_SIMD_NEON
//     FIXME
#else
    // Scalar fallback: -int(true) == -1 yields the all-1-bits lane value.
    m_val[0] = -int(a);
    m_val[1] = -int(b);
    m_val[2] = -int(c);
    m_val[3] = -int(d);
#endif
}
3203 
3205  load (a[0], a[1], a[2], a[3]);
3206 }
3207 
3209  m_simd = other.m_simd;
3210  return *this;
3211 }
3212 
3213 
3215 #if OIIO_SIMD_SSE
3216  return _mm_movemask_ps(m_simd);
3217 #else
3218  int r = 0;
3219  for (int i = 0; i < elements; ++i)
3220  if (m_val[i])
3221  r |= 1<<i;
3222  return r;
3223 #endif
3224 }
3225 
3226 
3228 vbool4::from_bitmask (int bitmask) {
3229  // I think this is a fast conversion from int bitmask to vbool4
3230  return (vint4::Giota() & vint4(bitmask)) != vint4::Zero();
3231 }
3232 
3233 
3235 #if OIIO_SIMD_SSE
3236  m_simd = _mm_setzero_ps();
3237 #else
3238  *this = false;
3239 #endif
3240 }
3241 
3242 
3244 #if OIIO_SIMD_SSE
3245  return _mm_setzero_ps();
3246 #else
3247  return false;
3248 #endif
3249 }
3250 
3252  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3253 #if OIIO_SIMD_SSE
3254 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
3255  __m128i anyval = _mm_undefined_si128();
3256 # else
3257  __m128i anyval = _mm_setzero_si128();
3258 # endif
3259  return _mm_castsi128_ps (_mm_cmpeq_epi8 (anyval, anyval));
3260 #else
3261  return true;
3262 #endif
3263 }
3264 
3266  SIMD_DO (values[i] = m_val[i] ? true : false);
3267 }
3268 
3269 OIIO_FORCEINLINE void vbool4::store (bool *values, int n) const {
3270  OIIO_DASSERT (n >= 0 && n <= elements);
3271  for (int i = 0; i < n; ++i)
3272  values[i] = m_val[i] ? true : false;
3273 }
3274 
3275 
3276 
3278 #if OIIO_SIMD_SSE
3279  return _mm_xor_ps (a.simd(), vbool4::True());
3280 #elif OIIO_SIMD_NEON
3281  return vmvnq_u32(a.simd());
3282 #else
3283  SIMD_RETURN (vbool4, a[i] ^ (-1));
3284 #endif
3285 }
3286 
3288 #if OIIO_SIMD_SSE
3289  return _mm_and_ps (a.simd(), b.simd());
3290 #elif OIIO_SIMD_NEON
3291  return vandq_u32(a.simd(), b.simd());
3292 #else
3293  SIMD_RETURN (vbool4, a[i] & b[i]);
3294 #endif
3295 }
3296 
3298 #if OIIO_SIMD_SSE
3299  return _mm_or_ps (a.simd(), b.simd());
3300 #elif OIIO_SIMD_NEON
3301  return vorrq_u32(a.simd(), b.simd());
3302 #else
3303  SIMD_RETURN (vbool4, a[i] | b[i]);
3304 #endif
3305 }
3306 
3308 #if OIIO_SIMD_SSE
3309  return _mm_xor_ps (a.simd(), b.simd());
3310 #elif OIIO_SIMD_NEON
3311  return veorq_u32(a.simd(), b.simd());
3312 #else
3313  SIMD_RETURN (vbool4, a[i] ^ b[i]);
3314 #endif
3315 }
3316 
3317 
3319  return a = a & b;
3320 }
3321 
3323  return a = a | b;
3324 }
3325 
3327  return a = a ^ b;
3328 }
3329 
3331 #if OIIO_SIMD_SSE
3332  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3333  return _mm_xor_ps (a.simd(), vbool4::True());
3334 #elif OIIO_SIMD_NEON
3335  return vmvnq_u32(a.m_simd);
3336 #else
3337  SIMD_RETURN (vbool4, ~a[i]);
3338 #endif
3339 }
3340 
3342 #if OIIO_SIMD_SSE
3343  return _mm_castsi128_ps (_mm_cmpeq_epi32 (_mm_castps_si128 (a), _mm_castps_si128(b)));
3344 #elif OIIO_SIMD_NEON
3345  return vceqq_u32 (a.m_simd, b.m_simd);
3346 #else
3347  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
3348 #endif
3349 }
3350 
3352 #if OIIO_SIMD_SSE
3353  return _mm_xor_ps (a, b);
3354 #elif OIIO_SIMD_NEON
3355  return !(a == b);
3356 #else
3357  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
3358 #endif
3359 }
3360 
3361 
3362 
3363 
#if OIIO_SIMD_SSE
// Shuffling. Use like this: x = shuffle<3,2,1,0>(b)
// Generic 4-lane integer shuffle: result lane k takes source lane i_k.
// _MM_SHUFFLE orders its arguments high-lane-first, hence the reversal.
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE __m128i shuffle_sse (__m128i v) {
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(i3, i2, i1, i0));
}
#endif
3371 
#if OIIO_SIMD_SSE >= 3
// SSE3 has intrinsics for a few special cases, cheaper than the generic
// epi32 shuffle: duplicate even lanes, duplicate odd lanes, and
// duplicate the low 64-bit half.
template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 0, 2, 2> (__m128i a) {
    return _mm_castps_si128(_mm_moveldup_ps(_mm_castsi128_ps(a)));
}
template<> OIIO_FORCEINLINE __m128i shuffle_sse<1, 1, 3, 3> (__m128i a) {
    return _mm_castps_si128(_mm_movehdup_ps(_mm_castsi128_ps(a)));
}
template<> OIIO_FORCEINLINE __m128i shuffle_sse<0, 1, 0, 1> (__m128i a) {
    return _mm_castpd_si128(_mm_movedup_pd(_mm_castsi128_pd(a)));
}
#endif
3384 
#if OIIO_SIMD_SSE
// Generic 4-lane float shuffle, implemented by casting to integer lanes
// and reusing the epi32 shuffle (one instruction either way).
template<int i0, int i1, int i2, int i3>
OIIO_FORCEINLINE __m128 shuffle_sse (__m128 a) {
    return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), _MM_SHUFFLE(i3, i2, i1, i0)));
}
#endif
3391 
#if OIIO_SIMD_SSE >= 3
// SSE3 has intrinsics for a few special cases of the float shuffle:
// duplicate even lanes, duplicate odd lanes, duplicate the low half.
template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 0, 2, 2> (__m128 a) {
    return _mm_moveldup_ps(a);
}
template<> OIIO_FORCEINLINE __m128 shuffle_sse<1, 1, 3, 3> (__m128 a) {
    return _mm_movehdup_ps(a);
}
template<> OIIO_FORCEINLINE __m128 shuffle_sse<0, 1, 0, 1> (__m128 a) {
    return _mm_castpd_ps(_mm_movedup_pd(_mm_castps_pd(a)));
}
#endif
3404 
3405 
3406 /// Helper: shuffle/swizzle with constant (templated) indices.
3407 /// Example: shuffle<1,1,2,2>(vbool4(a,b,c,d)) returns (b,b,c,c)
3408 template<int i0, int i1, int i2, int i3>
3410 #if OIIO_SIMD_SSE
3411  return shuffle_sse<i0,i1,i2,i3> (a.simd());
3412 #else
3413  return vbool4 (a[i0], a[i1], a[i2], a[i3]);
3414 #endif
3415 }
3416 
/// shuffle<i>(a) is the same as shuffle<i,i,i,i>(a): broadcast lane i
/// to all four lanes.
template<int i> OIIO_FORCEINLINE vbool4 shuffle (const vbool4& a) {
    return shuffle<i,i,i,i>(a);
}
3421 
3422 
3423 /// Helper: as rapid as possible extraction of one component, when the
3424 /// index is fixed.
3425 template<int i>
3427 #if OIIO_SIMD_SSE >= 4
3428  return _mm_extract_epi32(_mm_castps_si128(a.simd()), i); // SSE4.1 only
3429 #else
3430  return a[i];
3431 #endif
3432 }
3433 
3434 /// Helper: substitute val for a[i]
3435 template<int i>
3437 #if OIIO_SIMD_SSE >= 4
3438  int ival = -int(val);
3439  return _mm_castsi128_ps (_mm_insert_epi32 (_mm_castps_si128(a), ival, i));
3440 #else
3441  vbool4 tmp = a;
3442  tmp[i] = -int(val);
3443  return tmp;
3444 #endif
3445 }
3446 
3448 #if OIIO_SIMD_AVX
3449  return _mm_testc_ps (v, vbool4(true)) != 0;
3450 #elif OIIO_SIMD_SSE
3451  return _mm_movemask_ps(v.simd()) == 0xf;
3452 #else
3453  SIMD_RETURN_REDUCE (bool, true, r &= (v[i] != 0));
3454 #endif
3455 }
3456 
3458 #if OIIO_SIMD_AVX
3459  return ! _mm_testz_ps (v, v);
3460 #elif OIIO_SIMD_SSE
3461  return _mm_movemask_ps(v) != 0;
3462 #else
3463  SIMD_RETURN_REDUCE (bool, false, r |= (v[i] != 0));
3464 #endif
3465 }
3466 
3467 OIIO_FORCEINLINE bool all (const vbool4& v) { return reduce_and(v) == true; }
3468 OIIO_FORCEINLINE bool any (const vbool4& v) { return reduce_or(v) == true; }
3469 OIIO_FORCEINLINE bool none (const vbool4& v) { return reduce_or(v) == false; }
3470 
3471 
3472 
3473 //////////////////////////////////////////////////////////////////////
3474 // vbool8 implementation
3475 
3476 
3478  OIIO_DASSERT(i >= 0 && i < elements);
3479 #if OIIO_SIMD_AVX
3480  return ((_mm256_movemask_ps(m_simd) >> i) & 1) ? -1 : 0;
3481 #else
3482  return m_val[i];
3483 #endif
3484 }
3485 
3487  OIIO_DASSERT(i >= 0 && i < elements);
3488  m_val[i] = value ? -1 : 0;
3489 }
3490 
3492  OIIO_DASSERT(i >= 0 && i < elements);
3493  return m_val[i];
3494 }
3495 
3496 
3497 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool8& a) {
3498  cout << a[0];
3499  for (int i = 1; i < a.elements; ++i)
3500  cout << ' ' << a[i];
3501  return cout;
3502 }
3503 
3504 
3506 #if OIIO_SIMD_AVX
3507  m_simd = _mm256_castsi256_ps(_mm256_set1_epi32(-int(a)));
3508 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
3509  m_4[0].load(a);
3510  m_4[1].load(a);
3511 #else
3512  int val = -int(a);
3513  SIMD_CONSTRUCT (val);
3514 #endif
3515 }
3516 
3517 
/// Set the eight lanes from eight bools: lane i holds all-1 bits when
/// the corresponding argument is true, all-0 bits when false.
OIIO_FORCEINLINE void vbool8::load (bool a, bool b, bool c, bool d,
                                    bool e, bool f, bool g, bool h) {
#if OIIO_SIMD_AVX
    // N.B. -- we need to reverse the order because of our convention
    // of storing a,b,c,d in the same order in memory
    // (_mm256_set_epi32 takes its arguments high-lane-first).
    m_simd = _mm256_castsi256_ps(_mm256_set_epi32(-int(h), -int(g), -int(f), -int(e),
                                                  -int(d), -int(c), -int(b), -int(a)));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    // No 8-wide registers: build the two 4-lane halves separately.
    m_4[0].load(a, b, c, d);
    m_4[1].load(e, f, g, h);
#else
    // Scalar fallback: -int(true) == -1 yields the all-1-bits lane value.
    m_val[0] = -int(a);
    m_val[1] = -int(b);
    m_val[2] = -int(c);
    m_val[3] = -int(d);
    m_val[4] = -int(e);
    m_val[5] = -int(f);
    m_val[6] = -int(g);
    m_val[7] = -int(h);
#endif
}
3539 
/// Construct from eight bools, one per lane.
OIIO_FORCEINLINE vbool8::vbool8 (bool a, bool b, bool c, bool d,
                                 bool e, bool f, bool g, bool h) {
    load (a, b, c, d, e, f, g, h);
}

/// Construct from eight ints; any nonzero value turns that lane on.
OIIO_FORCEINLINE vbool8::vbool8 (int a, int b, int c, int d,
                                 int e, int f, int g, int h) {
    load (bool(a), bool(b), bool(c), bool(d),
          bool(e), bool(f), bool(g), bool(h));
}
3550 
3552  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
3553 }
3554 
3555 
3557  load(a);
3558  return *this;
3559 }
3560 
3562  m_simd = other.m_simd;
3563  return *this;
3564 }
3565 
3567 #if OIIO_SIMD_AVX
3568  return _mm256_movemask_ps(m_simd);
3569 #else
3570  return lo().bitmask() | (hi().bitmask() << 4);
3571 #endif
3572 }
3573 
3574 
3576 vbool8::from_bitmask (int bitmask) {
3577  // I think this is a fast conversion from int bitmask to vbool8
3578  return (vint8::Giota() & vint8(bitmask)) != vint8::Zero();
3579 }
3580 
3581 
3583 #if OIIO_SIMD_AVX
3584  m_simd = _mm256_setzero_ps();
3585 #else
3586  *this = false;
3587 #endif
3588 }
3589 
3591 #if OIIO_SIMD_AVX
3592  return _mm256_setzero_ps();
3593 #else
3594  return false;
3595 #endif
3596 }
3597 
3598 
3600 #if OIIO_SIMD_AVX
3601 # if OIIO_SIMD_AVX >= 2 && (OIIO_GNUC_VERSION > 50000)
3602  // Fastest way to fill with all 1 bits is to cmp any value to itself.
3603  __m256i anyval = _mm256_undefined_si256();
3604  return _mm256_castsi256_ps (_mm256_cmpeq_epi8 (anyval, anyval));
3605 # else
3606  return _mm256_castsi256_ps (_mm256_set1_epi32 (-1));
3607 # endif
3608 #else
3609  return true;
3610 #endif
3611 }
3612 
3613 
3615  SIMD_DO (values[i] = m_val[i] ? true : false);
3616 }
3617 
3618 OIIO_FORCEINLINE void vbool8::store (bool *values, int n) const {
3619  OIIO_DASSERT (n >= 0 && n <= elements);
3620  for (int i = 0; i < n; ++i)
3621  values[i] = m_val[i] ? true : false;
3622 }
3623 
3624 
3626 #if OIIO_SIMD_AVX
3627  return _mm256_castps256_ps128 (simd());
3628 #else
3629  return m_4[0];
3630 #endif
3631 }
3632 
3634 #if OIIO_SIMD_AVX
3635  return _mm256_extractf128_ps (simd(), 1);
3636 #else
3637  return m_4[1];
3638 #endif
3639 }
3640 
3641 
3643 #if OIIO_SIMD_AVX
3644  __m256 r = _mm256_castps128_ps256 (lo);
3645  m_simd = _mm256_insertf128_ps (r, hi, 1);
3646  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
3647 #else
3648  m_4[0] = lo;
3649  m_4[1] = hi;
3650 #endif
3651 }
3652 
3653 
3655 #if OIIO_SIMD_AVX
3656  return _mm256_xor_ps (a.simd(), vbool8::True());
3657 #else
3658  SIMD_RETURN (vbool8, a[i] ^ (-1));
3659 #endif
3660 }
3661 
3663 #if OIIO_SIMD_AVX
3664  return _mm256_and_ps (a.simd(), b.simd());
3665 #else
3666  SIMD_RETURN (vbool8, a[i] & b[i]);
3667 #endif
3668 }
3669 
3671 #if OIIO_SIMD_AVX
3672  return _mm256_or_ps (a.simd(), b.simd());
3673 #else
3674  SIMD_RETURN (vbool8, a[i] | b[i]);
3675 #endif
3676 }
3677 
3679 #if OIIO_SIMD_AVX
3680  return _mm256_xor_ps (a.simd(), b.simd());
3681 #else
3682  SIMD_RETURN (vbool8, a[i] ^ b[i]);
3683 #endif
3684 }
3685 
3686 
3688  return a = a & b;
3689 }
3690 
3692  return a = a | b;
3693 }
3694 
3696  return a = a ^ b;
3697 }
3698 
3699 
3701 #if OIIO_SIMD_AVX
3702  // Fastest way to bit-complement in SSE is to xor with 0xffffffff.
3703  return _mm256_xor_ps (a.simd(), vbool8::True());
3704 #else
3705  SIMD_RETURN (vbool8, ~a[i]);
3706 #endif
3707 }
3708 
3709 
3711 #if OIIO_SIMD_AVX >= 2
3712  return _mm256_castsi256_ps (_mm256_cmpeq_epi32 (_mm256_castps_si256 (a), _mm256_castps_si256(b)));
3713 #elif OIIO_SIMD_AVX
3714  return _mm256_cmp_ps (a, b, _CMP_EQ_UQ);
3715 #else
3716  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
3717 #endif
3718 }
3719 
3721 #if OIIO_SIMD_AVX
3722  return _mm256_xor_ps (a, b);
3723 #else
3724  SIMD_RETURN (vbool8, a[i] != b[i] ? -1 : 0);
3725 #endif
3726 }
3727 
3728 
3729 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
3731 #if OIIO_SIMD_AVX >= 2
3732  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
3733  return _mm256_permutevar8x32_ps (a.simd(), index.simd());
3734 #else
3735  return vbool8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
3736 #endif
3737 }
3738 
/// shuffle<i>(a) broadcasts lane i to all eight lanes.
template<int i> OIIO_FORCEINLINE vbool8 shuffle (const vbool8& a) {
    return shuffle<i,i,i,i,i,i,i,i>(a);
}
3742 
3743 
3744 template<int i>
3746 #if OIIO_SIMD_AVX && !_WIN32
3747  return _mm256_extract_epi32(_mm256_castps_si256(a.simd()), i); // SSE4.1 only
3748 #else
3749  return a[i];
3750 #endif
3751 }
3752 
3753 template<int i>
3755 #if OIIO_SIMD_AVX && !_WIN32
3756  int ival = -int(val);
3757  return _mm256_castsi256_ps (_mm256_insert_epi32 (_mm256_castps_si256(a.simd()), ival, i));
3758 #else
3759  vbool8 tmp = a;
3760  tmp[i] = -int(val);
3761  return tmp;
3762 #endif
3763 }
3764 
3765 
3767 #if OIIO_SIMD_AVX
3768  return _mm256_testc_ps (v, vbool8(true)) != 0;
3769  // return _mm256_movemask_ps(v.simd()) == 0xff;
3770 #else
3771  SIMD_RETURN_REDUCE (bool, true, r &= bool(v[i]));
3772 #endif
3773 }
3774 
3776 #if OIIO_SIMD_AVX
3777  return ! _mm256_testz_ps (v, v); // FIXME? Not in all immintrin.h !
3778  // return _mm256_movemask_ps(v) != 0;
3779 #else
3780  SIMD_RETURN_REDUCE (bool, false, r |= bool(v[i]));
3781 #endif
3782 }
3783 
3784 
3785 OIIO_FORCEINLINE bool all (const vbool8& v) { return reduce_and(v) == true; }
3786 OIIO_FORCEINLINE bool any (const vbool8& v) { return reduce_or(v) == true; }
3787 OIIO_FORCEINLINE bool none (const vbool8& v) { return reduce_or(v) == false; }
3788 
3789 
3790 
3791 //////////////////////////////////////////////////////////////////////
3792 // vbool16 implementation
3793 
3794 
3796  OIIO_DASSERT(i >= 0 && i < elements);
3797 #if OIIO_SIMD_AVX >= 512
3798  return (int(m_simd) >> i) & 1;
3799 #else
3800  return (m_bits >> i) & 1;
3801 #endif
3802 }
3803 
3805  OIIO_DASSERT(i >= 0 && i < elements);
3806  int bits = m_bits;
3807  bits &= (0xffff ^ (1<<i));
3808  bits |= (int(value)<<i);
3809  m_bits = bits;
3810 }
3811 
3812 
3813 OIIO_FORCEINLINE std::ostream& operator<< (std::ostream& cout, const vbool16& a) {
3814  cout << a[0];
3815  for (int i = 1; i < a.elements; ++i)
3816  cout << ' ' << a[i];
3817  return cout;
3818 }
3819 
3820 
3822  m_simd = a ? 0xffff : 0;
3823 }
3824 
3825 
3827  m_simd = simd_t(a);
3828 }
3829 
3830 
3831 OIIO_FORCEINLINE void vbool16::load (bool v0, bool v1, bool v2, bool v3,
3832  bool v4, bool v5, bool v6, bool v7,
3833  bool v8, bool v9, bool v10, bool v11,
3834  bool v12, bool v13, bool v14, bool v15) {
3835  m_simd = simd_t((int(v0) << 0) |
3836  (int(v1) << 1) |
3837  (int(v2) << 2) |
3838  (int(v3) << 3) |
3839  (int(v4) << 4) |
3840  (int(v5) << 5) |
3841  (int(v6) << 6) |
3842  (int(v7) << 7) |
3843  (int(v8) << 8) |
3844  (int(v9) << 9) |
3845  (int(v10) << 10) |
3846  (int(v11) << 11) |
3847  (int(v12) << 12) |
3848  (int(v13) << 13) |
3849  (int(v14) << 14) |
3850  (int(v15) << 15));
3851 }
3852 
/// Construct from sixteen bools, one per lane.
OIIO_FORCEINLINE vbool16::vbool16 (bool v0, bool v1, bool v2, bool v3,
                                   bool v4, bool v5, bool v6, bool v7,
                                   bool v8, bool v9, bool v10, bool v11,
                                   bool v12, bool v13, bool v14, bool v15) {
    load (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
}
3859 
3861  int v4, int v5, int v6, int v7,
3862  int v8, int v9, int v10, int v11,
3863  int v12, int v13, int v14, int v15) {
3864  load (bool(v0), bool(v1), bool(v2), bool(v3),
3865  bool(v4), bool(v5), bool(v6), bool(v7),
3866  bool(v8), bool(v9), bool(v10), bool(v11),
3867  bool(v12), bool(v13), bool(v14), bool(v15));
3868 }
3869 
3871  load_bitmask (a.bitmask() | (b.bitmask() << 8));
3872 }
3873 
3875  load (a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
3876  a[8], a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
3877 }
3878 
3879 
3881  load(a);
3882  return *this;
3883 }
3884 
3886  m_simd = other.m_simd;
3887  return *this;
3888 }
3889 
3890 
3892 #if OIIO_SIMD_AVX >= 512
3893  return int(m_simd);
3894 #else
3895  return int(m_bits);
3896 #endif
3897 }
3898 
3899 
3901  m_simd = simd_t(0);
3902 }
3903 
3905  return simd_t(0);
3906 }
3907 
3908 
3910  return simd_t(0xffff);
3911 }
3912 
3913 
3915  SIMD_DO (values[i] = m_bits & (1<<i));
3916 }
3917 
3918 OIIO_FORCEINLINE void vbool16::store (bool *values, int n) const {
3919  OIIO_DASSERT (n >= 0 && n <= elements);
3920  for (int i = 0; i < n; ++i)
3921  values[i] = m_bits & (1<<i);
3922 }
3923 
3924 
3925 
3927 #if OIIO_SIMD_AVX >= 512
3928  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()&0xff, -1));
3929 #else
3930  SIMD_RETURN (vbool8, (*this)[i] ? -1 : 0);
3931 #endif
3932 }
3933 
3935 #if OIIO_SIMD_AVX >= 512
3936  return _mm256_castsi256_ps (_mm256_maskz_set1_epi32 (bitmask()>>8, -1));
3937 #else
3938  SIMD_RETURN (vbool8, (*this)[i+8] ? -1 : 0);
3939 #endif
3940 }
3941 
3942 
3944 #if OIIO_SIMD_AVX >= 512
3945  return _mm512_knot (a.simd());
3946 #else
3947  return vbool16 (a.m_bits ^ 0xffff);
3948 #endif
3949 }
3950 
3952 #if OIIO_SIMD_AVX >= 512
3953  return _mm512_kand (a.simd(), b.simd());
3954 #else
3955  return vbool16 (a.m_bits & b.m_bits);
3956 #endif
3957 }
3958 
3960 #if OIIO_SIMD_AVX >= 512
3961  return _mm512_kor (a.simd(), b.simd());
3962 #else
3963  return vbool16 (a.m_bits | b.m_bits);
3964 #endif
3965 }
3966 
3968 #if OIIO_SIMD_AVX >= 512
3969  return _mm512_kxor (a.simd(), b.simd());
3970 #else
3971  return vbool16 (a.m_bits ^ b.m_bits);
3972 #endif
3973 }
3974 
3975 
3977  return a = a & b;
3978 }
3979 
3981  return a = a | b;
3982 }
3983 
3985  return a = a ^ b;
3986 }
3987 
3988 
3990  return a ^ vbool16::True();
3991 }
3992 
3993 
3995 #if OIIO_SIMD_AVX >= 512
3996  return _mm512_kxnor (a.simd(), b.simd());
3997 #else
3998  return vbool16 (!(a.m_bits ^ b.m_bits));
3999 #endif
4000 }
4001 
4003 #if OIIO_SIMD_AVX >= 512
4004  return _mm512_kxor (a.simd(), b.simd());
4005 #else
4006  return vbool16 (a.m_bits ^ b.m_bits);
4007 #endif
4008 }
4009 
4010 
4011 template<int i>
4013  return a[i];
4014 }
4015 
4016 template<int i>
4018  vbool16 tmp = a;
4019  tmp.setcomp (i, val);
4020  return tmp;
4021 }
4022 
4023 
4025  return v.bitmask() == 0xffff;
4026 }
4027 
4029  return v.bitmask() != 0;
4030 }
4031 
4032 
4033 OIIO_FORCEINLINE bool all (const vbool16& v) { return reduce_and(v) == true; }
4034 OIIO_FORCEINLINE bool any (const vbool16& v) { return reduce_or(v) == true; }
4035 OIIO_FORCEINLINE bool none (const vbool16& v) { return reduce_or(v) == false; }
4036 
4037 
4038 
4039 
4040 
4041 
4042 //////////////////////////////////////////////////////////////////////
4043 // vint4 implementation
4044 
4046  m_simd = other.m_simd;
4047  return *this;
4048 }
4049 
4052  return m_val[i];
4053 }
4054 
4057  return m_val[i];
4058 }
4059 
4062  m_val[i] = val;
4063 }
4064 
4065 
4067 #if OIIO_SIMD_SSE
4068  m_simd = _mm_set1_epi32 (a);
4069 #elif OIIO_SIMD_NEON
4070  m_simd = vdupq_n_s32 (a);
4071 #else
4072  SIMD_CONSTRUCT (a);
4073 #endif
4074 }
4075 
4076 
4077 
/// Set the four lanes to a, b, c, d (a in lane 0).
OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d) {
#if OIIO_SIMD_SSE
    // _mm_set_epi32 takes its arguments high-lane-first, so reverse them
    // to keep a,b,c,d in memory order.
    m_simd = _mm_set_epi32 (d, c, b, a);
#elif OIIO_SIMD_NEON
    int values[4] = { a, b, c, d };
    m_simd = vld1q_s32 (values);
#else
    m_val[0] = a;
    m_val[1] = b;
    m_val[2] = c;
    m_val[3] = d;
#endif
}
4091 
4092 
4093 // OIIO_FORCEINLINE void vint4::load (int a, int b, int c, int d,
4094 // int e, int f, int g, int h) {
4095 // load (a, b, c, d);
4096 // }
4097 
4098 
4099 
4101 #if OIIO_SIMD_SSE
4102  m_simd = _mm_loadu_si128 ((const simd_t *)values);
4103 #else
4104  SIMD_CONSTRUCT (values[i]);
4105 #endif
4106 }
4107 
4108 
/// Load only the first n (0 <= n <= 4) ints from values; the remaining
/// lanes are set to zero.
OIIO_FORCEINLINE void vint4::load (const int *values, int n)
{
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // Masked load: the low n mask bits are on, and maskz zeroes the rest.
    m_simd = _mm_maskz_loadu_epi32 (__mmask8(~(0xf << n)), values);
#elif OIIO_SIMD_SSE
    switch (n) {
    case 1:
        // Load 4 bytes into lane 0, zeroing the rest.
        m_simd = _mm_castps_si128 (_mm_load_ss ((const float *)values));
        break;
    case 2:
        // Trickery: load one double worth of bits!
        m_simd = _mm_castpd_si128 (_mm_load_sd ((const double*)values));
        break;
    case 3:
        // Trickery: load one double worth of bits, then a float,
        // and combine, casting to ints.
        m_simd = _mm_castps_si128 (_mm_movelh_ps(_mm_castpd_ps(_mm_load_sd((const double*)values)),
                                                 _mm_load_ss ((const float *)values + 2)));
        break;
    case 4:
        m_simd = _mm_loadu_si128 ((const simd_t *)values);
        break;
    default:
        // n == 0 (or out of range): all lanes zero.
        clear ();
        break;
    }
#else
    for (int i = 0; i < n; ++i)
        m_val[i] = values[i];
    for (int i = n; i < elements; ++i)
        m_val[i] = 0;
#endif
}
4143 
4144 
/// Load four unsigned shorts, zero-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint4::load (const unsigned short *values) {
#if OIIO_SIMD_SSE >= 4
    // Trickery: load one double worth of bits = 4 ushorts!
    simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
    m_simd = _mm_cvtepu16_epi32 (a);   // zero-extend u16 -> i32 (SSE4.1)
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
4154 
4155 
/// Load four signed shorts, sign-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint4::load (const short *values) {
#if OIIO_SIMD_SSE >= 4
    // Trickery: load one double worth of bits = 4 shorts!
    simd_t a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
    m_simd = _mm_cvtepi16_epi32 (a);   // sign-extend i16 -> i32 (SSE4.1)
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
4165 
4166 
/// Load four unsigned chars, zero-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {
#if OIIO_SIMD_SSE >= 4
    // Trickery: load one float worth of bits = 4 uchars!
    simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
    m_simd = _mm_cvtepu8_epi32 (a);   // zero-extend u8 -> i32 (SSE4.1)
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
4176 
4177 
4179 #if OIIO_SIMD_SSE >= 4
4180  // Trickery: load one float worth of bits = 4 chars!
4181  simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
4182  m_simd = _mm_cvtepi8_epi32 (a);
4183 #else
4184  SIMD_CONSTRUCT (values[i]);
4185 #endif
4186 }
4187 
4188 
4190 
// Constructors: broadcast a pair into lane pairs, four explicit lanes,
// or load four values from a pointer of various element widths
// (narrower types are widened to int32 by the corresponding load()).
OIIO_FORCEINLINE vint4::vint4 (int a, int b) { load(a,a,b,b); }

OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d) { load(a,b,c,d); }

// OIIO_FORCEINLINE vint4::vint4 (int a, int b, int c, int d,
//                                int e, int f, int g, int h) {
//     load(a,b,c,d,e,f,g,h);
// }

OIIO_FORCEINLINE vint4::vint4 (const int *vals) { load (vals); }
OIIO_FORCEINLINE vint4::vint4 (const unsigned short *vals) { load(vals); }
OIIO_FORCEINLINE vint4::vint4 (const short *vals) { load(vals); }
OIIO_FORCEINLINE vint4::vint4 (const unsigned char *vals) { load(vals); }
OIIO_FORCEINLINE vint4::vint4 (const char *vals) { load(vals); }

// Scalar assignment broadcasts the value to all four lanes.
OIIO_FORCEINLINE const vint4 & vint4::operator= (int a) { load(a); return *this; }
4207 
4208 
4210 #if OIIO_SIMD_SSE
4211  // Use an unaligned store -- it's just as fast when the memory turns
4212  // out to be aligned, nearly as fast even when unaligned. Not worth
4213  // the headache of using stores that require alignment.
4214  _mm_storeu_si128 ((simd_t *)values, m_simd);
4215 #else
4216  SIMD_DO (values[i] = m_val[i]);
4217 #endif
4218 }
4219 
4220 
4222 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4223  m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values);
4224 #elif OIIO_SIMD_AVX >= 2
4225  m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
4226 #else
4227  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0);
4228 #endif
4229 }
4230 
4231 
4233 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4234  m_simd = _mm_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values);
4235 #elif OIIO_SIMD_AVX >= 2
4236  m_simd = _mm_maskload_epi32 (values, _mm_castps_si128(mask));
4237 #else
4238  SIMD_CONSTRUCT (mask[i] ? values[i] : 0);
4239 #endif
4240 }
4241 
4242 
4244 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4245  _mm_mask_storeu_epi32 (values, __mmask8(mask), m_simd);
4246 #elif OIIO_SIMD_AVX >= 2
4247  _mm_maskstore_epi32 (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
4248 #else
4249  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
4250 #endif
4251 }
4252 
4253 
4255 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4256  _mm_mask_storeu_epi32 (values, mask.bitmask(), m_simd);
4257 #elif OIIO_SIMD_AVX >= 2
4258  _mm_maskstore_epi32 (values, _mm_castps_si128(mask), m_simd);
4259 #else
4260  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
4261 #endif
4262 }
4263 
4264 
/// Gather: lane i becomes *(value_t*)((char*)baseptr + vindex[i]*scale).
/// scale is a compile-time byte multiplier on the indices.
template <int scale>
OIIO_FORCEINLINE void
vint4::gather (const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm_i32gather_epi32 (baseptr, vindex, scale);
#else
    SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
#endif
}
4275 
/// Masked gather: lanes whose mask is on are gathered as in gather();
/// lanes whose mask is off keep their current value (AVX2 path) /
/// are set to 0 (scalar path).
/// NOTE(review): the two paths disagree for off lanes (keep vs. zero);
/// also _mm_cvtps_epi32 on an all-bits mask lane relies on the gather
/// checking only the high bit of each element -- TODO confirm intended.
template<int scale>
OIIO_FORCEINLINE void
vint4::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm_cvtps_epi32(mask), scale);
#else
    SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
#endif
}
4286 
/// Scatter: store lane i to *(value_t*)((char*)baseptr + vindex[i]*scale).
template<int scale>
OIIO_FORCEINLINE void
vint4::scatter (value_t *baseptr, const vint_t& vindex) const
{
    // The AVX-512 intrinsic path is deliberately disabled (the leading 0
    // in the condition) because the scalar loop benchmarked faster.
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // FIXME: disable because it benchmarks slower than the dumb way
    _mm_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
#else
    SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}
4298 
4299 template<int scale>
4300 OIIO_FORCEINLINE void
4302  const vint_t& vindex) const
4303 {
4304 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4305  // FIXME: disable because it benchmarks slower than the dumb way
4306  _mm_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale);
4307 #else
4308  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
4309 #endif
4310 }
4311 
4312 
4314 #if OIIO_SIMD_SSE
4315  m_simd = _mm_setzero_si128();
4316 #else
4317  *this = 0;
4318 #endif
4319 }
4320 
4321 
4322 
4324 #if OIIO_SIMD_SSE
4325  return _mm_setzero_si128();
4326 #else
4327  return 0;
4328 #endif
4329 }
4330 
4331 
// Return a vint4 with every lane set to 1.
OIIO_FORCEINLINE const vint4 vint4::One () { return vint4(1); }
4333 
4335 #if OIIO_SIMD_SSE
4336  // Fastest way to fill an __m128 with all 1 bits is to cmpeq_epi8
4337  // any value to itself.
4338 # if OIIO_SIMD_AVX && (OIIO_GNUC_VERSION > 50000)
4339  __m128i anyval = _mm_undefined_si128();
4340 # else
4341  __m128i anyval = _mm_setzero_si128();
4342 # endif
4343  return _mm_cmpeq_epi8 (anyval, anyval);
4344 #else
4345  return vint4(-1);
4346 #endif
4347 }
4348 
4349 
4350 
4351 OIIO_FORCEINLINE const vint4 vint4::Iota (int start, int step) {
4352  return vint4 (start+0*step, start+1*step, start+2*step, start+3*step);
4353 }
4354 
4355 
4357  return vint4 (1<<0, 1<<1, 1<<2, 1<<3);
4358 }
4359 
4360 
4362 #if OIIO_SIMD_SSE
4363  return _mm_add_epi32 (a.simd(), b.simd());
4364 #else
4365  SIMD_RETURN (vint4, a[i] + b[i]);
4366 #endif
4367 }
4368 
4370  return a = a + b;
4371 }
4372 
4373 
4375 #if OIIO_SIMD_SSE
4376  return _mm_sub_epi32 (_mm_setzero_si128(), a);
4377 #else
4378  SIMD_RETURN (vint4, -a[i]);
4379 #endif
4380 }
4381 
4382 
4384 #if OIIO_SIMD_SSE
4385  return _mm_sub_epi32 (a.simd(), b.simd());
4386 #else
4387  SIMD_RETURN (vint4, a[i] - b[i]);
4388 #endif
4389 }
4390 
4391 
4393  return a = a - b;
4394 }
4395 
4396 
#if OIIO_SIMD_SSE
// Shamelessly lifted from Syrah which lifted from Manta which lifted it
// from intel.com
// Lane-wise 32-bit multiply, keeping the low 32 bits of each product.
OIIO_FORCEINLINE __m128i mul_epi32 (__m128i a, __m128i b) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_mullo_epi32(a, b);
#else
    // Prior to SSE 4.1, there is no _mm_mullo_epi32 instruction, so we have
    // to fake it.
    // _mm_mul_epu32 multiplies only lanes 0 and 2 (64-bit products), so:
    // multiply even lanes, swap adjacent lanes (shuffle 0xB1) and multiply
    // again for the odd lanes, compact the low halves (shuffle 0xD8), and
    // interleave the two results back into lane order.
    __m128i t0;
    __m128i t1;
    t0 = _mm_mul_epu32 (a, b);
    t1 = _mm_mul_epu32 (_mm_shuffle_epi32 (a, 0xB1),
                        _mm_shuffle_epi32 (b, 0xB1));
    t0 = _mm_shuffle_epi32 (t0, 0xD8);
    t1 = _mm_shuffle_epi32 (t1, 0xD8);
    return _mm_unpacklo_epi32 (t0, t1);
#endif
}
#endif
4417 
4418 
4420 #if OIIO_SIMD_SSE
4421  return mul_epi32 (a.simd(), b.simd());
4422 #else
4423  SIMD_RETURN (vint4, a[i] * b[i]);
4424 #endif
4425 }
4426 
4427 
// In-place multiply by another vint4, or by a scalar (broadcast).
OIIO_FORCEINLINE const vint4& operator*= (vint4& a, const vint4& b) { return a = a * b; }
OIIO_FORCEINLINE const vint4& operator*= (vint4& a, int b) { return a = a * b; }
4430 
4431 
4433  // NO INTEGER DIVISION IN SSE!
4434  SIMD_RETURN (vint4, a[i] / b[i]);
4435 }
4436 
4437 
// In-place lane-wise division.
OIIO_FORCEINLINE const vint4& operator/= (vint4& a, const vint4& b) { return a = a / b; }
4439 
4441  // NO INTEGER MODULUS IN SSE!
4442  SIMD_RETURN (vint4, a[i] % b[i]);
4443 }
4444 
4445 
4446 
// In-place lane-wise modulus.
OIIO_FORCEINLINE const vint4& operator%= (vint4& a, const vint4& b) { return a = a % b; }
4448 
4449 
4451  // NO INTEGER MODULUS in SSE!
4452  SIMD_RETURN (vint4, a[i] % w);
4453 }
4454 
4455 
// In-place modulus by a scalar (broadcast).
OIIO_FORCEINLINE const vint4& operator%= (vint4& a, int b) { return a = a % b; }
4457 
4458 
4460 #if OIIO_SIMD_SSE
4461  return _mm_and_si128 (a.simd(), b.simd());
4462 #elif OIIO_SIMD_NEON
4463  return vandq_s32(a.simd(), b.simd());
4464 #else
4465  SIMD_RETURN (vint4, a[i] & b[i]);
4466 #endif
4467 }
4468 
4469 
// In-place bitwise AND.
OIIO_FORCEINLINE const vint4& operator&= (vint4& a, const vint4& b) { return a = a & b; }
4471 
4472 
4473 
4475 #if OIIO_SIMD_SSE
4476  return _mm_or_si128 (a.simd(), b.simd());
4477 #elif OIIO_SIMD_NEON
4478  return vorrq_s32(a.simd(), b.simd());
4479 #else
4480  SIMD_RETURN (vint4, a[i] | b[i]);
4481 #endif
4482 }
4483 
// In-place bitwise OR.
OIIO_FORCEINLINE const vint4& operator|= (vint4& a, const vint4& b) { return a = a | b; }
4485 
4486 
4488 #if OIIO_SIMD_SSE
4489  return _mm_xor_si128 (a.simd(), b.simd());
4490 #elif OIIO_SIMD_NEON
4491  return veorq_s32(a.simd(), b.simd());
4492 #else
4493  SIMD_RETURN (vint4, a[i] ^ b[i]);
4494 #endif
4495 }
4496 
4497 
// In-place bitwise XOR.
OIIO_FORCEINLINE const vint4& operator^= (vint4& a, const vint4& b) { return a = a ^ b; }
4499 
4500 
4502 #if OIIO_SIMD_SSE
4503  return a ^ a.NegOne();
4504 #elif OIIO_SIMD_NEON
4505  return vmvnq_s32(a.m_simd);
4506 #else
4507  SIMD_RETURN (vint4, ~a[i]);
4508 #endif
4509 }
4510 
4511 OIIO_FORCEINLINE vint4 operator<< (const vint4& a, unsigned int bits) {
4512 #if OIIO_SIMD_SSE
4513  return _mm_slli_epi32 (a, bits);
4514 #else
4515  SIMD_RETURN (vint4, a[i] << bits);
4516 #endif
4517 }
4518 
4519 OIIO_FORCEINLINE const vint4& operator<<= (vint4& a, const unsigned int bits) {
4520  return a = a << bits;
4521 }
4522 
4523 
// Arithmetic (sign-preserving) right shift of every 32-bit lane by `bits`;
// _mm_srai_epi32 shifts in copies of the sign bit.
OIIO_FORCEINLINE vint4 operator>> (const vint4& a, const unsigned int bits) {
#if OIIO_SIMD_SSE
    return _mm_srai_epi32 (a, bits);
#else
    SIMD_RETURN (vint4, a[i] >> bits);
#endif
}
4531 
4532 OIIO_FORCEINLINE const vint4& operator>>= (vint4& a, const unsigned int bits) {
4533  return a = a >> bits;
4534 }
4535 
4536 
// Logical (zero-fill) right shift of every 32-bit lane by `bits`.
// The scalar fallback casts through unsigned to force zero-fill semantics.
OIIO_FORCEINLINE vint4 srl (const vint4& a, const unsigned int bits) {
#if OIIO_SIMD_SSE
    return _mm_srli_epi32 (a, bits);
#else
    SIMD_RETURN (vint4, int ((unsigned int)(a[i]) >> bits));
#endif
}
4544 
4545 
4547 #if OIIO_SIMD_SSE
4548  return _mm_castsi128_ps(_mm_cmpeq_epi32 (a, b));
4549 #elif OIIO_SIMD_NEON
4550  return vceqq_s32 (a.m_simd, b.m_simd);
4551 #else
4552  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
4553 #endif
4554 }
4555 
4557  return ! (a == b);
4558 }
4559 
4560 
4562 #if OIIO_SIMD_SSE
4563  return _mm_castsi128_ps(_mm_cmpgt_epi32 (a, b));
4564 #else
4565  SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
4566 #endif
4567 }
4568 
4570 #if OIIO_SIMD_SSE
4571  return _mm_castsi128_ps(_mm_cmplt_epi32 (a, b));
4572 #else
4573  SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
4574 #endif
4575 }
4576 
4578  return (b < a) | (a == b);
4579 }
4580 
4582  return (b > a) | (a == b);
4583 }
4584 
4585 inline std::ostream& operator<< (std::ostream& cout, const vint4& val) {
4586  cout << val[0];
4587  for (int i = 1; i < val.elements; ++i)
4588  cout << ' ' << val[i];
4589  return cout;
4590 }
4591 
4592 
// Store only the first n lanes (0 <= n <= elements) to `values`; memory
// beyond values[n-1] is left untouched.
OIIO_FORCEINLINE void vint4::store (int *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm_mask_storeu_epi32 (values, __mmask8(~(0xf << n)), m_simd);
#elif OIIO_SIMD
    // For full SIMD, there is a speed advantage to storing all components.
    if (n == elements)
        store (values);
    else
        for (int i = 0; i < n; ++i)
            values[i] = m_val[i];
#else
    for (int i = 0; i < n; ++i)
        values[i] = m_val[i];
#endif
}
4612 
4613 
4614 
// Narrow each 32-bit lane to 16 bits and store 4 contiguous uint16 values.
// NOTE(review): "clamped" below is really a truncation -- the & 0xffff mask
// keeps the low 16 bits; values outside [0,65535] wrap rather than saturate.
OIIO_FORCEINLINE void vint4::store (unsigned short *values) const {
#if OIIO_AVX512VL_ENABLED
    _mm_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xf), m_simd);
#elif OIIO_SIMD_SSE
    // Expressed as half-words and considering little endianness, we
    // currently have AxBxCxDx (the 'x' means don't care).
    vint4 clamped = m_simd & vint4(0xffff); // A0B0C0D0
    vint4 low = _mm_shufflelo_epi16 (clamped, (0<<0) | (2<<2) | (1<<4) | (1<<6));
    // low = AB00xxxx
    vint4 high = _mm_shufflehi_epi16 (clamped, (1<<0) | (1<<2) | (0<<4) | (2<<6));
    // high = xxxx00CD
    vint4 highswapped = shuffle_sse<2,3,0,1>(high); // 00CDxxxx
    vint4 result = low | highswapped; // ABCDxxxx
    // Store only the low 8 bytes (the 4 packed uint16 values).
    _mm_storel_pd ((double *)values, _mm_castsi128_pd(result));
    // At this point, values[] should hold A,B,C,D
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
4634 
4635 
4636 
// Narrow each 32-bit lane to 8 bits and store 4 contiguous uint8 values.
// The & 0xff mask truncates (wraps) rather than saturates.
OIIO_FORCEINLINE void vint4::store (unsigned char *values) const {
#if OIIO_AVX512VL_ENABLED
    _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd);
#elif OIIO_SIMD_SSE
    // Expressed as bytes and considering little endianness, we
    // currently have AxBxCxDx (the 'x' means don't care).
    vint4 clamped = m_simd & vint4(0xff); // A000 B000 C000 D000
    vint4 swapped = shuffle_sse<1,0,3,2>(clamped); // B000 A000 D000 C000
    vint4 shifted = swapped << 8; // 0B00 0A00 0D00 0C00
    vint4 merged = clamped | shifted; // AB00 xxxx CD00 xxxx
    vint4 merged2 = shuffle_sse<2,2,2,2>(merged); // CD00 ...
    vint4 shifted2 = merged2 << 16; // 00CD ...
    vint4 result = merged | shifted2; // ABCD ...
    // NOTE(review): this int-sized write assumes `values` may be written 4
    // bytes at a time (x86 tolerates the misalignment) -- confirm callers
    // always pass a buffer of at least 4 bytes.
    *(int*)values = result[0]; //extract<0>(result);
    // At this point, values[] should hold A,B,C,D
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
4656 
4657 
4658 
4659 
4660 template<int i0, int i1, int i2, int i3>
4662 #if OIIO_SIMD_SSE
4663  return shuffle_sse<i0,i1,i2,i3> (__m128i(a));
4664 #else
4665  return vint4(a[i0], a[i1], a[i2], a[i3]);
4666 #endif
4667 }
4668 
// Broadcast lane i of a to all four lanes (single-index convenience form).
template<int i> OIIO_FORCEINLINE vint4 shuffle (const vint4& a) { return shuffle<i,i,i,i>(a); }
4670 
4671 
4672 template<int i>
4674 #if OIIO_SIMD_SSE >= 4
4675  return _mm_extract_epi32(v.simd(), i); // SSE4.1 only
4676 #else
4677  return v[i];
4678 #endif
4679 }
4680 
#if OIIO_SIMD_SSE
// Specialization: lane 0 can be read with a single movd (_mm_cvtsi128_si32)
// instead of the general extract path, and needs only SSE2.
template<> OIIO_FORCEINLINE int extract<0> (const vint4& v) {
    return _mm_cvtsi128_si32(v.simd());
}
#endif
4686 
4687 template<int i>
4689 #if OIIO_SIMD_SSE >= 4
4690  return _mm_insert_epi32 (a.simd(), val, i);
4691 #else
4692  vint4 tmp = a;
4693  tmp[i] = val;
4694  return tmp;
4695 #endif
4696 }
4697 
4698 
4699 
// Named lane accessors/mutators, routed through extract/insert so the SSE
// builds can use the dedicated single-lane intrinsics.
OIIO_FORCEINLINE int vint4::x () const { return extract<0>(*this); }
OIIO_FORCEINLINE int vint4::y () const { return extract<1>(*this); }
OIIO_FORCEINLINE int vint4::z () const { return extract<2>(*this); }
OIIO_FORCEINLINE int vint4::w () const { return extract<3>(*this); }
OIIO_FORCEINLINE void vint4::set_x (int val) { *this = insert<0>(*this, val); }
OIIO_FORCEINLINE void vint4::set_y (int val) { *this = insert<1>(*this, val); }
OIIO_FORCEINLINE void vint4::set_z (int val) { *this = insert<2>(*this, val); }
OIIO_FORCEINLINE void vint4::set_w (int val) { *this = insert<3>(*this, val); }
4708 
4709 
4711 {
4712 #if OIIO_SIMD_SSE
4713  return _mm_castps_si128 (x.simd());
4714 #else
4715  return *(vint4 *)&x;
4716 #endif
4717 }
4718 
4719 // Old names: (DEPRECATED 1.8)
4720 OIIO_DEPRECATED("use bitcast_to_int() (1.8)")
4721 inline vint4 bitcast_to_int4 (const vbool4& x) { return bitcast_to_int(x); }
4722 
4723 
4725 #if OIIO_SIMD_SSE >= 3
4726  // People seem to agree that SSE3 does add reduction best with 2
4727  // horizontal adds.
4728  // suppose v = (a, b, c, d)
4729  simd::vint4 ab_cd = _mm_hadd_epi32 (v.simd(), v.simd());
4730  // ab_cd = (a+b, c+d, a+b, c+d)
4731  simd::vint4 abcd = _mm_hadd_epi32 (ab_cd.simd(), ab_cd.simd());
4732  // all abcd elements are a+b+c+d, return an element as fast as possible
4733  return abcd;
4734 #elif OIIO_SIMD_SSE >= 2
4735  // I think this is the best we can do for SSE2, and I'm still not sure
4736  // it's faster than the default scalar operation. But anyway...
4737  // suppose v = (a, b, c, d)
4738  vint4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
4739  // ab_ab_cd_cd = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
4740  vint4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
4741  // cd_cd_ab_ab = (c+d,c+d,a+b,a+b)
4742  vint4 abcd = ab_ab_cd_cd + cd_cd_ab_ab; // a+b+c+d in all components
4743  return abcd;
4744 #else
4745  return vint4(reduce_add(v));
4746 #endif
4747 }
4748 
4749 
4751 #if OIIO_SIMD_SSE
4752  return extract<0> (vreduce_add(v));
4753 #elif OIIO_SIMD_NEON && defined(__aarch64__)
4754  return vaddvq_s32(v);
4755 #else
4756  SIMD_RETURN_REDUCE (int, 0, r += v[i]);
4757 #endif
4758 }
4759 
4760 
4762 #if OIIO_SIMD_SSE
4763  vint4 ab = v & shuffle<1,1,3,3>(v); // ab bb cd dd
4764  vint4 abcd = ab & shuffle<2>(ab);
4765  return extract<0>(abcd);
4766 #else
4767  SIMD_RETURN_REDUCE (int, -1, r &= v[i]);
4768 #endif
4769 }
4770 
4771 
4773 #if OIIO_SIMD_SSE
4774  vint4 ab = v | shuffle<1,1,3,3>(v); // ab bb cd dd
4775  vint4 abcd = ab | shuffle<2>(ab);
4776  return extract<0>(abcd);
4777 #else
4778  SIMD_RETURN_REDUCE (int, 0, r |= v[i]);
4779 #endif
4780 }
4781 
4782 
4783 
// Per-lane select: result[i] = mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vint4 blend (const vint4& a, const vint4& b, const vbool4& mask) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps(a.simd()),
                                            _mm_castsi128_ps(b.simd()), mask));
#elif OIIO_SIMD_SSE
    // Pre-SSE4.1: synthesize blendv as (mask & b) | (~mask & a).
    return _mm_or_si128 (_mm_and_si128(_mm_castps_si128(mask.simd()), b.simd()),
                         _mm_andnot_si128(_mm_castps_si128(mask.simd()), a.simd()));
#elif OIIO_SIMD_NEON
    return vbslq_s32 (mask.simd(), b.simd(), a.simd());
#else
    SIMD_RETURN (vint4, mask[i] ? b[i] : a[i]);
#endif
}
4797 
4799 #if OIIO_SIMD_SSE
4800  return _mm_and_si128(_mm_castps_si128(mask), a.simd());
4801 #else
4802  SIMD_RETURN (vint4, mask[i] ? a[i] : 0.0f);
4803 #endif
4804 }
4805 
4806 
4808 #if OIIO_SIMD_SSE
4809  return _mm_andnot_si128(_mm_castps_si128(mask), a.simd());
4810 #else
4811  SIMD_RETURN (vint4, mask[i] ? 0.0f : a[i]);
4812 #endif
4813 }
4814 
4815 
// Per-lane select: result[i] = mask[i] ? a[i] : b[i].
// Note the operand order is the reverse of blend's.
OIIO_FORCEINLINE vint4 select (const vbool4& mask, const vint4& a, const vint4& b) {
    return blend (b, a, mask);
}
4819 
4820 
4821 
4823 #if OIIO_SIMD_SSE >= 3
4824  return _mm_abs_epi32(a.simd());
4825 #elif OIIO_SIMD_NEON
4826  return vabsq_s32(a.simd());
4827 #else
4828  SIMD_RETURN (vint4, std::abs(a[i]));
4829 #endif
4830 }
4831 
4832 
4833 
// Per-lane signed minimum of a and b.
OIIO_FORCEINLINE vint4 min (const vint4& a, const vint4& b) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_min_epi32 (a, b);
#elif OIIO_SIMD_NEON
    return vminq_s32(a, b);
#else
    SIMD_RETURN (vint4, std::min(a[i], b[i]));
#endif
}
4843 
4844 
// Per-lane signed maximum of a and b.
OIIO_FORCEINLINE vint4 max (const vint4& a, const vint4& b) {
#if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
    return _mm_max_epi32 (a, b);
#elif OIIO_SIMD_NEON
    return vmaxq_s32(a, b);
#else
    SIMD_RETURN (vint4, std::max(a[i], b[i]));
#endif
}
4854 
4855 
4857 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
4858  // return _mm_rol_epi32 (x, s);
4859  // We want to do this ^^^ but this intrinsic only takes an *immediate*
4860  // argument for s, and there isn't a way to express in C++ that a
4861  // parameter must be an immediate/literal value from the caller.
4862  return (x<<s) | srl(x,32-s);
4863 #else
4864  return (x<<s) | srl(x,32-s);
4865 #endif
4866 }
4867 
4868 // DEPRECATED (2.1)
// Deprecated alias for rotl(); kept for source compatibility with pre-2.1
// callers. Prefer rotl().
OIIO_FORCEINLINE vint4 rotl32 (const vint4& x, const unsigned int k) {
    return rotl(x, k);
}
4872 
4873 
// Per-lane (~a) & b, matching the operand order of _mm_andnot_si128
// (the FIRST argument is the one complemented).
OIIO_FORCEINLINE vint4 andnot (const vint4& a, const vint4& b) {
#if OIIO_SIMD_SSE
    return _mm_andnot_si128 (a.simd(), b.simd());
#else
    SIMD_RETURN (vint4, ~(a[i]) & b[i]);
#endif
}
4881 
4882 
4883 // Implementation had to be after the definition of vint4::Zero.
4885  m_simd = (ival != vint4::Zero());
4886 }
4887 
4888 
4889 
4891  // NO INTEGER MODULUS IN SSE!
4892  SIMD_RETURN (vint4, b[i] ? a[i] % b[i] : 0);
4893 }
4894 
4896  return b ? (a % b) : vint4::Zero();
4897 }
4898 
4899 
4900 
4901 
4902 //////////////////////////////////////////////////////////////////////
4903 // vint8 implementation
4904 
4906  m_simd = other.m_simd;
4907  return *this;
4908 }
4909 
4912  return m_val[i];
4913 }
4914 
4917  return m_val[i];
4918 }
4919 
4922  m_val[i] = val;
4923 }
4924 
4925 
4927 #if OIIO_SIMD_AVX
4928  m_simd = _mm256_set1_epi32 (a);
4929 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4930  m_4[0].load(a);
4931  m_4[1].load(a);
4932 #else
4933  SIMD_CONSTRUCT (a);
4934 #endif
4935 }
4936 
4937 
// Load eight explicit lane values a..h (a becomes lane 0).
// _mm256_set_epi32 takes its arguments highest-lane-first, hence the
// reversed order.
OIIO_FORCEINLINE void vint8::load (int a, int b, int c, int d,
                                   int e, int f, int g, int h) {
#if OIIO_SIMD_AVX
    m_simd = _mm256_set_epi32 (h, g, f, e, d, c, b, a);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(a, b, c, d);
    m_4[1].load(e, f, g, h);
#else
    m_val[0] = a;
    m_val[1] = b;
    m_val[2] = c;
    m_val[3] = d;
    m_val[4] = e;
    m_val[5] = f;
    m_val[6] = g;
    m_val[7] = h;
#endif
}
4956 
4957 
4959 #if OIIO_SIMD_AVX
4960  m_simd = _mm256_loadu_si256 ((const simd_t *)values);
4961 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
4962  m_4[0].load(values);
4963  m_4[1].load(values+4);
4964 #else
4965  SIMD_CONSTRUCT (values[i]);
4966 #endif
4967 }
4968 
4969 
// Load the first n ints from `values` (0 <= n <= elements); the remaining
// lanes are set to zero.
OIIO_FORCEINLINE void vint8::load (const int *values, int n)
{
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    m_simd = _mm256_maskz_loadu_epi32 ((~(0xff << n)), values);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    // Split into two 4-wide partial loads (vint4::load zero-fills the tail).
    if (n > 4) {
        vint4 lo, hi;
        lo.load (values);
        hi.load (values+4, n-4);
        m_4[0] = lo;
        m_4[1] = hi;
    } else {
        vint4 lo, hi;
        lo.load (values, n);
        hi.clear();
        m_4[0] = lo;
        m_4[1] = hi;
    }
#else
    for (int i = 0; i < n; ++i)
        m_val[i] = values[i];
    for (int i = n; i < elements; ++i)
        m_val[i] = 0;
#endif
}
4996 
4997 
// Load 8 signed 16-bit values, sign-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint8::load (const short *values) {
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm256_cvtepi16_epi32(_mm_loadu_si128((__m128i*)values));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(values);
    m_4[1].load(values+4);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
5008 
// Load 8 unsigned 16-bit values, zero-extending each to a 32-bit lane.
OIIO_FORCEINLINE void vint8::load (const unsigned short *values) {
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)values));
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(values);
    m_4[1].load(values+4);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
5019 
5020 
5022 #if OIIO_SIMD_AVX >= 2
5023  __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
5024  m_simd = _mm256_cvtepi8_epi32 (bytes);
5025 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5026  m_4[0].load(values);
5027  m_4[1].load(values+4);
5028 #else
5029  SIMD_CONSTRUCT (values[i]);
5030 #endif
5031 }
5032 
// Load 8 unsigned bytes, zero-extending each to a 32-bit lane.
// The _mm_load_sd trick reads exactly the 8 source bytes as one 64-bit load.
OIIO_FORCEINLINE void vint8::load (const unsigned char *values) {
#if OIIO_SIMD_AVX >= 2
    __m128i bytes = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
    m_simd = _mm256_cvtepu8_epi32 (bytes);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].load(values);
    m_4[1].load(values+4);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
5044 
5045 
5046 
5048 
// Construct from eight explicit lane values (a is lane 0).
OIIO_FORCEINLINE vint8::vint8 (int a, int b, int c, int d,
                               int e, int f, int g, int h) {
    load(a,b,c,d,e,f,g,h);
}
5053 
// Converting constructors: each loads 8 contiguous values of the given
// element type, widening to 32-bit lanes via the matching load() overload.
OIIO_FORCEINLINE vint8::vint8 (const int *vals) { load (vals); }
OIIO_FORCEINLINE vint8::vint8 (const unsigned short *vals) { load(vals); }
OIIO_FORCEINLINE vint8::vint8 (const short *vals) { load(vals); }
OIIO_FORCEINLINE vint8::vint8 (const unsigned char *vals) { load(vals); }
OIIO_FORCEINLINE vint8::vint8 (const char *vals) { load(vals); }
5059 
// Assign the scalar a to all eight lanes.
OIIO_FORCEINLINE const vint8 & vint8::operator= (int a) { load(a); return *this; }
5061 
5062 
5064 #if OIIO_SIMD_AVX
5065  // Use an unaligned store -- it's just as fast when the memory turns
5066  // out to be aligned, nearly as fast even when unaligned. Not worth
5067  // the headache of using stores that require alignment.
5068  _mm256_storeu_si256 ((simd_t *)values, m_simd);
5069 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5070  m_4[0].store(values);
5071  m_4[1].store(values+4);
5072 #else
5073  SIMD_DO (values[i] = m_val[i]);
5074 #endif
5075 }
5076 
5077 
5079 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5080  m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask), (const simd_t *)values);
5081 #elif OIIO_SIMD_AVX >= 2
5082  m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)));
5083 #else
5084  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0);
5085 #endif
5086 }
5087 
5088 
5090 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5091  m_simd = _mm256_maskz_loadu_epi32 (__mmask8(mask.bitmask()), (const simd_t *)values);
5092 #elif OIIO_SIMD_AVX >= 2
5093  m_simd = _mm256_maskload_epi32 (values, _mm256_castps_si256(mask));
5094 #else
5095  SIMD_CONSTRUCT (mask[i] ? values[i] : 0);
5096 #endif
5097 }
5098 
5099 
5101 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5102  _mm256_mask_storeu_epi32 (values, __mmask8(mask), m_simd);
5103 #elif OIIO_SIMD_AVX >= 2
5104  _mm256_maskstore_epi32 (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd);
5105 #else
5106  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
5107 #endif
5108 }
5109 
5110 
5112 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5113  _mm256_mask_storeu_epi32 (values, __mmask8(mask.bitmask()), m_simd);
5114 #elif OIIO_SIMD_AVX >= 2
5115  _mm256_maskstore_epi32 (values, _mm256_castps_si256(mask), m_simd);
5116 #else
5117  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
5118 #endif
5119 }
5120 
5121 
// Gather 8 ints: lane i is read from byte offset vindex[i]*scale past
// baseptr. `scale` must be a compile-time constant (intrinsic requirement).
template <int scale>
OIIO_FORCEINLINE void
vint8::gather (const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    m_simd = _mm256_i32gather_epi32 (baseptr, vindex, scale);
#else
    SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
#endif
}
5132 
5133 template<int scale>
5134 OIIO_FORCEINLINE void
5135 vint8::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
5136 {
5137 #if OIIO_SIMD_AVX >= 2
5138  m_simd = _mm256_mask_i32gather_epi32 (m_simd, baseptr, vindex, _mm256_cvtps_epi32(mask), scale);
5139 #else
5140  SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
5141 #endif
5142 }
5143 
// Scatter 8 ints: lane i is written to byte offset vindex[i]*scale past
// baseptr. Hardware scatter requires AVX-512VL; otherwise a scalar loop.
template<int scale>
OIIO_FORCEINLINE void
vint8::scatter (value_t *baseptr, const vint_t& vindex) const
{
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    _mm256_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
#else
    SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}
5154 
5155 template<int scale>
5156 OIIO_FORCEINLINE void
5158  const vint_t& vindex) const
5159 {
5160 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5161  _mm256_mask_i32scatter_epi32 (baseptr, mask.bitmask(), vindex, m_simd, scale);
5162 #else
5163  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
5164 #endif
5165 }
5166 
5167 
5169 #if OIIO_SIMD_AVX
5170  m_simd = _mm256_setzero_si256();
5171 #else
5172  *this = 0;
5173 #endif
5174 }
5175 
5176 
5178 #if OIIO_SIMD_AVX
5179  return _mm256_setzero_si256();
5180 #else
5181  return 0;
5182 #endif
5183 }
5184 
// All lanes set to 1.
OIIO_FORCEINLINE const vint8 vint8::One () { return vint8(1); }
5186 
// All lanes set to -1 (i.e., all bits set).
OIIO_FORCEINLINE const vint8 vint8::NegOne () { return vint8(-1); }
5188 
5189 
// Arithmetic sequence: lane i = start + i*step.
OIIO_FORCEINLINE const vint8 vint8::Iota (int start, int step) {
    return vint8 (start+0*step, start+1*step, start+2*step, start+3*step,
                  start+4*step, start+5*step, start+6*step, start+7*step);
}
5194 
5195 
5197  return vint8 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7);
5198 }
5199 
5200 
5202 #if OIIO_SIMD_AVX
5203  return _mm256_castsi256_si128 (simd());
5204 #else
5205  return m_4[0];
5206 #endif
5207 }
5208 
5210 #if OIIO_SIMD_AVX
5211  return _mm256_extractf128_si256 (simd(), 1);
5212 #else
5213  return m_4[1];
5214 #endif
5215 }
5216 
5217 
// Construct an 8-wide vector from two 4-wide halves (lo = lanes 0..3).
OIIO_FORCEINLINE vint8::vint8 (const vint4& lo, const vint4 &hi) {
#if OIIO_SIMD_AVX
    __m256i r = _mm256_castsi128_si256 (lo);
    m_simd = _mm256_insertf128_si256 (r, hi, 1);
    // N.B. equivalent, if available: m_simd = _mm256_set_m128i (hi, lo);
    // FIXME: when would this not be available?
#else
    m_4[0] = lo;
    m_4[1] = hi;
#endif
}
5229 
5230 
5232 #if OIIO_SIMD_AVX >= 2
5233  return _mm256_add_epi32 (a.simd(), b.simd());
5234 #else
5235  SIMD_RETURN (vint8, a[i] + b[i]);
5236 #endif
5237 }
5238 
5239 
5241  return a = a + b;
5242 }
5243 
5244 
5246 #if OIIO_SIMD_AVX >= 2
5247  return _mm256_sub_epi32 (_mm256_setzero_si256(), a);
5248 #else
5249  SIMD_RETURN (vint8, -a[i]);
5250 #endif
5251 }
5252 
5253 
5255 #if OIIO_SIMD_AVX >= 2
5256  return _mm256_sub_epi32 (a.simd(), b.simd());
5257 #else
5258  SIMD_RETURN (vint8, a[i] - b[i]);
5259 #endif
5260 }
5261 
5262 
5264  return a = a - b;
5265 }
5266 
5267 
5269 #if OIIO_SIMD_AVX >= 2
5270  return _mm256_mullo_epi32 (a.simd(), b.simd());
5271 #else
5272  SIMD_RETURN (vint8, a[i] * b[i]);
5273 #endif
5274 }
5275 
5276 
// In-place lane-wise multiply, by a vector or by a scalar broadcast.
OIIO_FORCEINLINE const vint8& operator*= (vint8& a, const vint8& b) { return a = a * b; }
OIIO_FORCEINLINE const vint8& operator*= (vint8& a, int b) { return a = a * b; }
5279 
5280 
5282  // NO INTEGER DIVISION IN SSE or AVX!
5283  SIMD_RETURN (vint8, a[i] / b[i]);
5284 }
5285 
5286 OIIO_FORCEINLINE const vint8& operator/= (vint8& a, const vint8& b) { return a = a / b; }
5287 
5288 
5290  // NO INTEGER MODULUS IN SSE or AVX!
5291  SIMD_RETURN (vint8, a[i] % b[i]);
5292 }
5293 
5294 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, const vint8& b) { return a = a % b; }
5295 
5297  // NO INTEGER MODULUS in SSE or AVX!
5298  SIMD_RETURN (vint8, a[i] % w);
5299 }
5300 
5301 OIIO_FORCEINLINE const vint8& operator%= (vint8& a, int b) { return a = a % b; }
5302 
5303 
5305 #if OIIO_SIMD_AVX >= 2
5306  return _mm256_and_si256 (a.simd(), b.simd());
5307 #else
5308  SIMD_RETURN (vint8, a[i] & b[i]);
5309 #endif
5310 }
5311 
5312 OIIO_FORCEINLINE const vint8& operator&= (vint8& a, const vint8& b) { return a = a & b; }
5313 
5315 #if OIIO_SIMD_AVX >= 2
5316  return _mm256_or_si256 (a.simd(), b.simd());
5317 #else
5318  SIMD_RETURN (vint8, a[i] | b[i]);
5319 #endif
5320 }
5321 
5322 OIIO_FORCEINLINE const vint8& operator|= (vint8& a, const vint8& b) { return a = a | b; }
5323 
5325 #if OIIO_SIMD_AVX >= 2
5326  return _mm256_xor_si256 (a.simd(), b.simd());
5327 #else
5328  SIMD_RETURN (vint8, a[i] ^ b[i]);
5329 #endif
5330 }
5331 
5332 OIIO_FORCEINLINE const vint8& operator^= (vint8& a, const vint8& b) { return a = a ^ b; }
5333 
5334 
5336 #if OIIO_SIMD_AVX >= 2
5337  return a ^ a.NegOne();
5338 #else
5339  SIMD_RETURN (vint8, ~a[i]);
5340 #endif
5341 }
5342 
5343 
// Left-shift every 32-bit lane of a by `bits`. With only SSE available,
// shift the two 4-wide halves independently.
OIIO_FORCEINLINE vint8 operator<< (const vint8& a, unsigned int bits) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_slli_epi32 (a, bits);
#elif OIIO_SIMD_SSE
    return vint8 (a.lo() << bits, a.hi() << bits);
#else
    SIMD_RETURN (vint8, a[i] << bits);
#endif
}
5353 
5354 
5355 OIIO_FORCEINLINE const vint8& operator<<= (vint8& a, const unsigned int bits) {
5356  return a = a << bits;
5357 }
5358 
// Arithmetic (sign-preserving) right shift of every 32-bit lane by `bits`.
OIIO_FORCEINLINE vint8 operator>> (const vint8& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_srai_epi32 (a, bits);
#elif OIIO_SIMD_SSE
    return vint8 (a.lo() >> bits, a.hi() >> bits);
#else
    SIMD_RETURN (vint8, a[i] >> bits);
#endif
}
5368 
5369 OIIO_FORCEINLINE const vint8& operator>>= (vint8& a, const unsigned int bits) {
5370  return a = a >> bits;
5371 }
5372 
5373 
// Logical (zero-fill) right shift of every 32-bit lane by `bits`.
OIIO_FORCEINLINE vint8 srl (const vint8& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_srli_epi32 (a, bits);
#else
    SIMD_RETURN (vint8, int ((unsigned int)(a[i]) >> bits));
#endif
}
5381 
5382 
5384  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5385 #if OIIO_SIMD_AVX >= 2
5386  return _mm256_castsi256_ps(_mm256_cmpeq_epi32 (a.m_simd, b.m_simd));
5387 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */
5388  return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
5389 #else
5390  SIMD_RETURN (vbool8, a[i] == b[i] ? -1 : 0);
5391 #endif
5392 }
5393 
5394 
5396  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5397  return ! (a == b);
5398 }
5399 
5400 
5402  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5403 #if OIIO_SIMD_AVX >= 2
5404  return _mm256_castsi256_ps(_mm256_cmpgt_epi32 (a, b));
5405 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */
5406  return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
5407 #else
5408  SIMD_RETURN (vbool8, a[i] > b[i] ? -1 : 0);
5409 #endif
5410 }
5411 
5412 
5414  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5415 #if OIIO_SIMD_AVX >= 2
5416  // No lt or lte!
5417  return (b > a);
5418 #elif OIIO_SIMD_SSE /* Fall back to 4-wide */
5419  return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
5420 #else
5421  SIMD_RETURN (vbool8, a[i] < b[i] ? -1 : 0);
5422 #endif
5423 }
5424 
5425 
5427  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5428  return (a > b) | (a == b);
5429 }
5430 
5431 
5433  // FIXME: on AVX-512 should we use _mm256_cmp_epi32_mask() ?
5434  return (b > a) | (a == b);
5435 }
5436 
5437 
5438 inline std::ostream& operator<< (std::ostream& cout, const vint8& val) {
5439  cout << val[0];
5440  for (int i = 1; i < val.elements; ++i)
5441  cout << ' ' << val[i];
5442  return cout;
5443 }
5444 
5445 
// Store only the first n lanes (0 <= n <= elements) to `values`; memory
// beyond values[n-1] is left untouched.
OIIO_FORCEINLINE void vint8::store (int *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm256_mask_storeu_epi32 (values, __mmask8(~(0xff << n)), m_simd);
#elif OIIO_SIMD_SSE
    // Split into half stores; each half can do a full 4-wide store when
    // entirely covered by n.
    if (n <= 4) {
        lo().store (values, n);
    } else if (n < 8) {
        lo().store (values);
        hi().store (values+4, n-4);
    } else {
        store (values);
    }
#else
    for (int i = 0; i < n; ++i)
        values[i] = m_val[i];
#endif
}
5467 
5468 
5469 // FIXME(AVX): fast vint8 store to unsigned short, unsigned char
5470 
// Narrow each 32-bit lane to 16 bits (truncation) and store 8 contiguous
// uint16 values; without AVX-512VL, delegate to the two 4-wide halves.
OIIO_FORCEINLINE void vint8::store (unsigned short *values) const {
#if OIIO_AVX512VL_ENABLED
    _mm256_mask_cvtepi32_storeu_epi16 (values, __mmask8(0xff), m_simd);
#elif OIIO_SIMD_SSE
    lo().store (values);
    hi().store (values+4);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
5481 
5482 
// Narrow each 32-bit lane to 8 bits (truncation) and store 8 contiguous
// uint8 values; without AVX-512VL, delegate to the two 4-wide halves.
OIIO_FORCEINLINE void vint8::store (unsigned char *values) const {
#if OIIO_AVX512VL_ENABLED
    _mm256_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xff), m_simd);
#elif OIIO_SIMD_SSE
    lo().store (values);
    hi().store (values+4);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
5493 
5494 
5495 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
5497 #if OIIO_SIMD_AVX >= 2
5498  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
5499  return _mm256_castps_si256 (_mm256_permutevar8x32_ps (_mm256_castsi256_ps(a.simd()), index.simd()));
5500 #else
5501  return vint8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
5502 #endif
5503 }
5504 
// Broadcast lane i of a to all eight lanes (single-index convenience form).
template<int i> OIIO_FORCEINLINE vint8 shuffle (const vint8& a) {
    return shuffle<i,i,i,i,i,i,i,i>(a);
}
5508 
5509 
5510 template<int i>
5512 #if OIIO_SIMD_AVX && !_WIN32
5513  return _mm256_extract_epi32(v.simd(), i);
5514 #else
5515  return v[i];
5516 #endif
5517 }
5518 
5519 
5520 template<int i>
5522 #if OIIO_SIMD_AVX && !_WIN32
5523  return _mm256_insert_epi32 (a.simd(), val, i);
5524 #else
5525  vint8 tmp = a;
5526  tmp[i] = val;
5527  return tmp;
5528 #endif
5529 }
5530 
5531 
// Named accessors/mutators for the first four lanes only, mirroring the
// vint4 interface; lanes 4..7 have no named accessors.
OIIO_FORCEINLINE int vint8::x () const { return extract<0>(*this); }
OIIO_FORCEINLINE int vint8::y () const { return extract<1>(*this); }
OIIO_FORCEINLINE int vint8::z () const { return extract<2>(*this); }
OIIO_FORCEINLINE int vint8::w () const { return extract<3>(*this); }
OIIO_FORCEINLINE void vint8::set_x (int val) { *this = insert<0>(*this, val); }
OIIO_FORCEINLINE void vint8::set_y (int val) { *this = insert<1>(*this, val); }
OIIO_FORCEINLINE void vint8::set_z (int val) { *this = insert<2>(*this, val); }
OIIO_FORCEINLINE void vint8::set_w (int val) { *this = insert<3>(*this, val); }
5540 
5541 
5543 {
5544 #if OIIO_SIMD_AVX
5545  return _mm256_castps_si256 (x.simd());
5546 #else
5547  return *(vint8 *)&x;
5548 #endif
5549 }
5550 
5551 
5553 #if OIIO_SIMD_AVX >= 2
5554  // From Syrah:
5555  vint8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_epi32(v.simd(), _mm256_setzero_si256());
5556  vint8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_epi32(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_si256());
5557  // get efgh in the 0-idx slot
5558  vint8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
5559  vint8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
5560  return shuffle<0>(final_sum);
5561 #elif OIIO_SIMD_SSE
5562  vint4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
5563  return vint8(hadd4, hadd4);
5564 #else
5565  return vint8(reduce_add(v));
5566 #endif
5567 }
5568 
5569 
5571 #if OIIO_SIMD_SSE
5572  return extract<0> (vreduce_add(v));
5573 #else
5574  return reduce_add(v.lo()) + reduce_add(v.hi());
5575 #endif
5576 }
5577 
5578 
5580 #if OIIO_SSE_AVX >= 2
5581  vint8 ab = v & shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh
5582  vint8 abcd = ab & shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x
5583  vint8 abcdefgh = abcd & shuffle<4>(abcdefgh); // abcdefgh x x x x x x x
5584  return extract<0> (abcdefgh);
5585 #else
5586  // AVX 1.0 or less -- use SSE
5587  return reduce_and(v.lo() & v.hi());
5588 #endif
5589 }
5590 
5591 
5593 #if OIIO_SSE_AVX >= 2
5594  vint8 ab = v | shuffle<1,1,3,3,5,5,7,7>(v); // ab bb cd dd ef ff gh hh
5595  vint8 abcd = ab | shuffle<2,2,2,2,6,6,6,6>(ab); // abcd x x x efgh x x x
5596  vint8 abcdefgh = abcd | shuffle<4>(abcdefgh); // abcdefgh x x x x x x x
5597  return extract<0> (abcdefgh);
5598 #else
5599  // AVX 1.0 or less -- use SSE
5600  return reduce_or(v.lo() | v.hi());
5601 #endif
5602 }
5603 
5604 
// Per-lane select: result[i] = mask[i] ? b[i] : a[i].
OIIO_FORCEINLINE vint8 blend (const vint8& a, const vint8& b, const vbool8& mask) {
#if OIIO_SIMD_AVX
    return _mm256_castps_si256 (_mm256_blendv_ps (_mm256_castsi256_ps(a.simd()),
                                                  _mm256_castsi256_ps(b.simd()), mask));
#elif OIIO_SIMD_SSE
    return vint8 (blend(a.lo(), b.lo(), mask.lo()),
                  blend(a.hi(), b.hi(), mask.hi()));
#else
    SIMD_RETURN (vint8, mask[i] ? b[i] : a[i]);
#endif
}
5616 
5617 
5619 // FIXME: More efficient for AVX-512 to use
5620 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(maxk),a))?
5621 #if OIIO_SIMD_AVX
5622  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a.simd()), mask));
5623 #elif OIIO_SIMD_SSE
5624  return vint8 (blend0(a.lo(), mask.lo()),
5625  blend0(a.hi(), mask.hi()));
5626 #else
5627  SIMD_RETURN (vint8, mask[i] ? a[i] : 0.0f);
5628 #endif
5629 }
5630 
5631 
5633 // FIXME: More efficient for AVX-512 to use
5634 // _mm256_maxkz_mov_epi32(_mm256_movemask_ps(!maxk),a))?
5635 #if OIIO_SIMD_AVX
5636  return _mm256_castps_si256 (_mm256_andnot_ps (mask.simd(), _mm256_castsi256_ps(a.simd())));
5637 #elif OIIO_SIMD_SSE
5638  return vint8 (blend0not(a.lo(), mask.lo()),
5639  blend0not(a.hi(), mask.hi()));
5640 #else
5641  SIMD_RETURN (vint8, mask[i] ? 0.0f : a[i]);
5642 #endif
5643 }
5644 
// Per-lane select: result[i] = mask[i] ? a[i] : b[i].
// Note the operand order is the reverse of blend's.
OIIO_FORCEINLINE vint8 select (const vbool8& mask, const vint8& a, const vint8& b) {
    return blend (b, a, mask);
}
5648 
5649 
5651 #if OIIO_SIMD_AVX >= 2
5652  return _mm256_abs_epi32(a.simd());
5653 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
5654  return vint8(abs(a.lo()), abs(a.hi()));
5655 #else
5656  SIMD_RETURN (vint8, std::abs(a[i]));
5657 #endif
5658 }
5659 
5660 
// Per-lane signed minimum; pre-AVX2 delegates to the two 4-wide halves.
OIIO_FORCEINLINE vint8 min (const vint8& a, const vint8& b) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_min_epi32 (a, b);
#else
    return vint8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
#endif
}
5668 
5669 
// Per-lane signed maximum; pre-AVX2 delegates to the two 4-wide halves.
OIIO_FORCEINLINE vint8 max (const vint8& a, const vint8& b) {
#if OIIO_SIMD_AVX >= 2
    return _mm256_max_epi32 (a, b);
#else
    return vint8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
#endif
}
5677 
5678 
5680 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
5681  // return _mm256_rol_epi32 (x, s);
5682  // We want to do this ^^^ but this intrinsic only takes an *immediate*
5683  // argument for s, and there isn't a way to express in C++ that a
5684  // parameter must be an immediate/literal value from the caller.
5685  return (x<<s) | srl(x,32-s);
5686 #else
5687  return (x<<s) | srl(x,32-s);
5688 #endif
5689 }
5690 
5691 // DEPRECATED (2.1)
// Deprecated alias for rotl(); kept for source compatibility with pre-2.1
// callers. Prefer rotl().
OIIO_FORCEINLINE vint8 rotl32 (const vint8& x, const unsigned int k) {
    return rotl(x, k);
}
5695 
5696 
5697 OIIO_FORCEINLINE vint8 andnot (const vint8& a, const vint8& b) {
5698 #if OIIO_SIMD_AVX >= 2
5699  return _mm256_andnot_si256 (a.simd(), b.simd());
5700 #elif OIIO_SIMD_AVX >= 1
5701  return _mm256_castps_si256 (_mm256_andnot_ps (_mm256_castsi256_ps(a.simd()), _mm256_castsi256_ps(b.simd())));
5702 #else
5703  SIMD_RETURN (vint8, ~(a[i]) & b[i]);
5704 #endif
5705 }
5706 
5707 
5708 // Implementation had to be after the definition of vint8::Zero.
5710  m_simd = (ival != vint8::Zero());
5711 }
5712 
5713 
5714 
5716  // NO INTEGER MODULUS IN SSE!
5717  SIMD_RETURN (vint8, b[i] ? a[i] % b[i] : 0);
5718 }
5719 
5721  return b ? (a % b) : vint8::Zero();
5722 }
5723 
5724 
5725 
5726 
5727 //////////////////////////////////////////////////////////////////////
5728 // vint16 implementation
5729 
// vint16 copy assignment. (Signature line not visible in this rendered view.)
    m_simd = other.m_simd;
    return *this;
}

// Component access, const. (Signature not visible in this view.)
    return m_val[i];
}

// Component access, mutable reference. (Signature not visible in this view.)
    return m_val[i];
}

// Set a single component to val. (Signature not visible in this view.)
    m_val[i] = val;
}


// Broadcast-load: set all 16 lanes to the scalar a.
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_set1_epi32 (a);
#else
    m_8[0].load (a);
    m_8[1].load (a);
#endif
}


// Load 16 explicit int values (v0 = lane 0 ... v15 = lane 15).
OIIO_FORCEINLINE void vint16::load (int v0, int v1, int v2, int v3,
                                    int v4, int v5, int v6, int v7,
                                    int v8, int v9, int v10, int v11,
                                    int v12, int v13, int v14, int v15) {
#if OIIO_SIMD_AVX >= 512
    // setr takes arguments in lane order (low to high).
    m_simd = _mm512_setr_epi32 (v0, v1, v2, v3, v4, v5, v6, v7,
                                v8, v9, v10, v11, v12, v13, v14, v15);
#else
    m_val[ 0] = v0;
    m_val[ 1] = v1;
    m_val[ 2] = v2;
    m_val[ 3] = v3;
    m_val[ 4] = v4;
    m_val[ 5] = v5;
    m_val[ 6] = v6;
    m_val[ 7] = v7;
    m_val[ 8] = v8;
    m_val[ 9] = v9;
    m_val[10] = v10;
    m_val[11] = v11;
    m_val[12] = v12;
    m_val[13] = v13;
    m_val[14] = v14;
    m_val[15] = v15;
#endif
}
5787 
5788 
// Load 16 ints from (possibly unaligned) memory.
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_loadu_si512 ((const simd_t *)values);
#else
    // Fall back to two 8-wide loads.
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}
5797 
5798 
5799 OIIO_FORCEINLINE void vint16::load (const int *values, int n)
5800 {
5801 #if OIIO_SIMD_AVX >= 512
5802  m_simd = _mm512_maskz_loadu_epi32 (__mmask16(~(0xffff << n)), values);
5803 #else
5804  if (n > 8) {
5805  m_8[0].load (values);
5806  m_8[1].load (values+8, n-8);
5807  } else {
5808  m_8[0].load (values, n);
5809  m_8[1].clear ();
5810  }
5811 #endif
5812 }
5813 
5814 
// Load 16 signed shorts, sign-extending each to int32.
OIIO_FORCEINLINE void vint16::load (const short *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepi16_epi32(_mm256_loadu_si256((__m256i*)values));
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}

// Load 16 unsigned shorts, zero-extending each to int32.
OIIO_FORCEINLINE void vint16::load (const unsigned short *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)values));
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}


// Load 16 signed chars, sign-extending each to int32.
// (Signature line not visible in this rendered view.)
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepi8_epi32(_mm_loadu_si128((__m128i*)values));
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}

// Load 16 unsigned chars, zero-extending each to int32.
OIIO_FORCEINLINE void vint16::load (const unsigned char *values) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_cvtepu8_epi32(_mm_loadu_si128((__m128i*)values));
#else
    m_8[0].load (values);
    m_8[1].load (values+8);
#endif
}



// Constructor from 16 ints: delegates to load(). (The first line of the
// signature is not visible in this rendered view.)
                            int v4, int v5, int v6, int v7,
                            int v8, int v9, int v10, int v11,
                            int v12, int v13, int v14, int v15) {
    load (v0, v1, v2, v3, v4, v5, v6, v7,
          v8, v9, v10, v11, v12, v13, v14, v15);
}

// Pointer constructors: each delegates to the matching load() overload.
OIIO_FORCEINLINE vint16::vint16 (const int *vals) { load (vals); }
OIIO_FORCEINLINE vint16::vint16 (const unsigned short *vals) { load(vals); }
OIIO_FORCEINLINE vint16::vint16 (const short *vals) { load(vals); }
OIIO_FORCEINLINE vint16::vint16 (const unsigned char *vals) { load(vals); }
OIIO_FORCEINLINE vint16::vint16 (const char *vals) { load(vals); }

// Scalar assignment: broadcast a to all lanes.
OIIO_FORCEINLINE const vint16 & vint16::operator= (int a) { load(a); return *this; }
5870 
5871 
// Masked load: fill lanes where the mask bit is set, zero the rest.
// (Signature line not visible in this rendered view.)
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_maskz_loadu_epi32 (mask, (const simd_t *)values);
#else
    m_8[0].load_mask (mask.lo(), values);
    m_8[1].load_mask (mask.hi(), values+8);
#endif
}


// Masked store: write only the lanes whose mask bit is set.
// (Signature line not visible in this rendered view.)
#if OIIO_SIMD_AVX >= 512
    _mm512_mask_storeu_epi32 (values, mask.bitmask(), m_simd);
#else
    lo().store_mask (mask.lo(), values);
    hi().store_mask (mask.hi(), values+8);
#endif
}


// Gather: lane i = element at (char*)baseptr + vindex[i]*scale.
template <int scale>
OIIO_FORCEINLINE void
vint16::gather (const value_t *baseptr, const vint_t& vindex) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_i32gather_epi32 (vindex, baseptr, scale);
#else
    m_8[0].gather<scale> (baseptr, vindex.lo());
    m_8[1].gather<scale> (baseptr, vindex.hi());
#endif
}

// Masked gather: only lanes with their mask bit set are loaded; the rest
// keep their previous contents (m_simd is passed as the pass-through source).
template<int scale>
OIIO_FORCEINLINE void
vint16::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex) {
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_mask_i32gather_epi32 (m_simd, mask, vindex, baseptr, scale);
#else
    m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
    m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
#endif
}

// Scatter: element at (char*)baseptr + vindex[i]*scale = lane i.
template<int scale>
OIIO_FORCEINLINE void
vint16::scatter (value_t *baseptr, const vint_t& vindex) const {
#if OIIO_SIMD_AVX >= 512
    _mm512_i32scatter_epi32 (baseptr, vindex, m_simd, scale);
#else
    lo().scatter<scale> (baseptr, vindex.lo());
    hi().scatter<scale> (baseptr, vindex.hi());
#endif
}

// Masked scatter: write only lanes whose mask bit is set. (The first line
// of the signature is not visible in this rendered view.)
template<int scale>
OIIO_FORCEINLINE void
                      const vint_t& vindex) const {
#if OIIO_SIMD_AVX >= 512
    _mm512_mask_i32scatter_epi32 (baseptr, mask, vindex, m_simd, scale);
#else
    lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
    hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
#endif
}
5936 
5937 
// Store all 16 lanes to (possibly unaligned) memory.
// (Signature line not visible in this rendered view.)
#if OIIO_SIMD_AVX >= 512
    // Use an unaligned store -- it's just as fast when the memory turns
    // out to be aligned, nearly as fast even when unaligned. Not worth
    // the headache of using stores that require alignment.
    _mm512_storeu_si512 ((simd_t *)values, m_simd);
#else
    lo().store (values);
    hi().store (values+8);
#endif
}


// Set all lanes to zero.
#if OIIO_SIMD_AVX >= 512
    m_simd = _mm512_setzero_si512();
#else
    *this = 0;
#endif
}


// Return an all-zeros vint16 constant.
#if OIIO_SIMD_AVX >= 512
    return _mm512_setzero_epi32();
#else
    return 0;
#endif
}



// Iota: lane i = start + i*step. (Signature not visible in this view.)
    return vint16 (start+0*step, start+1*step, start+2*step, start+3*step,
                   start+4*step, start+5*step, start+6*step, start+7*step,
                   start+8*step, start+9*step, start+10*step, start+11*step,
                   start+12*step, start+13*step, start+14*step, start+15*step);
}


// Giota: lane i = 1<<i, i.e. a single-bit mask per lane position.
// (Signature not visible in this view.)
    return vint16 (1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
                   1<<8, 1<<9, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15);
}
5985 
5986 
// lo(): the low 8 lanes (0-7) as a vint8. (Signature not visible here.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_castsi512_si256 (simd());
#else
    return m_8[0];
#endif
}

// hi(): the high 8 lanes (8-15) as a vint8. (Signature not visible here.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_extracti64x4_epi64 (simd(), 1);
#else
    return m_8[1];
#endif
}


// Construct from two vint8 halves: lo = lanes 0-7, hi = lanes 8-15.
OIIO_FORCEINLINE vint16::vint16 (const vint8& lo, const vint8 &hi) {
#if OIIO_SIMD_AVX >= 512
    __m512i r = _mm512_castsi256_si512 (lo);
    m_simd = _mm512_inserti32x8 (r, hi, 1);
#else
    m_8[0] = lo;
    m_8[1] = hi;
#endif
}


// Construct from four vint4 quarters: a = lanes 0-3 ... d = lanes 12-15.
OIIO_FORCEINLINE vint16::vint16 (const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d) {
#if OIIO_SIMD_AVX >= 512
    // Broadcast a everywhere, then overwrite quarters 1-3 with b, c, d.
    m_simd = _mm512_broadcast_i32x4(a);
    m_simd = _mm512_inserti32x4 (m_simd, b, 1);
    m_simd = _mm512_inserti32x4 (m_simd, c, 2);
    m_simd = _mm512_inserti32x4 (m_simd, d, 3);
#else
    m_8[0] = vint8(a,b);
    m_8[1] = vint8(c,d);
#endif
}
6026 
6027 
// operator+ : lane-wise addition. (Signature not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_add_epi32 (a.simd(), b.simd());
#else
    return vint16 (a.lo()+b.lo(), a.hi()+b.hi());
#endif
}


// operator+= delegates to operator+.
    return a = a + b;
}


// Unary minus: 0 - a, lane-wise.
#if OIIO_SIMD_AVX >= 512
    return _mm512_sub_epi32 (_mm512_setzero_si512(), a);
#else
    return vint16 (-a.lo(), -a.hi());
#endif
}


// operator- : lane-wise subtraction.
#if OIIO_SIMD_AVX >= 512
    return _mm512_sub_epi32 (a.simd(), b.simd());
#else
    return vint16 (a.lo()-b.lo(), a.hi()-b.hi());
#endif
}


// operator-= delegates to operator-.
    return a = a - b;
}


// operator* : lane-wise multiply, keeping the low 32 bits of each product.
#if OIIO_SIMD_AVX >= 512
    return _mm512_mullo_epi32 (a.simd(), b.simd());
#else
    return vint16 (a.lo()*b.lo(), a.hi()*b.hi());
#endif
}


OIIO_FORCEINLINE const vint16& operator*= (vint16& a, const vint16& b) { return a = a * b; }
OIIO_FORCEINLINE const vint16& operator*= (vint16& a, int b) { return a = a * b; }


// operator/ : lane-wise division, performed scalar per lane.
// NO INTEGER DIVISION IN AVX512!
    SIMD_RETURN (vint16, a[i] / b[i]);
}

OIIO_FORCEINLINE const vint16& operator/= (vint16& a, const vint16& b) { return a = a / b; }


// operator% : lane-wise modulus, performed scalar per lane.
// NO INTEGER MODULUS IN AVX512!
    SIMD_RETURN (vint16, a[i] % b[i]);
}

OIIO_FORCEINLINE const vint16& operator%= (vint16& a, const vint16& b) { return a = a % b; }

// operator% by a scalar w, performed scalar per lane.
// NO INTEGER MODULUS in AVX512!
    SIMD_RETURN (vint16, a[i] % w);
}

OIIO_FORCEINLINE const vint16& operator%= (vint16& a, int b) { return a = a % b; }
6099 
6100 
// operator& : lane-wise bitwise AND. (Signature not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_and_si512 (a.simd(), b.simd());
#else
    return vint16 (a.lo() & b.lo(), a.hi() & b.hi());
#endif
}

OIIO_FORCEINLINE const vint16& operator&= (vint16& a, const vint16& b) { return a = a & b; }

// operator| : lane-wise bitwise OR.
#if OIIO_SIMD_AVX >= 512
    return _mm512_or_si512 (a.simd(), b.simd());
#else
    return vint16 (a.lo() | b.lo(), a.hi() | b.hi());
#endif
}

OIIO_FORCEINLINE const vint16& operator|= (vint16& a, const vint16& b) { return a = a | b; }

// operator^ : lane-wise bitwise XOR.
#if OIIO_SIMD_AVX >= 512
    return _mm512_xor_si512 (a.simd(), b.simd());
#else
    return vint16 (a.lo() ^ b.lo(), a.hi() ^ b.hi());
#endif
}

OIIO_FORCEINLINE const vint16& operator^= (vint16& a, const vint16& b) { return a = a ^ b; }


// operator~ : lane-wise bitwise NOT, via XOR with all-ones.
#if OIIO_SIMD_AVX >= 512
    return a ^ a.NegOne();
#else
    return vint16 (~a.lo(), ~a.hi());
#endif
}


// operator<< : shift every lane left by the same (runtime) bit count.
OIIO_FORCEINLINE vint16 operator<< (const vint16& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_sllv_epi32 (a, vint16(int(bits)));
    // return _mm512_slli_epi32 (a, bits);
    // FIXME: can this be slli?
#else
    return vint16 (a.lo() << bits, a.hi() << bits);
#endif
}


OIIO_FORCEINLINE const vint16& operator<<= (vint16& a, const unsigned int bits) {
    return a = a << bits;
}

// operator>> : ARITHMETIC (sign-extending) right shift of every lane.
OIIO_FORCEINLINE vint16 operator>> (const vint16& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_srav_epi32 (a, vint16(int(bits)));
    // FIXME: can this be srai?
#else
    return vint16 (a.lo() >> bits, a.hi() >> bits);
#endif
}

OIIO_FORCEINLINE const vint16& operator>>= (vint16& a, const unsigned int bits) {
    return a = a >> bits;
}


// srl : LOGICAL (zero-filling) right shift of every lane.
OIIO_FORCEINLINE vint16 srl (const vint16& a, const unsigned int bits) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_srlv_epi32 (a, vint16(int(bits)));
    // FIXME: can this be srli?
#else
    return vint16 (srl(a.lo(), bits), srl (a.hi(), bits));
#endif
}
6178 
6179 
// operator== : lane-wise equality, producing a vbool16 mask.
// (Signature lines for these comparisons are not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 0 /*_MM_CMPINT_EQ*/);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
#endif
}


// operator!= : lane-wise inequality.
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 4 /*_MM_CMPINT_NEQ*/);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
#endif
}


// operator> : lane-wise signed greater-than.
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 6 /*_MM_CMPINT_NLE*/);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
#endif
}


// operator< : lane-wise signed less-than.
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 1 /*_MM_CMPINT_LT*/);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
#endif
}


// operator>= : lane-wise signed greater-or-equal.
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 5 /*_MM_CMPINT_NLT*/);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
#endif
}


// operator<= : lane-wise signed less-or-equal.
#if OIIO_SIMD_AVX >= 512
    return _mm512_cmp_epi32_mask (a.simd(), b.simd(), 2 /*_MM_CMPINT_LE*/);
#else /* Fall back to 8-wide */
    return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
#endif
}
6232 
6233 
6234 inline std::ostream& operator<< (std::ostream& cout, const vint16& val) {
6235  cout << val[0];
6236  for (int i = 1; i < val.elements; ++i)
6237  cout << ' ' << val[i];
6238  return cout;
6239 }
6240 
6241 
6242 
// Store only the first n lanes to memory; later elements are untouched.
OIIO_FORCEINLINE void vint16::store (int *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if 0 && OIIO_SIMD_AVX >= 512
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm512_mask_storeu_epi32 (values, __mmask16(~(0xffff << n)), m_simd);
#else
    // Split into 8-wide partial stores.
    if (n > 8) {
        m_8[0].store (values);
        m_8[1].store (values+8, n-8);
    } else {
        m_8[0].store (values, n);
    }
#endif
}
6259 
6260 
6261 OIIO_FORCEINLINE void vint16::store (unsigned short *values) const {
6262 #if OIIO_SIMD_AVX512
6263  _mm512_mask_cvtepi32_storeu_epi16 (values, __mmask16(0xff), m_simd);
6264 #elif OIIO_SIMD_AVX >= 2
6265  lo().store (values);
6266  hi().store (values+8);
6267 #else
6268  SIMD_DO (values[i] = m_val[i]);
6269 #endif
6270 }
6271 
6272 
6273 OIIO_FORCEINLINE void vint16::store (unsigned char *values) const {
6274 #if OIIO_SIMD_AVX512
6275  _mm512_mask_cvtepi32_storeu_epi8 (values, __mmask16(0xff), m_simd);
6276 #elif OIIO_SIMD_AVX >= 2
6277  lo().store (values);
6278  hi().store (values+8);
6279 #else
6280  SIMD_DO (values[i] = m_val[i]);
6281 #endif
6282 }
6283 
6284 
6285 
// Shuffle groups of 4: result 4-lane group g comes from input group i_g.
template<int i0, int i1, int i2, int i3>
vint16 shuffle4 (const vint16& a) {
#if OIIO_SIMD_AVX >= 512
    __m512 x = _mm512_castsi512_ps(a);
    return _mm512_castps_si512(_mm512_shuffle_f32x4(x,x,_MM_SHUFFLE(i3,i2,i1,i0)));
#else
    // Fallback: spill to memory and reassemble from vint4 pieces.
    vint4 x[4];
    a.store ((int *)x);
    return vint16 (x[i0], x[i1], x[i2], x[i3]);
#endif
}

// Broadcast one 4-lane group into all four group positions.
template<int i> vint16 shuffle4 (const vint16& a) {
    return shuffle4<i,i,i,i> (a);
}

// Shuffle lanes within each 4-lane group (same pattern applied per group).
template<int i0, int i1, int i2, int i3>
vint16 shuffle (const vint16& a) {
#if OIIO_SIMD_AVX >= 512
    __m512 x = _mm512_castsi512_ps(a);
    return _mm512_castps_si512(_mm512_permute_ps(x,_MM_SHUFFLE(i3,i2,i1,i0)));
#else
    vint4 x[4];
    a.store ((int *)x);
    return vint16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
                   shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
#endif
}

// Broadcast lane i of each 4-lane group within that group.
template<int i> vint16 shuffle (const vint16& a) {
    return shuffle<i,i,i,i> (a);
}
6319 
6320 
// extract<i>: return lane i as a scalar. (Signature line not visible here.)
template<int i>
    return a[i];
}


// insert<i>: return a copy of a with lane i replaced by val.
// (Signature line not visible here.)
template<int i>
    vint16 tmp = a;
    tmp[i] = val;
    return tmp;
}


// x(): lane 0, read from the low 128 bits when AVX-512 is available.
#if OIIO_SIMD_AVX >= 512
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(m_simd));
#else
    return m_val[0];
#endif
}

// y/z/w: lanes 1-3, read directly from the scalar view.
OIIO_FORCEINLINE int vint16::y () const { return m_val[1]; }
OIIO_FORCEINLINE int vint16::z () const { return m_val[2]; }
OIIO_FORCEINLINE int vint16::w () const { return m_val[3]; }


// bitcast_to_int(vbool16): each set mask bit becomes an all-ones (-1) lane.
// (Signature line not visible here.)
{
#if OIIO_SIMD_AVX >= 512
    return _mm512_maskz_set1_epi32 (x, -1);
#else
    return vint16 (bitcast_to_int(x.lo()), bitcast_to_int(x.hi()));
#endif
}
6360 
6361 
// vreduce_add: horizontal sum of all 16 lanes, replicated into every lane.
#if OIIO_SIMD_AVX >= 512
    // Nomenclature: ABCD are the vint4's comprising v
    // First, add the vint4's and make them all the same
    vint16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v);  // each adjacent vint4 is summed
    vint16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD); // ABCD in all quads
    // Now, add within each vint4
    vint16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w);  // each adjacent int is summed
    return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
#else
    vint8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
    return vint16 (sum, sum);
#endif
}


// reduce_add: horizontal sum of all 16 lanes, returned as a scalar int.
#if OIIO_SIMD_AVX >= 512
    return vreduce_add(v).x();
#else
    return reduce_add(v.lo()) + reduce_add(v.hi());
#endif
}


// reduce_and: bitwise AND of all 16 lanes, returned as a scalar int.
#if OIIO_SIMD_AVX >= 512
    // Nomenclature: ABCD are the vint4's comprising v
    // First, AND the vint4's so every quad holds the same values
    vint16 AB_AB_CD_CD = v & shuffle4<1,0,3,2>(v);  // each adjacent vint4 is ANDed
    vint16 w = AB_AB_CD_CD & shuffle4<2,3,0,1>(AB_AB_CD_CD);
    // Now, AND within each vint4
    vint16 ab_ab_cd_cd = w & shuffle<1,0,3,2>(w);  // each adjacent int is ANDed
    vint16 r = ab_ab_cd_cd & shuffle<2,3,0,1>(ab_ab_cd_cd);
    return r.x();
#else
    return reduce_and(v.lo()) & reduce_and(v.hi());
#endif
}


// reduce_or: bitwise OR of all 16 lanes, returned as a scalar int.
#if OIIO_SIMD_AVX >= 512
    // Nomenclature: ABCD are the vint4's comprising v
    // First, OR the vint4's so every quad holds the same values
    vint16 AB_AB_CD_CD = v | shuffle4<1,0,3,2>(v);  // each adjacent vint4 is ORed
    vint16 w = AB_AB_CD_CD | shuffle4<2,3,0,1>(AB_AB_CD_CD);
    // Now, OR within each vint4
    vint16 ab_ab_cd_cd = w | shuffle<1,0,3,2>(w);  // each adjacent int is ORed
    vint16 r = ab_ab_cd_cd | shuffle<2,3,0,1>(ab_ab_cd_cd);
    return r.x();
#else
    return reduce_or(v.lo()) | reduce_or(v.hi());
#endif
}
6417 
6418 
6419 
// blend: per lane, b[i] where mask is set, otherwise a[i].
OIIO_FORCEINLINE vint16 blend (const vint16& a, const vint16& b, const vbool16& mask) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_mask_blend_epi32 (mask, a, b);
#else
    return vint16 (blend (a.lo(), b.lo(), mask.lo()),
                   blend (a.hi(), b.hi(), mask.hi()));
#endif
}


// blend0: per lane, a[i] where mask is set, otherwise 0.
// (Signature line not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_maskz_mov_epi32 (mask, a);
#else
    return vint16 (blend0 (a.lo(), mask.lo()),
                   blend0 (a.hi(), mask.hi()));
#endif
}


// blend0not: per lane, a[i] where mask is clear, otherwise 0.
// (Signature line not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_maskz_mov_epi32 (!mask, a);
#else
    return vint16 (blend0not (a.lo(), mask.lo()),
                   blend0not (a.hi(), mask.hi()));
#endif
}
6448 
6449 OIIO_FORCEINLINE vint16 select (const vbool16& mask, const vint16& a, const vint16& b) {
6450  return blend (b, a, mask);
6451 }
6452 
6453 
// abs: per-lane absolute value. (Signature not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_abs_epi32(a.simd());
#else
    return vint16 (abs(a.lo()), abs(a.hi()));
#endif
}


// min: per-lane signed minimum.
OIIO_FORCEINLINE vint16 min (const vint16& a, const vint16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_min_epi32 (a, b);
#else
    return vint16 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
#endif
}


// max: per-lane signed maximum.
OIIO_FORCEINLINE vint16 max (const vint16& a, const vint16& b) {
#if OIIO_SIMD_AVX >= 512
    return _mm512_max_epi32 (a, b);
#else
    return vint16 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
#endif
}


// rotl: rotate each 32-bit lane left by s bits.
// NOTE(review): appears to assume 0 < s < 32; the 32-s shift count
// degenerates otherwise -- confirm against callers.
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // return _mm512_rol_epi32 (x, s);
    // We want to do this ^^^ but this intrinsic only takes an *immediate*
    // argument for s, and there isn't a way to express in C++ that a
    // parameter must be an immediate/literal value from the caller.
    return (x<<s) | srl(x,32-s);
#else
    return (x<<s) | srl(x,32-s);
#endif
}

// DEPRECATED (2.1) -- use rotl() instead.
OIIO_FORCEINLINE vint16 rotl32 (const vint16& x, const unsigned int k) {
    return rotl(x, k);
}


// andnot: per-lane (~a) & b. (Signature not visible in this view.)
#if OIIO_SIMD_AVX >= 512
    return _mm512_andnot_epi32 (a.simd(), b.simd());
#else
    return vint16 (andnot(a.lo(), b.lo()), andnot(a.hi(), b.hi()));
#endif
}
6506 
6507 
6508 
// Safe per-lane modulus: lanes whose divisor is 0 yield 0 instead of UB.
// NO INTEGER MODULUS IN AVX512! (done scalar per lane; the original
// comment said "SSE", a copy-paste from the narrower versions)
    SIMD_RETURN (vint16, b[i] ? a[i] % b[i] : 0);
}

// Safe modulus by a scalar divisor: all zeros when b == 0.
    return b ? (a % b) : vint16::Zero();
}
6517 
6518 
6519 
6520 
6521 
6522 //////////////////////////////////////////////////////////////////////
6523 // vfloat4 implementation
6524 
6525 
// Construct vfloat4 from a vint4, converting each lane to float.
// (Signature line not visible in this rendered view.)
#if OIIO_SIMD_SSE
    m_simd = _mm_cvtepi32_ps (ival.simd());
#elif OIIO_SIMD_NEON
    m_simd = vcvtq_f32_s32(ival.simd());
#else
    SIMD_CONSTRUCT (float(ival[i]));
#endif
}


// All-zeros vfloat4 constant.
#if OIIO_SIMD_SSE
    return _mm_setzero_ps();
#else
    return vfloat4(0.0f);
#endif
}

// All-ones (1.0f in every lane) vfloat4 constant.
    return vfloat4(1.0f);
}
6548 
6549 OIIO_FORCEINLINE const vfloat4 vfloat4::Iota (float start, float step) {
6550  return vfloat4 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step);
6551 }
6552 
/// Set all components to 0.0
#if OIIO_SIMD_SSE
    m_simd = _mm_setzero_ps();
#else
    load (0.0f);
#endif
}

// Assign from an Imath V4f by reinterpreting its 4 contiguous floats.
// (Signature line not visible in this view.)
    load ((const float *)&v);
    return *this;
}

// Assign from an Imath V3f; the fourth lane is padded with 0.
// (Signature line not visible in this view.)
    load (v[0], v[1], v[2], 0.0f);
    return *this;
}

// Component access, const. (Signature not visible in this view.)
    return m_val[i];
}

// Component access, mutable reference. (Signature not visible in this view.)
    return m_val[i];
}


// Broadcast-load: set all 4 lanes to the scalar val.
#if OIIO_SIMD_SSE
    m_simd = _mm_set1_ps (val);
#elif OIIO_SIMD_NEON
    m_simd = vdupq_n_f32 (val);
#else
    SIMD_CONSTRUCT (val);
#endif
}

// Load 4 explicit values: a = lane 0 ... d = lane 3.
OIIO_FORCEINLINE void vfloat4::load (float a, float b, float c, float d) {
#if OIIO_SIMD_SSE
    // _mm_set_ps takes arguments in reverse (high-to-low) lane order.
    m_simd = _mm_set_ps (d, c, b, a);
#elif OIIO_SIMD_NEON
    float values[4] = { a, b, c, d };
    m_simd = vld1q_f32 (values);
#else
    m_val[0] = a;
    m_val[1] = b;
    m_val[2] = c;
    m_val[3] = d;
#endif
}

/// Load from an array of 4 values
#if OIIO_SIMD_SSE
    m_simd = _mm_loadu_ps (values);
#elif OIIO_SIMD_NEON
    m_simd = vld1q_f32 (values);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
6617 
6618 
// Load only the first n floats; the remaining lanes are zeroed.
OIIO_FORCEINLINE void vfloat4::load (const float *values, int n) {
    OIIO_DASSERT (n >= 0 && n <= elements);
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // Masked load: low n bits select the lanes to fill, maskz zeroes the rest.
    m_simd = _mm_maskz_loadu_ps (__mmask8(~(0xf << n)), values);
#elif OIIO_SIMD_SSE
    switch (n) {
    case 1:
        m_simd = _mm_load_ss (values);
        break;
    case 2:
        // Trickery: load one double worth of bits!
        m_simd = _mm_castpd_ps (_mm_load_sd ((const double*)values));
        break;
    case 3:
        m_simd = _mm_setr_ps (values[0], values[1], values[2], 0.0f);
        // This looks wasteful, but benchmarks show that it's the
        // fastest way to set 3 values with the 4th getting zero.
        // Actually, gcc and clang both turn it into something more
        // efficient than _mm_setr_ps. The version below looks smart,
        // but was much more expensive as the _mm_setr_ps!
        //   __m128 xy = _mm_castsi128_ps(_mm_loadl_epi64((const __m128i*)values));
        //   m_simd = _mm_movelh_ps(xy, _mm_load_ss (values + 2));
        break;
    case 4:
        m_simd = _mm_loadu_ps (values);
        break;
    default:
        clear();
        break;
    }
#elif OIIO_SIMD_NEON
    switch (n) {
    case 1: m_simd = vdupq_n_f32(0); m_simd[0] = values[0]; break;
    case 2: load (values[0], values[1], 0.0f, 0.0f); break;
    case 3: load (values[0], values[1], values[2], 0.0f); break;
    case 4: m_simd = vld1q_f32 (values); break;
    default: break;
    }
#else
    // Scalar fallback: copy n values, zero the rest of the padded storage.
    for (int i = 0; i < n; ++i)
        m_val[i] = values[i];
    for (int i = n; i < paddedelements; ++i)
        m_val[i] = 0;
#endif
}
6664 
6665 
// Load 4 unsigned shorts, converting each to float (via an int load).
OIIO_FORCEINLINE void vfloat4::load (const unsigned short *values) {
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
    // You might guess that the following is faster, but it's NOT:
    //   NO! m_simd = _mm_cvtpu16_ps (*(__m64*)values);
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}


// Load 4 signed shorts, converting each to float.
// (Signature line not visible in this view.)
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}


// Load 4 unsigned chars, converting each to float.
OIIO_FORCEINLINE void vfloat4::load (const unsigned char *values) {
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}

// Load from an array of 4 char values, convert to float
// (Signature line not visible in this view.)
#if OIIO_SIMD_SSE >= 2
    m_simd = _mm_cvtepi32_ps (vint4(values).simd());
#else
    SIMD_CONSTRUCT (values[i]);
#endif
}
6702 
#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
// Load 4 half-precision floats, converting each to a full float lane.
OIIO_FORCEINLINE void vfloat4::load (const half *values) {
#if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
    /* Enabled 16 bit float instructions! */
    // Load the 4 halves (64 bits) and use the hardware F16C converter.
    __m128i a = _mm_castpd_si128 (_mm_load_sd ((const double *)values));
    m_simd = _mm_cvtph_ps (a);
#elif OIIO_SIMD_SSE >= 2
    // SSE half-to-float by Fabian "ryg" Giesen. Public domain.
    // https://gist.github.com/rygorous/2144712
    // Rebuilds sign, exponent, and mantissa with integer ops; special-cases
    // Inf/NaN via the was_infnan comparison.
    vint4 h ((const unsigned short *)values);
# define CONSTI(name) *(const __m128i *)&name
# define CONSTF(name) *(const __m128 *)&name
    OIIO_SIMD_UINT4_CONST(mask_nosign, 0x7fff);
    OIIO_SIMD_UINT4_CONST(magic,       (254 - 15) << 23);
    OIIO_SIMD_UINT4_CONST(was_infnan,  0x7bff);
    OIIO_SIMD_UINT4_CONST(exp_infnan,  255 << 23);
    __m128i mnosign     = CONSTI(mask_nosign);
    __m128i expmant     = _mm_and_si128(mnosign, h);
    __m128i justsign    = _mm_xor_si128(h, expmant);
    __m128i expmant2    = expmant; // copy (just here for counting purposes)
    __m128i shifted     = _mm_slli_epi32(expmant, 13);
    __m128  scaled      = _mm_mul_ps(_mm_castsi128_ps(shifted), *(const __m128 *)&magic);
    __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant2, CONSTI(was_infnan));
    __m128i sign        = _mm_slli_epi32(justsign, 16);
    __m128  infnanexp   = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), CONSTF(exp_infnan));
    __m128  sign_inf    = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
    __m128  final       = _mm_or_ps(scaled, sign_inf);
    // ~11 SSE2 ops.
    m_simd = final;
# undef CONSTI
# undef CONSTF
#else /* No SIMD defined: */
    SIMD_CONSTRUCT (values[i]);
#endif
}
#endif /* _HALF_H_ or _IMATH_H_ */
6739 
// Store all 4 lanes to (possibly unaligned) memory.
OIIO_FORCEINLINE void vfloat4::store (float *values) const {
#if OIIO_SIMD_SSE
    // Use an unaligned store -- it's just as fast when the memory turns
    // out to be aligned, nearly as fast even when unaligned. Not worth
    // the headache of using stores that require alignment.
    _mm_storeu_ps (values, m_simd);
#elif OIIO_SIMD_NEON
    vst1q_f32 (values, m_simd);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
6752 
// Store only the first n lanes; later elements of values are untouched.
OIIO_FORCEINLINE void vfloat4::store (float *values, int n) const {
    OIIO_DASSERT (n >= 0 && n <= 4);
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // This SHOULD be fast, but in my benchmarks, it is slower!
    // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
    // Re-test this periodically with new Intel hardware.
    _mm_mask_storeu_ps (values, __mmask8(~(0xf << n)), m_simd);
#elif OIIO_SIMD_SSE
    switch (n) {
    case 1:
        _mm_store_ss (values, m_simd);
        break;
    case 2:
        // Trickery: store two floats as a double worth of bits
        _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
        break;
    case 3:
        values[0] = m_val[0];
        values[1] = m_val[1];
        values[2] = m_val[2];
        // This looks wasteful, but benchmarks show that it's the
        // fastest way to store 3 values, in benchmarks was faster than
        // this, below:
        //   _mm_store_sd ((double*)values, _mm_castps_pd(m_simd));
        //   _mm_store_ss (values + 2, _mm_movehl_ps(m_simd,m_simd));
        break;
    case 4:
        store (values);
        break;
    default:
        break;
    }
#elif OIIO_SIMD_NEON
    switch (n) {
    case 1:
        vst1q_lane_f32 (values, m_simd, 0);
        break;
    case 2:
        vst1q_lane_f32 (values++, m_simd, 0);
        vst1q_lane_f32 (values, m_simd, 1);
        break;
    case 3:
        vst1q_lane_f32 (values++, m_simd, 0);
        vst1q_lane_f32 (values++, m_simd, 1);
        vst1q_lane_f32 (values, m_simd, 2);
        break;
    case 4:
        vst1q_f32 (values, m_simd); break;
    default:
        break;
    }
#else
    for (int i = 0; i < n; ++i)
        values[i] = m_val[i];
#endif
}
6809 
#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
// Store all 4 lanes as half-precision floats (round to nearest even).
OIIO_FORCEINLINE void vfloat4::store (half *values) const {
#if OIIO_F16C_ENABLED && OIIO_SIMD_SSE
    // Hardware F16C conversion; the 4 halves occupy 64 bits.
    __m128i h = _mm_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
    _mm_store_sd ((double *)values, _mm_castsi128_pd(h));
#else
    // Scalar fallback relies on half's float conversion operator.
    SIMD_DO (values[i] = m_val[i]);
#endif
}
#endif
6820 
6821 
// Masked load from a bitmask int: lane i = values[i] if bit i is set, else 0.
OIIO_FORCEINLINE void vfloat4::load_mask (int mask, const float *values) {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    m_simd = _mm_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
#elif OIIO_SIMD_AVX
    // Expand the bitmask to a per-lane mask for AVX maskload.
    m_simd = _mm_maskload_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)));
#else
    SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
#endif
}


// Masked load from a vbool mask: lane i = values[i] if mask[i], else 0.
OIIO_FORCEINLINE void vfloat4::load_mask (const vbool_t& mask, const float *values) {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    m_simd = _mm_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
#elif OIIO_SIMD_AVX
    m_simd = _mm_maskload_ps (values, _mm_castps_si128(mask));
#else
    SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
#endif
}


// Masked store from a bitmask int: write only the lanes whose bit is set.
OIIO_FORCEINLINE void vfloat4::store_mask (int mask, float *values) const {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    _mm_mask_storeu_ps (values, __mmask8(mask), m_simd);
#elif OIIO_SIMD_AVX
    _mm_maskstore_ps (values, _mm_castps_si128(vbool_t::from_bitmask(mask)), m_simd);
#else
    SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
#endif
}


// Masked store from a vbool mask: write only lanes where mask[i] is true.
OIIO_FORCEINLINE void vfloat4::store_mask (const vbool_t& mask, float *values) const {
#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    _mm_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
#elif OIIO_SIMD_AVX
    _mm_maskstore_ps (values, _mm_castps_si128(mask.simd()), m_simd);
#else
    SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
#endif
}
6864 
6865 
6866 template <int scale>
6867 OIIO_FORCEINLINE void
6868 vfloat4::gather (const value_t *baseptr, const vint_t& vindex)
6869 {
6870 #if OIIO_SIMD_AVX >= 2
6871  m_simd = _mm_i32gather_ps (baseptr, vindex, scale);
6872 #else
6873  SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
6874 #endif
6875 }
6876 
/// Masked gather: lanes selected by `mask` are loaded from
/// (char*)baseptr + vindex[i]*scale.
template<int scale>
OIIO_FORCEINLINE void
vfloat4::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
{
#if OIIO_SIMD_AVX >= 2
    // src operand is m_simd, so masked-off lanes keep their previous value.
    m_simd = _mm_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
#else
    // NOTE(review): this fallback zeroes masked-off lanes, whereas the AVX2
    // path above preserves their prior contents -- confirm which behavior
    // callers rely on for unselected lanes.
    SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
#endif
}
6887 
/// Scatter: lane i is stored to the address
/// (char*)baseptr + vindex[i]*scale (scale is a byte multiplier).
template<int scale>
OIIO_FORCEINLINE void
vfloat4::scatter (value_t *baseptr, const vint_t& vindex) const
{
#if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
    // FIXME: disable because it benchmarks slower than the dumb way
    _mm_i32scatter_ps (baseptr, vindex, m_simd, scale);
#else
    SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
#endif
}
6899 
6900 template<int scale>
6901 OIIO_FORCEINLINE void
6903  const vint_t& vindex) const
6904 {
6905 #if 0 && OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
6906  // FIXME: disable because it benchmarks slower than the dumb way
6907  _mm_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
6908 #else
6909  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
6910 #endif
6911 }
6912 
6913 
6915 #if OIIO_SIMD_SSE
6916  return _mm_add_ps (a.m_simd, b.m_simd);
6917 #elif OIIO_SIMD_NEON
6918  return vaddq_f32 (a.m_simd, b.m_simd);
6919 #else
6920  SIMD_RETURN (vfloat4, a[i] + b[i]);
6921 #endif
6922 }
6923 
6925 #if OIIO_SIMD_SSE
6926  m_simd = _mm_add_ps (m_simd, a.m_simd);
6927 #elif OIIO_SIMD_NEON
6928  m_simd = vaddq_f32 (m_simd, a.m_simd);
6929 #else
6930  SIMD_DO (m_val[i] += a[i]);
6931 #endif
6932  return *this;
6933  }
6934 
6936 #if OIIO_SIMD_SSE
6937  return _mm_sub_ps (_mm_setzero_ps(), m_simd);
6938 #elif OIIO_SIMD_NEON
6939  return vsubq_f32 (Zero(), m_simd);
6940 #else
6941  SIMD_RETURN (vfloat4, -m_val[i]);
6942 #endif
6943 }
6944 
6946 #if OIIO_SIMD_SSE
6947  return _mm_sub_ps (a.m_simd, b.m_simd);
6948 #elif OIIO_SIMD_NEON
6949  return vsubq_f32 (a.m_simd, b.m_simd);
6950 #else
6951  SIMD_RETURN (vfloat4, a[i] - b[i]);
6952 #endif
6953 }
6954 
6956 #if OIIO_SIMD_SSE
6957  m_simd = _mm_sub_ps (m_simd, a.m_simd);
6958 #elif OIIO_SIMD_NEON
6959  m_simd = vsubq_f32 (m_simd, a.m_simd);
6960 #else
6961  SIMD_DO (m_val[i] -= a[i]);
6962 #endif
6963  return *this;
6964 }
6965 
6967 #if OIIO_SIMD_SSE
6968  return _mm_mul_ps (a.m_simd, _mm_set1_ps(b));
6969 #elif OIIO_SIMD_NEON
6970  return vmulq_n_f32 (a.m_simd, b);
6971 #else
6972  SIMD_RETURN (vfloat4, a[i] * b);
6973 #endif
6974 }
6975 
6977  return b * a;
6978 }
6979 
6981 #if OIIO_SIMD_SSE
6982  return _mm_mul_ps (a.m_simd, b.m_simd);
6983 #elif OIIO_SIMD_NEON
6984  return vmulq_f32 (a.m_simd, b.m_simd);
6985 #else
6986  SIMD_RETURN (vfloat4, a[i] * b[i]);
6987 #endif
6988 }
6989 
6991 #if OIIO_SIMD_SSE
6992  m_simd = _mm_mul_ps (m_simd, a.m_simd);
6993 #elif OIIO_SIMD_NEON
6994  m_simd = vmulq_f32 (m_simd, a.m_simd);
6995 #else
6996  SIMD_DO (m_val[i] *= a[i]);
6997 #endif
6998  return *this;
6999 }
7000 
7002 #if OIIO_SIMD_SSE
7003  m_simd = _mm_mul_ps (m_simd, _mm_set1_ps(val));
7004 #elif OIIO_SIMD_NEON
7005  m_simd = vmulq_n_f32 (m_simd, val);
7006 #else
7007  SIMD_DO (m_val[i] *= val);
7008 #endif
7009  return *this;
7010 }
7011 
7013 #if OIIO_SIMD_SSE
7014  return _mm_div_ps (a.m_simd, b.m_simd);
7015 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7016  return vdivq_f32 (a.m_simd, b.m_simd);
7017 #else
7018  SIMD_RETURN (vfloat4, a[i] / b[i]);
7019 #endif
7020 }
7021 
7023 #if OIIO_SIMD_SSE
7024  m_simd = _mm_div_ps (m_simd, a.m_simd);
7025 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7026  m_simd = vdivq_f32 (m_simd, a.m_simd);
7027 #else
7028  SIMD_DO (m_val[i] /= a[i]);
7029 #endif
7030  return *this;
7031 }
7032 
7034 #if OIIO_SIMD_SSE
7035  m_simd = _mm_div_ps (m_simd, _mm_set1_ps(val));
7036 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7037  m_simd = vdivq_f32 (m_simd, vfloat4(val));
7038 #else
7039  SIMD_DO (m_val[i] /= val);
7040 #endif
7041  return *this;
7042 }
7043 
7045 #if OIIO_SIMD_SSE
7046  return _mm_cmpeq_ps (a.m_simd, b.m_simd);
7047 #elif OIIO_SIMD_NEON
7048  return vceqq_f32 (a.m_simd, b.m_simd);
7049 #else
7050  SIMD_RETURN (vbool4, a[i] == b[i] ? -1 : 0);
7051 #endif
7052 }
7053 
7055 #if OIIO_SIMD_SSE
7056  return _mm_cmpneq_ps (a.m_simd, b.m_simd);
7057 #elif OIIO_SIMD_NEON
7058  // implemented as NOT(a == b)
7059  return vmvnq_u32(vceqq_f32 (a.m_simd, b.m_simd));
7060 #else
7061  SIMD_RETURN (vbool4, a[i] != b[i] ? -1 : 0);
7062 #endif
7063 }
7064 
7066 #if OIIO_SIMD_SSE
7067  return _mm_cmplt_ps (a.m_simd, b.m_simd);
7068 #elif OIIO_SIMD_NEON
7069  return vcltq_f32 (a.m_simd, b.m_simd);
7070 #else
7071  SIMD_RETURN (vbool4, a[i] < b[i] ? -1 : 0);
7072 #endif
7073 }
7074 
7076 #if OIIO_SIMD_SSE
7077  return _mm_cmpgt_ps (a.m_simd, b.m_simd);
7078 #elif OIIO_SIMD_NEON
7079  return vcgtq_f32 (a.m_simd, b.m_simd);
7080 #else
7081  SIMD_RETURN (vbool4, a[i] > b[i] ? -1 : 0);
7082 #endif
7083 }
7084 
7086 #if OIIO_SIMD_SSE
7087  return _mm_cmpge_ps (a.m_simd, b.m_simd);
7088 #elif OIIO_SIMD_NEON
7089  return vcgeq_f32 (a.m_simd, b.m_simd);
7090 #else
7091  SIMD_RETURN (vbool4, a[i] >= b[i] ? -1 : 0);
7092 #endif
7093 }
7094 
7096 #if OIIO_SIMD_SSE
7097  return _mm_cmple_ps (a.m_simd, b.m_simd);
7098 #elif OIIO_SIMD_NEON
7099  return vcleq_f32 (a.m_simd, b.m_simd);
7100 #else
7101  SIMD_RETURN (vbool4, a[i] <= b[i] ? -1 : 0);
7102 #endif
7103 }
7104 
7106 #if OIIO_SIMD_SSE
7107  return _mm_movelh_ps (a.m_simd, b.m_simd);
7108 #else
7109  return vfloat4 (a[0], a[1], b[0], b[1]);
7110 #endif
7111 }
7112 
7114 #if OIIO_SIMD_SSE
7115  return _mm_unpacklo_ps (a.m_simd, b.m_simd);
7116 #else
7117  return vfloat4 (a[0], b[0], a[1], b[1]);
7118 #endif
7119 }
7120 
7122  return insert<3>(*this, 0.0f);
7123 }
7124 
7126  return insert<3>(*this, 1.0f);
7127 }
7128 
7129 inline std::ostream& operator<< (std::ostream& cout, const vfloat4& val) {
7130  cout << val[0];
7131  for (int i = 1; i < val.elements; ++i)
7132  cout << ' ' << val[i];
7133  return cout;
7134 }
7135 
7136 
7137 // Implementation had to be after the definition of vfloat4.
7139 {
7140 #if OIIO_SIMD_SSE
7141  m_simd = _mm_cvttps_epi32(f.simd());
7142 #else
7143  SIMD_CONSTRUCT ((int) f[i]);
7144 #endif
7145 }
7146 
7147 
7148 template<int i0, int i1, int i2, int i3>
7150 #if OIIO_SIMD_SSE
7151  return shuffle_sse<i0,i1,i2,i3> (__m128(a));
7152 #else
7153  return vfloat4(a[i0], a[i1], a[i2], a[i3]);
7154 #endif
7155 }
7156 
7157 template<int i> OIIO_FORCEINLINE vfloat4 shuffle (const vfloat4& a) { return shuffle<i,i,i,i>(a); }
7158 
#if OIIO_SIMD_NEON
// NEON broadcast specializations: duplicate one lane across the whole
// register via vdupq_lane_f32 on the appropriate 64-bit half.
template<> OIIO_FORCEINLINE vfloat4 shuffle<0> (const vfloat4& a) {
    float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,0);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<1> (const vfloat4& a) {
    float32x2_t t = vget_low_f32(a.simd()); return vdupq_lane_f32(t,1);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<2> (const vfloat4& a) {
    float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,0);
}
template<> OIIO_FORCEINLINE vfloat4 shuffle<3> (const vfloat4& a) {
    float32x2_t t = vget_high_f32(a.simd()); return vdupq_lane_f32(t,1);
}
#endif
7173 
7174 
7175 
7176 /// Helper: as rapid as possible extraction of one component, when the
7177 /// index is fixed.
7178 template<int i>
7180 #if OIIO_SIMD_SSE
7181  return _mm_cvtss_f32(shuffle_sse<i,i,i,i>(a.simd()));
7182 #else
7183  return a[i];
7184 #endif
7185 }
7186 
#if OIIO_SIMD_SSE
// Special case: lane 0 needs no shuffle, just a scalar move out of xmm.
template<> OIIO_FORCEINLINE float extract<0> (const vfloat4& a) {
    return _mm_cvtss_f32(a.simd());
}
#endif
7192 
7193 
7194 /// Helper: substitute val for a[i]
7195 template<int i>
7197 #if OIIO_SIMD_SSE >= 4
7198  return _mm_insert_ps (a, _mm_set_ss(val), i<<4);
7199 #else
7200  vfloat4 tmp = a;
7201  tmp[i] = val;
7202  return tmp;
7203 #endif
7204 }
7205 
#if OIIO_SIMD_SSE
// Slightly faster special cases for SSE
// Lane 0 can be replaced with a single move-scalar instruction.
template<> OIIO_FORCEINLINE vfloat4 insert<0> (const vfloat4& a, float val) {
    return _mm_move_ss (a.simd(), _mm_set_ss(val));
}
#endif
7212 
7213 
// Named component accessors and mutators, built on the fixed-index
// extract/insert helpers so they compile to single instructions on SSE.
OIIO_FORCEINLINE float vfloat4::x () const { return extract<0>(*this); }
OIIO_FORCEINLINE float vfloat4::y () const { return extract<1>(*this); }
OIIO_FORCEINLINE float vfloat4::z () const { return extract<2>(*this); }
OIIO_FORCEINLINE float vfloat4::w () const { return extract<3>(*this); }
OIIO_FORCEINLINE void vfloat4::set_x (float val) { *this = insert<0>(*this, val); }
OIIO_FORCEINLINE void vfloat4::set_y (float val) { *this = insert<1>(*this, val); }
OIIO_FORCEINLINE void vfloat4::set_z (float val) { *this = insert<2>(*this, val); }
OIIO_FORCEINLINE void vfloat4::set_w (float val) { *this = insert<3>(*this, val); }
7222 
7223 
7225 {
7226 #if OIIO_SIMD_SSE
7227  return _mm_castps_si128 (x.simd());
7228 #else
7229  return *(vint4 *)&x;
7230 #endif
7231 }
7232 
7234 {
7235 #if OIIO_SIMD_SSE
7236  return _mm_castsi128_ps (x.simd());
7237 #else
7238  return *(vfloat4 *)&x;
7239 #endif
7240 }
7241 
7242 
// Old names: thin wrappers kept for backward compatibility; prefer
// bitcast_to_int / bitcast_to_float in new code.
inline vint4 bitcast_to_int4 (const vfloat4& x) { return bitcast_to_int(x); }
inline vfloat4 bitcast_to_float4 (const vint4& x) { return bitcast_to_float(x); }
7246 
7247 
7248 
7250 #if OIIO_SIMD_SSE >= 3
7251  // People seem to agree that SSE3 does add reduction best with 2
7252  // horizontal adds.
7253  // suppose v = (a, b, c, d)
7254  simd::vfloat4 ab_cd = _mm_hadd_ps (v.simd(), v.simd());
7255  // ab_cd = (a+b, c+d, a+b, c+d)
7256  simd::vfloat4 abcd = _mm_hadd_ps (ab_cd.simd(), ab_cd.simd());
7257  // all abcd elements are a+b+c+d
7258  return abcd;
7259 #elif OIIO_SIMD_SSE
7260  // I think this is the best we can do for SSE2, and I'm still not sure
7261  // it's faster than the default scalar operation. But anyway...
7262  // suppose v = (a, b, c, d)
7263  vfloat4 ab_ab_cd_cd = shuffle<1,0,3,2>(v) + v;
7264  // now x = (b,a,d,c) + (a,b,c,d) = (a+b,a+b,c+d,c+d)
7265  vfloat4 cd_cd_ab_ab = shuffle<2,3,0,1>(ab_ab_cd_cd);
7266  // now y = (c+d,c+d,a+b,a+b)
7267  vfloat4 abcd = ab_ab_cd_cd + cd_cd_ab_ab; // a+b+c+d in all components
7268  return abcd;
7269 #else
7270  return vfloat4 (v[0] + v[1] + v[2] + v[3]);
7271 #endif
7272 }
7273 
7274 
7276 #if OIIO_SIMD_SSE
7277  return _mm_cvtss_f32(vreduce_add (v));
7278 #elif OIIO_SIMD_NEON && defined(__aarch64__)
7279  return vaddvq_f32(v);
7280 #else
7281  return v[0] + v[1] + v[2] + v[3];
7282 #endif
7283 }
7284 
7286 #if OIIO_SIMD_SSE >= 4
7287  return _mm_dp_ps (a.simd(), b.simd(), 0xff);
7288 #elif OIIO_SIMD_NEON
7289  float32x4_t ab = vmulq_f32(a, b);
7290  float32x4_t sum1 = vaddq_f32(ab, vrev64q_f32(ab));
7291  return vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
7292 #else
7293  return vreduce_add (a*b);
7294 #endif
7295 }
7296 
/// Full 4-component dot product of a and b, returned as a scalar.
OIIO_FORCEINLINE float dot (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE >= 4
    // dpps imm 0xff: multiply all 4 lanes, broadcast the sum; take lane 0.
    return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0xff));
#else
    return reduce_add (a*b);
#endif
}
7304 
7306 #if OIIO_SIMD_SSE >= 4
7307  return _mm_dp_ps (a.simd(), b.simd(), 0x7f);
7308 #else
7309  return vreduce_add((a*b).xyz0());
7310 #endif
7311 }
7312 
/// 3-component dot product (lane 3 ignored), returned as a scalar.
OIIO_FORCEINLINE float dot3 (const vfloat4 &a, const vfloat4 &b) {
#if OIIO_SIMD_SSE >= 4
    // dpps imm 0x77: multiply/sum only lanes 0-2; result lands in lane 0.
    return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
#else
    // xyz0() zeroes lane 3 so the 4-wide reduction equals the 3-wide dot.
    return reduce_add ((a*b).xyz0());
#endif
}
7320 
7321 
7323 {
7324 #if OIIO_SIMD_SSE >= 4
7325  // SSE >= 4.1 only
7326  return _mm_blendv_ps (a.simd(), b.simd(), mask.simd());
7327 #elif OIIO_SIMD_SSE
7328  // Trick for SSE < 4.1
7329  return _mm_or_ps (_mm_and_ps(mask.simd(), b.simd()),
7330  _mm_andnot_ps(mask.simd(), a.simd()));
7331 #elif OIIO_SIMD_NEON
7332  return vbslq_f32 (mask.simd(), b.simd(), a.simd());
7333 #else
7334  return vfloat4 (mask[0] ? b[0] : a[0],
7335  mask[1] ? b[1] : a[1],
7336  mask[2] ? b[2] : a[2],
7337  mask[3] ? b[3] : a[3]);
7338 #endif
7339 }
7340 
7341 
7343 {
7344 #if OIIO_SIMD_SSE
7345  return _mm_and_ps(mask.simd(), a.simd());
7346 #else
7347  return vfloat4 (mask[0] ? a[0] : 0.0f,
7348  mask[1] ? a[1] : 0.0f,
7349  mask[2] ? a[2] : 0.0f,
7350  mask[3] ? a[3] : 0.0f);
7351 #endif
7352 }
7353 
7354 
7356 {
7357 #if OIIO_SIMD_SSE
7358  return _mm_andnot_ps(mask.simd(), a.simd());
7359 #else
7360  return vfloat4 (mask[0] ? 0.0f : a[0],
7361  mask[1] ? 0.0f : a[1],
7362  mask[2] ? 0.0f : a[2],
7363  mask[3] ? 0.0f : a[3]);
7364 #endif
7365 }
7366 
7367 
7369 #if OIIO_SIMD_SSE
7370  return blend0not (a/b, b == vfloat4::Zero());
7371 #else
7372  return vfloat4 (b[0] == 0.0f ? 0.0f : a[0] / b[0],
7373  b[1] == 0.0f ? 0.0f : a[1] / b[1],
7374  b[2] == 0.0f ? 0.0f : a[2] / b[2],
7375  b[3] == 0.0f ? 0.0f : a[3] / b[3]);
7376 #endif
7377 }
7378 
7379 
7381 {
7382 #if OIIO_SIMD_SSE
7383  return vfloat3(safe_div(a, shuffle<3>(a)).xyz0());
7384 #else
7385  float d = a[3];
7386  return d == 0.0f ? vfloat3 (0.0f) : vfloat3 (a[0]/d, a[1]/d, a[2]/d);
7387 #endif
7388 }
7389 
7390 
7391 
7393 {
7394  return blend (b, a, mask);
7395 }
7396 
7397 
7399 {
7400 #if OIIO_SIMD_SSE
7401  // Just clear the sign bit for cheap fabsf
7402  return _mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
7403 #elif OIIO_SIMD_NEON
7404  return vabsq_f32(a.simd());
7405 #else
7406  SIMD_RETURN (vfloat4, fabsf(a[i]));
7407 #endif
7408 }
7409 
7410 
7412 {
7413  vfloat4 one(1.0f);
7414  return blend (one, -one, a < vfloat4::Zero());
7415 }
7416 
7417 
7419 {
7420 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7421  return _mm_ceil_ps (a);
7422 #else
7423  SIMD_RETURN (vfloat4, ceilf(a[i]));
7424 #endif
7425 }
7426 
7428 {
7429 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7430  return _mm_floor_ps (a);
7431 #else
7432  SIMD_RETURN (vfloat4, floorf(a[i]));
7433 #endif
7434 }
7435 
7437 {
7438 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7439  return _mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
7440 #else
7441  SIMD_RETURN (vfloat4, roundf(a[i]));
7442 #endif
7443 }
7444 
7446 {
7447  // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
7448 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
7449  return vint4(floor(a));
7450 #else
7451  SIMD_RETURN (vint4, (int)floorf(a[i]));
7452 #endif
7453 }
7454 
7455 
7457 {
7458  return vint4 (round(a));
7459 }
7460 
7461 
7463 {
7464 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
7465  // avx512vl directly has rcp14 on float4
7466  vfloat4 r = _mm_rcp14_ps(a);
7467  return r * nmadd(r,a,vfloat4(2.0f));
7468 #elif OIIO_SIMD_AVX512
7469  // Trickery: in and out of the 512 bit registers to use fast approx rcp
7470  vfloat16 r = _mm512_rcp14_ps(_mm512_castps128_ps512(a));
7471  return _mm512_castps512_ps128(r);
7472 #elif OIIO_SIMD_SSE
7473  vfloat4 r = _mm_rcp_ps(a);
7474  return r * nmadd(r,a,vfloat4(2.0f));
7475 #else
7476  SIMD_RETURN (vfloat4, 1.0f/a[i]);
7477 #endif
7478 }
7479 
7480 
7482 {
7483 #if OIIO_SIMD_SSE
7484  return _mm_sqrt_ps (a.simd());
7485 #else
7486  SIMD_RETURN (vfloat4, sqrtf(a[i]));
7487 #endif
7488 }
7489 
7490 
7492 {
7493 #if OIIO_SIMD_SSE
7494  return _mm_div_ps (_mm_set1_ps(1.0f), _mm_sqrt_ps (a.simd()));
7495 #else
7496  SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
7497 #endif
7498 }
7499 
7500 
7502 {
7503 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
7504  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
7505  return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
7506 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
7507  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
7508  return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
7509 #elif OIIO_SIMD_SSE
7510  return _mm_rsqrt_ps (a.simd());
7511 #else
7512  SIMD_RETURN (vfloat4, 1.0f/sqrtf(a[i]));
7513 #endif
7514 }
7515 
7516 
7518 {
7519 #if OIIO_SIMD_SSE
7520  return _mm_min_ps (a, b);
7521 #elif OIIO_SIMD_NEON
7522  return vminq_f32(a, b);
7523 #else
7524  SIMD_RETURN (vfloat4, std::min (a[i], b[i]));
7525 #endif
7526 }
7527 
7529 {
7530 #if OIIO_SIMD_SSE
7531  return _mm_max_ps (a, b);
7532 #elif OIIO_SIMD_NEON
7533  return vmaxq_f32(a, b);
7534 #else
7535  SIMD_RETURN (vfloat4, std::max (a[i], b[i]));
7536 #endif
7537 }
7538 
7539 
7541 #if OIIO_SIMD_SSE
7542  return _mm_andnot_ps (a.simd(), b.simd());
7543 #else
7544  const int *ai = (const int *)&a;
7545  const int *bi = (const int *)&b;
7546  return bitcast_to_float (vint4(~(ai[0]) & bi[0],
7547  ~(ai[1]) & bi[1],
7548  ~(ai[2]) & bi[2],
7549  ~(ai[3]) & bi[3]));
7550 #endif
7551 }
7552 
7553 
7555  const simd::vfloat4& c)
7556 {
7557 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7558  // If we are sure _mm_fmadd_ps intrinsic is available, use it.
7559  return _mm_fmadd_ps (a, b, c);
7560 #elif OIIO_SIMD_NEON
7561  return vmlaq_f32(c.simd(), a.simd(), b.simd());
7562 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7563  // If we directly access the underlying __m128, on some platforms and
7564  // compiler flags, it will turn into fma anyway, even if we don't use
7565  // the intrinsic.
7566  return a.simd() * b.simd() + c.simd();
7567 #else
7568  // Fallback: just use regular math and hope for the best.
7569  return a * b + c;
7570 #endif
7571 }
7572 
7573 
7575  const simd::vfloat4& c)
7576 {
7577 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7578  // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
7579  return _mm_fmsub_ps (a, b, c);
7580 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7581  // If we directly access the underlying __m128, on some platforms and
7582  // compiler flags, it will turn into fma anyway, even if we don't use
7583  // the intrinsic.
7584  return a.simd() * b.simd() - c.simd();
7585 #else
7586  // Fallback: just use regular math and hope for the best.
7587  return a * b - c;
7588 #endif
7589 }
7590 
7591 
7592 
7594  const simd::vfloat4& c)
7595 {
7596 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7597  // If we are sure _mm_fnmadd_ps intrinsic is available, use it.
7598  return _mm_fnmadd_ps (a, b, c);
7599 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7600  // If we directly access the underlying __m128, on some platforms and
7601  // compiler flags, it will turn into fma anyway, even if we don't use
7602  // the intrinsic.
7603  return c.simd() - a.simd() * b.simd();
7604 #else
7605  // Fallback: just use regular math and hope for the best.
7606  return c - a * b;
7607 #endif
7608 }
7609 
7610 
7611 
7613  const simd::vfloat4& c)
7614 {
7615 #if OIIO_SIMD_SSE && OIIO_FMA_ENABLED
7616  // If we are sure _mm_fnmsub_ps intrinsic is available, use it.
7617  return _mm_fnmsub_ps (a, b, c);
7618 #elif OIIO_SIMD_SSE && !defined(_MSC_VER)
7619  // If we directly access the underlying __m128, on some platforms and
7620  // compiler flags, it will turn into fma anyway, even if we don't use
7621  // the intrinsic.
7622  return -(a.simd() * b.simd()) - c.simd();
7623 #else
7624  // Fallback: just use regular math and hope for the best.
7625  return -(a * b) - c;
7626 #endif
7627 }
7628 
7629 
7630 
7631 // Full precision exp() of all components of a SIMD vector.
// Full precision exp() of all components of a SIMD vector.
template<typename T>
OIIO_FORCEINLINE T exp (const T& v)
{
#if OIIO_SIMD_SSE
    // Implementation inspired by:
    // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
    // Which is listed as Copyright (C) 2007 Julien Pommier and distributed
    // under the zlib license.
    typedef typename T::vint_t int_t;
    T x = v;
    // Clamp to the range where float exp() neither overflows nor
    // underflows; the cephes constants below assume it.
    const float exp_hi (88.3762626647949f);
    const float exp_lo (-88.3762626647949f);
    const float cephes_LOG2EF (1.44269504088896341f);   // log2(e)
    const float cephes_exp_C1 (0.693359375f);           // ln(2) high part
    const float cephes_exp_C2 (-2.12194440e-4f);        // ln(2) low part
    // Degree-5 minimax polynomial coefficients for exp on the reduced range.
    const float cephes_exp_p0 (1.9875691500E-4f);
    const float cephes_exp_p1 (1.3981999507E-3f);
    const float cephes_exp_p2 (8.3334519073E-3f);
    const float cephes_exp_p3 (4.1665795894E-2f);
    const float cephes_exp_p4 (1.6666665459E-1f);
    const float cephes_exp_p5 (5.0000001201E-1f);
    T tmp (0.0f);
    T one (1.0f);
    x = min (x, T(exp_hi));
    x = max (x, T(exp_lo));
    // fx = nearest integer to x*log2(e), computed as trunc(x*log2(e)+0.5)
    // then corrected below so it behaves like a round-half-away floor.
    T fx = madd (x, T(cephes_LOG2EF), T(0.5f));
    int_t emm0 = int_t(fx);
    tmp = T(emm0);
    // If truncation rounded up past fx, subtract 1 (mask is 1.0 where tmp>fx).
    T mask = bitcast_to_float (bitcast_to_int(tmp > fx) & bitcast_to_int(one));
    fx = tmp - mask;
    // Range-reduce: x -= fx*ln(2), using the split-constant trick for accuracy.
    tmp = fx * cephes_exp_C1;
    T z = fx * cephes_exp_C2;
    x = x - tmp;
    x = x - z;
    z = x * x;
    // Evaluate the polynomial by Horner's rule via fused multiply-adds.
    T y = cephes_exp_p0;
    y = madd (y, x, cephes_exp_p1);
    y = madd (y, x, cephes_exp_p2);
    y = madd (y, x, cephes_exp_p3);
    y = madd (y, x, cephes_exp_p4);
    y = madd (y, x, cephes_exp_p5);
    y = madd (y, z, x);
    y = y + one;
    // Reconstruct: multiply by 2^fx, built directly in the float exponent
    // field (bias 0x7f, shifted into bits 23..30).
    emm0 = (int_t(fx) + int_t(0x7f)) << 23;
    T pow2n = bitcast_to_float(emm0);
    y = y * pow2n;
    return y;
#else
    SIMD_RETURN (T, expf(v[i]));
#endif
}
7683 
7684 
7685 
7686 // Full precision log() of all components of a SIMD vector.
7687 template<typename T>
7688 OIIO_FORCEINLINE T log (const T& v)
7689 {
7690 #if OIIO_SIMD_SSE
7691  // Implementation inspired by:
7692  // https://github.com/embree/embree/blob/master/common/simd/sse_special.h
7693  // Which is listed as Copyright (C) 2007 Julien Pommier and distributed
7694  // under the zlib license.
7695  typedef typename T::vint_t int_t;
7696  typedef typename T::vbool_t bool_t;
7697  T x = v;
7698  int_t emm0;
7699  T zero (T::Zero());
7700  T one (1.0f);
7701  bool_t invalid_mask = (x <= zero);
7702  const int min_norm_pos ((int)0x00800000);
7703  const int inv_mant_mask ((int)~0x7f800000);
7704  x = max(x, bitcast_to_float(int_t(min_norm_pos))); /* cut off denormalized stuff */
7705  emm0 = srl (bitcast_to_int(x), 23);
7706  /* keep only the fractional part */
7707  x = bitcast_to_float (bitcast_to_int(x) & int_t(inv_mant_mask));
7709  emm0 = emm0 - int_t(0x7f);
7710  T e (emm0);
7711  e = e + one;
7712  // OIIO_SIMD_vFLOAT4_CONST (cephes_SQRTHF, 0.707106781186547524f);
7713  const float cephes_SQRTHF (0.707106781186547524f);
7714  bool_t mask = (x < T(cephes_SQRTHF));
7715  T tmp = bitcast_to_float (bitcast_to_int(x) & bitcast_to_int(mask));
7716  x = x - one;
7717  e = e - bitcast_to_float (bitcast_to_int(one) & bitcast_to_int(mask));
7718  x = x + tmp;
7719  T z = x * x;
7720  const float cephes_log_p0 (7.0376836292E-2f);
7721  const float cephes_log_p1 (- 1.1514610310E-1f);
7722  const float cephes_log_p2 (1.1676998740E-1f);
7723  const float cephes_log_p3 (- 1.2420140846E-1f);
7724  const float cephes_log_p4 (+ 1.4249322787E-1f);
7725  const float cephes_log_p5 (- 1.6668057665E-1f);
7726  const float cephes_log_p6 (+ 2.0000714765E-1f);
7727  const float cephes_log_p7 (- 2.4999993993E-1f);
7728  const float cephes_log_p8 (+ 3.3333331174E-1f);
7729  const float cephes_log_q1 (-2.12194440e-4f);
7730  const float cephes_log_q2 (0.693359375f);
7731  T y = cephes_log_p0;
7732  y = madd (y, x, T(cephes_log_p1));
7733  y = madd (y, x, T(cephes_log_p2));
7734  y = madd (y, x, T(cephes_log_p3));
7735  y = madd (y, x, T(cephes_log_p4));
7736  y = madd (y, x, T(cephes_log_p5));
7737  y = madd (y, x, T(cephes_log_p6));
7738  y = madd (y, x, T(cephes_log_p7));
7739  y = madd (y, x, T(cephes_log_p8));
7740  y = y * x;
7741  y = y * z;
7742  y = madd(e, T(cephes_log_q1), y);
7743  y = nmadd (z, 0.5f, y);
7744  x = x + y;
7745  x = madd (e, T(cephes_log_q2), x);
7746  x = bitcast_to_float (bitcast_to_int(x) | bitcast_to_int(invalid_mask)); // negative arg will be NAN
7747  return x;
7748 #else
7749  SIMD_RETURN (T, logf(v[i]));
7750 #endif
7751 }
7752 
7753 
7754 
7756 {
7757 #if OIIO_SIMD_SSE
7758  _MM_TRANSPOSE4_PS (a.simd(), b.simd(), c.simd(), d.simd());
7759 #else
7760  vfloat4 A (a[0], b[0], c[0], d[0]);
7761  vfloat4 B (a[1], b[1], c[1], d[1]);
7762  vfloat4 C (a[2], b[2], c[2], d[2]);
7763  vfloat4 D (a[3], b[3], c[3], d[3]);
7764  a = A; b = B; c = C; d = D;
7765 #endif
7766 }
7767 
7768 
/// Transpose the 4x4 matrix whose rows are a,b,c,d into r0..r3,
/// leaving the inputs unmodified.
OIIO_FORCEINLINE void transpose (const vfloat4& a, const vfloat4& b, const vfloat4& c, const vfloat4& d,
                                 vfloat4 &r0, vfloat4 &r1, vfloat4 &r2, vfloat4 &r3)
{
#if OIIO_SIMD_SSE
    //_MM_TRANSPOSE4_PS (a, b, c, d);
    // Hand-rolled unpack sequence: the _MM_TRANSPOSE4_PS macro mutates its
    // arguments, which we can't do to const inputs.
    auto l02 = _mm_unpacklo_ps (a, c);
    auto h02 = _mm_unpackhi_ps (a, c);
    auto l13 = _mm_unpacklo_ps (b, d);
    auto h13 = _mm_unpackhi_ps (b, d);
    r0 = vfloat4(_mm_unpacklo_ps (l02, l13));
    r1 = vfloat4(_mm_unpackhi_ps (l02, l13));
    r2 = vfloat4(_mm_unpacklo_ps (h02, h13));
    r3 = vfloat4(_mm_unpackhi_ps (h02, h13));
#else
    r0.load (a[0], b[0], c[0], d[0]);
    r1.load (a[1], b[1], c[1], d[1]);
    r2.load (a[2], b[2], c[2], d[2]);
    r3.load (a[3], b[3], c[3], d[3]);
#endif
}
7789 
7790 
7792 {
7793 #if OIIO_SIMD_SSE
7794  __m128 A = _mm_castsi128_ps (a);
7795  __m128 B = _mm_castsi128_ps (b);
7796  __m128 C = _mm_castsi128_ps (c);
7797  __m128 D = _mm_castsi128_ps (d);
7798  _MM_TRANSPOSE4_PS (A, B, C, D);
7799  a = _mm_castps_si128 (A);
7800  b = _mm_castps_si128 (B);
7801  c = _mm_castps_si128 (C);
7802  d = _mm_castps_si128 (D);
7803 #else
7804  vint4 A (a[0], b[0], c[0], d[0]);
7805  vint4 B (a[1], b[1], c[1], d[1]);
7806  vint4 C (a[2], b[2], c[2], d[2]);
7807  vint4 D (a[3], b[3], c[3], d[3]);
7808  a = A; b = B; c = C; d = D;
7809 #endif
7810 }
7811 
/// Transpose the 4x4 integer matrix whose rows are a,b,c,d into r0..r3,
/// leaving the inputs unmodified.
OIIO_FORCEINLINE void transpose (const vint4& a, const vint4& b, const vint4& c, const vint4& d,
                                 vint4 &r0, vint4 &r1, vint4 &r2, vint4 &r3)
{
#if OIIO_SIMD_SSE
    //_MM_TRANSPOSE4_PS (a, b, c, d);
    // Reuse the float transpose machinery by bit-casting int<->float;
    // no value conversion occurs, only a register reinterpretation.
    __m128 A = _mm_castsi128_ps (a);
    __m128 B = _mm_castsi128_ps (b);
    __m128 C = _mm_castsi128_ps (c);
    __m128 D = _mm_castsi128_ps (d);
    _MM_TRANSPOSE4_PS (A, B, C, D);
    r0 = _mm_castps_si128 (A);
    r1 = _mm_castps_si128 (B);
    r2 = _mm_castps_si128 (C);
    r3 = _mm_castps_si128 (D);
#else
    r0.load (a[0], b[0], c[0], d[0]);
    r1.load (a[1], b[1], c[1], d[1]);
    r2.load (a[2], b[2], c[2], d[2]);
    r3.load (a[3], b[3], c[3], d[3]);
#endif
}
7833 
7834 
7836  const vfloat4& c, const vfloat4& d)
7837 {
7838 #if OIIO_SIMD_SSE
7839  vfloat4 l02 = _mm_unpacklo_ps (a, c);
7840  vfloat4 l13 = _mm_unpacklo_ps (b, d);
7841  return _mm_unpacklo_ps (l02, l13);
7842 #else
7843  return vfloat4 (a[0], b[0], c[0], d[0]);
7844 #endif
7845 }
7846 
7847 
7849  const vint4& c, const vint4& d)
7850 {
7851 #if OIIO_SIMD_SSE
7852  vint4 l02 = _mm_unpacklo_epi32 (a, c);
7853  vint4 l13 = _mm_unpacklo_epi32 (b, d);
7854  return _mm_unpacklo_epi32 (l02, l13);
7855 #else
7856  return vint4 (a[0], b[0], c[0], d[0]);
7857 #endif
7858 }
7859 
7860 
7861 
7862 //////////////////////////////////////////////////////////////////////
7863 // vfloat3 implementation
7864 
7866 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
7867  m_simd = other.m_simd;
7868 #else
7869  SIMD_CONSTRUCT_PAD (other[i]);
7870 #endif
7871 }
7872 
7874 #if OIIO_SIMD_SSE || OIIO_SIMD_NEON
7875  m_simd = other.simd();
7876 #else
7877  SIMD_CONSTRUCT_PAD (other[i]);
7878  m_val[3] = 0.0f;
7879 #endif
7880 }
7881 
7883 
7885 
/// Return (start, start+step, start+2*step), the 3-wide arithmetic ramp.
OIIO_FORCEINLINE const vfloat3 vfloat3::Iota (float start, float step) {
    return vfloat3 (start+0.0f*step, start+1.0f*step, start+2.0f*step);
}
7889 
7890 
// vfloat3 load overloads. All delegate to the underlying vfloat4 loads;
// the padding lane 3 handling follows vfloat4's partial-load behavior.

/// Splat a single value into all three components (lane 3 set to 0).
OIIO_FORCEINLINE void vfloat3::load (float val) { vfloat4::load (val, val, val, 0.0f); }

/// Load 3 contiguous floats.
OIIO_FORCEINLINE void vfloat3::load (const float *values) { vfloat4::load (values, 3); }

/// Load the first n contiguous floats.
OIIO_FORCEINLINE void vfloat3::load (const float *values, int n) {
    vfloat4::load (values, n);
}

/// Load 3 unsigned shorts, converting to float.
OIIO_FORCEINLINE void vfloat3::load (const unsigned short *values) {
    vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
}

/// Load 3 signed shorts, converting to float.
OIIO_FORCEINLINE void vfloat3::load (const short *values) {
    vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
}

/// Load 3 unsigned chars, converting to float.
OIIO_FORCEINLINE void vfloat3::load (const unsigned char *values) {
    vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
}

/// Load 3 signed chars, converting to float.
OIIO_FORCEINLINE void vfloat3::load (const char *values) {
    vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
}

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Load 3 half-floats, converting to float (only when Imath half is visible).
OIIO_FORCEINLINE void vfloat3::load (const half *values) {
    vfloat4::load (float(values[0]), float(values[1]), float(values[2]));
}
#endif /* _HALF_H_ or _IMATH_H_ */
7920 
/// Store the 3 components to contiguous floats (lane 3 is not written).
OIIO_FORCEINLINE void vfloat3::store (float *values) const {
    vfloat4::store (values, 3);
}

/// Store the first n components to contiguous floats.
OIIO_FORCEINLINE void vfloat3::store (float *values, int n) const {
    vfloat4::store (values, n);
}

#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Store the components as half-floats (only when Imath half is visible).
OIIO_FORCEINLINE void vfloat3::store (half *values) const {
    SIMD_DO (values[i] = m_val[i]);
}
#endif
7934 
7936  store ((float *)&vec);
7937 }
7938 
7940  return vfloat3 (vfloat4(a) + vfloat4(b));
7941 }
7942 
7944  *this = *this + a; return *this;
7945 }
7946 
7948  return vfloat3 (-vfloat4(*this));
7949 }
7950 
7952  return vfloat3 (vfloat4(a) - vfloat4(b));
7953 }
7954 
7956  *this = *this - a; return *this;
7957 }
7958 
7960  return vfloat3 (vfloat4(a) * vfloat4(b));
7961 }
7962 
7964  return vfloat3 (vfloat4(a) * b);
7965 }
7966 
7968  return b * a;
7969 }
7970 
7972  *this = *this * a; return *this;
7973 }
7974 
7976  *this = *this * a; return *this;
7977 }
7978 
7980  return vfloat3 (vfloat4(a) / b.xyz1()); // Avoid divide by zero!
7981 }
7982 
7984  *this = *this / a; return *this;
7985 }
7986 
7988  *this = *this / a; return *this;
7989 }
7990 
7991 
7992 inline std::ostream& operator<< (std::ostream& cout, const vfloat3& val) {
7993  cout << val[0];
7994  for (int i = 1; i < val.elements; ++i)
7995  cout << ' ' << val[i];
7996  return cout;
7997 }
7998 
7999 
8001 {
8002 #if OIIO_SIMD_SSE
8003  // Just clear the sign bit for cheap fabsf
8004  return vfloat3(_mm_and_ps (a.simd(), _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff))));
8005 #elif OIIO_SIMD_NEON
8006  return vfloat3(vabsq_f32(a.simd()));
8007 #else
8008  SIMD_RETURN (vfloat3, fabsf(a[i]));
8009 #endif
8010 }
8011 
8012 
8014 {
8015  vfloat3 one(1.0f);
8016  return vfloat3(blend (one, -one, a < vfloat3::Zero()));
8017 }
8018 
8019 
8021 {
8022 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
8023  return vfloat3(_mm_ceil_ps (a));
8024 #else
8025  SIMD_RETURN (vfloat3, ceilf(a[i]));
8026 #endif
8027 }
8028 
8030 {
8031 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
8032  return vfloat3(_mm_floor_ps (a));
8033 #else
8034  SIMD_RETURN (vfloat3, floorf(a[i]));
8035 #endif
8036 }
8037 
8039 {
8040 #if OIIO_SIMD_SSE >= 4 /* SSE >= 4.1 */
8041  return vfloat3(_mm_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)));
8042 #else
8043  SIMD_RETURN (vfloat3, roundf(a[i]));
8044 #endif
8045 }
8046 
8047 
8049 #if OIIO_SIMD_SSE
8050  return vfloat3 ((vreduce_add(vfloat4(v))).xyz0());
8051 #else
8052  return vfloat3 (v[0] + v[1] + v[2]);
8053 #endif
8054 }
8055 
8056 
8058 #if OIIO_SIMD_SSE >= 4
8059  return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77));
8060 #else
8061  return vreduce_add (a*b);
8062 #endif
8063 }
8064 
8065 
/// 3-component dot product of a and b, returned as a scalar.
OIIO_FORCEINLINE float dot (const vfloat3 &a, const vfloat3 &b) {
#if OIIO_SIMD_SSE >= 4
    // dpps imm 0x77: multiply/sum only lanes 0-2; result lands in lane 0.
    return _mm_cvtss_f32 (_mm_dp_ps (a.simd(), b.simd(), 0x77));
#elif OIIO_SIMD
    return reduce_add (a*b);
#else
    return a[0]*b[0] + a[1]*b[1] + a[2]*b[2];
#endif
}
8075 
8076 
8078 #if OIIO_SIMD_SSE >= 4
8079  return vfloat3(_mm_dp_ps (a.simd(), b.simd(), 0x77));
8080 #else
8081  return vfloat3 (vreduce_add((a*b).xyz0()).xyz0());
8082 #endif
8083 }
8084 
8085 
8087 {
8088  return dot(*this, *this);
8089 }
8090 
8091 
8093 {
8094  return sqrtf(dot(*this, *this));
8095 }
8096 
8097 
8099 {
8100 #if OIIO_SIMD
8101  vfloat3 len2 = vdot3 (*this, *this);
8102  return vfloat3 (safe_div (*this, sqrt(len2)));
8103 #else
8104  float len2 = dot (*this, *this);
8105  return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero();
8106 #endif
8107 }
8108 
8109 
8111 {
8112 #if OIIO_SIMD
8113  vfloat3 len2 = vdot3 (*this, *this);
8114  vfloat4 invlen = blend0not (rsqrt_fast (len2), len2 == vfloat4::Zero());
8115  return vfloat3 ((*this) * invlen);
8116 #else
8117  float len2 = dot (*this, *this);
8118  return len2 > 0.0f ? (*this) / sqrtf(len2) : vfloat3::Zero();
8119 #endif
8120 }
8121 
8122 
8123 
8124 //////////////////////////////////////////////////////////////////////
8125 // matrix44 implementation
8126 
8127 
8129  return *(Imath::M44f*)this;
8130 }
8131 
8132 
8134 #if OIIO_SIMD_SSE
8135  return m_row[i];
8136 #else
8137  return vfloat4 (m_mat[i]);
8138 #endif
8139 }
8140 
8141 
8143  matrix44 T;
8144 #if OIIO_SIMD_SSE
8145  simd::transpose (m_row[0], m_row[1], m_row[2], m_row[3],
8146  T.m_row[0], T.m_row[1], T.m_row[2], T.m_row[3]);
8147 #else
8148  T.m_mat = m_mat.transposed();
8149 #endif
8150  return T;
8151 }
8152 
8154 #if OIIO_SIMD_SSE
8155  vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8156  shuffle<2>(V) * m_row[2] + m_row[3];
8157  R = R / shuffle<3>(R);
8158  return vfloat3 (R.xyz0());
8159 #else
8160  Imath::V3f R;
8161  m_mat.multVecMatrix (*(Imath::V3f *)&V, R);
8162  return vfloat3(R);
8163 #endif
8164 }
8165 
8167 #if OIIO_SIMD_SSE
8168  vfloat4 R = shuffle<0>(V) * m_row[0] + shuffle<1>(V) * m_row[1] +
8169  shuffle<2>(V) * m_row[2];
8170  return vfloat3 (R.xyz0());
8171 #else
8172  Imath::V3f R;
8173  m_mat.multDirMatrix (*(Imath::V3f *)&V, R);
8174  return vfloat3(R);
8175 #endif
8176 }
8177 
8179 #if OIIO_SIMD_SSE
8180  matrix44 T = transposed();
8181  vfloat4 R = shuffle<0>(V) * T[0] + shuffle<1>(V) * T[1] +
8182  shuffle<2>(V) * T[2];
8183  return vfloat3 (R.xyz0());
8184 #else
8185  Imath::V3f R;
8186  m_mat.transposed().multDirMatrix (*(Imath::V3f *)&V, R);
8187  return vfloat3(R);
8188 #endif
8189 }
8190 
8192 {
8193 #if OIIO_SIMD_SSE
8194  return shuffle<0>(V) * M[0] + shuffle<1>(V) * M[1] +
8195  shuffle<2>(V) * M[2] + shuffle<3>(V) * M[3];
8196 #else
8197  return vfloat4(V.V4f() * M.M44f());
8198 #endif
8199 }
8200 
8202 {
8203 #if OIIO_SIMD_SSE >= 3
8204  vfloat4 m0v = M[0] * V; // [ M00*Vx, M01*Vy, M02*Vz, M03*Vw ]
8205  vfloat4 m1v = M[1] * V; // [ M10*Vx, M11*Vy, M12*Vz, M13*Vw ]
8206  vfloat4 m2v = M[2] * V; // [ M20*Vx, M21*Vy, M22*Vz, M23*Vw ]
8207  vfloat4 m3v = M[3] * V; // [ M30*Vx, M31*Vy, M32*Vz, M33*Vw ]
8208  vfloat4 s01 = _mm_hadd_ps(m0v, m1v);
8209  // [ M00*Vx + M01*Vy, M02*Vz + M03*Vw, M10*Vx + M11*Vy, M12*Vz + M13*Vw ]
8210  vfloat4 s23 = _mm_hadd_ps(m2v, m3v);
8211  // [ M20*Vx + M21*Vy, M22*Vz + M23*Vw, M30*Vx + M31*Vy, M32*Vz + M33*Vw ]
8212  vfloat4 result = _mm_hadd_ps(s01, s23);
8213  // [ M00*Vx + M01*Vy + M02*Vz + M03*Vw,
8214  // M10*Vx + M11*Vy + M12*Vz + M13*Vw,
8215  // M20*Vx + M21*Vy + M22*Vz + M23*Vw,
8216  // M30*Vx + M31*Vy + M32*Vz + M33*Vw ]
8217  return result;
8218 #else
8219  return vfloat4(dot(M[0], V), dot(M[1], V), dot(M[2], V), dot(M[3], V));
8220 #endif
8221 }
8222 
8223 
8225 #if OIIO_SIMD_SSE
8226  vbool4 b0 = (m_row[0] == m[0]);
8227  vbool4 b1 = (m_row[1] == m[1]);
8228  vbool4 b2 = (m_row[2] == m[2]);
8229  vbool4 b3 = (m_row[3] == m[3]);
8230  return simd::all (b0 & b1 & b2 & b3);
8231 #else
8232  return memcmp(this, &m, 16*sizeof(float)) == 0;
8233 #endif
8234 }
8235 
8237  return memcmp(this, &m, 16*sizeof(float)) == 0;
8238 }
8239 
8241  return (b == a);
8242 }
8243 
8245 #if OIIO_SIMD_SSE
8246  vbool4 b0 = (m_row[0] != m[0]);
8247  vbool4 b1 = (m_row[1] != m[1]);
8248  vbool4 b2 = (m_row[2] != m[2]);
8249  vbool4 b3 = (m_row[3] != m[3]);
8250  return simd::any (b0 | b1 | b2 | b3);
8251 #else
8252  return memcmp(this, &m, 16*sizeof(float)) != 0;
8253 #endif
8254 }
8255 
8257  return memcmp(this, &m, 16*sizeof(float)) != 0;
8258 }
8259 
8261  return (b != a);
8262 }
8263 
8265 #if OIIO_SIMD_SSE
8266  // Adapted from this code from Intel:
8267  // ftp://download.intel.com/design/pentiumiii/sml/24504301.pdf
8268  vfloat4 minor0, minor1, minor2, minor3;
8269  vfloat4 row0, row1, row2, row3;
8270  vfloat4 det, tmp1;
8271  const float *src = (const float *)this;
8273  tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src)), (__m64*)(src+ 4)));
8274  row1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+8)), (__m64*)(src+12)));
8275  row0 = vfloat4(_mm_shuffle_ps(tmp1, row1, 0x88));
8276  row1 = vfloat4(_mm_shuffle_ps(row1, tmp1, 0xDD));
8277  tmp1 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6)));
8278  row3 = vfloat4(_mm_loadh_pi(_mm_loadl_pi(zero, (__m64*)(src+10)), (__m64*)(src+14)));
8279  row2 = vfloat4(_mm_shuffle_ps(tmp1, row3, 0x88));
8280  row3 = vfloat4(_mm_shuffle_ps(row3, tmp1, 0xDD));
8281  // -----------------------------------------------
8282  tmp1 = row2 * row3;
8283  tmp1 = shuffle<1,0,3,2>(tmp1);
8284  minor0 = row1 * tmp1;
8285  minor1 = row0 * tmp1;
8286  tmp1 = shuffle<2,3,0,1>(tmp1);
8287  minor0 = (row1 * tmp1) - minor0;
8288  minor1 = (row0 * tmp1) - minor1;
8289  minor1 = shuffle<2,3,0,1>(minor1);
8290  // -----------------------------------------------
8291  tmp1 = row1 * row2;
8292  tmp1 = shuffle<1,0,3,2>(tmp1);
8293  minor0 = (row3 * tmp1) + minor0;
8294  minor3 = row0 * tmp1;
8295  tmp1 = shuffle<2,3,0,1>(tmp1);
8296  minor0 = minor0 - (row3 * tmp1);
8297  minor3 = (row0 * tmp1) - minor3;
8298  minor3 = shuffle<2,3,0,1>(minor3);
8299  // -----------------------------------------------
8300  tmp1 = shuffle<2,3,0,1>(row1) * row3;
8301  tmp1 = shuffle<1,0,3,2>(tmp1);
8302  row2 = shuffle<2,3,0,1>(row2);
8303  minor0 = (row2 * tmp1) + minor0;
8304  minor2 = row0 * tmp1;
8305  tmp1 = shuffle<2,3,0,1>(tmp1);
8306  minor0 = minor0 - (row2 * tmp1);
8307  minor2 = (row0 * tmp1) - minor2;
8308  minor2 = shuffle<2,3,0,1>(minor2);
8309  // -----------------------------------------------
8310  tmp1 = row0 * row1;
8311  tmp1 = shuffle<1,0,3,2>(tmp1);
8312  minor2 = (row3 * tmp1) + minor2;
8313  minor3 = (row2 * tmp1) - minor3;
8314  tmp1 = shuffle<2,3,0,1>(tmp1);
8315  minor2 = (row3 * tmp1) - minor2;
8316  minor3 = minor3 - (row2 * tmp1);
8317  // -----------------------------------------------
8318  tmp1 = row0 * row3;
8319  tmp1 = shuffle<1,0,3,2>(tmp1);
8320  minor1 = minor1 - (row2 * tmp1);
8321  minor2 = (row1 * tmp1) + minor2;
8322  tmp1 = shuffle<2,3,0,1>(tmp1);
8323  minor1 = (row2 * tmp1) + minor1;
8324  minor2 = minor2 - (row1 * tmp1);
8325  // -----------------------------------------------
8326  tmp1 = row0 * row2;
8327  tmp1 = shuffle<1,0,3,2>(tmp1);
8328  minor1 = (row3 * tmp1) + minor1;
8329  minor3 = minor3 - (row1 * tmp1);
8330  tmp1 = shuffle<2,3,0,1>(tmp1);
8331  minor1 = minor1 - (row3 * tmp1);
8332  minor3 = (row1 * tmp1) + minor3;
8333  // -----------------------------------------------
8334  det = row0 * minor0;
8335  det = shuffle<2,3,0,1>(det) + det;
8336  det = vfloat4(_mm_add_ss(shuffle<1,0,3,2>(det), det));
8337  tmp1 = vfloat4(_mm_rcp_ss(det));
8338  det = vfloat4(_mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1))));
8339  det = shuffle<0>(det);
8340  return matrix44 (det*minor0, det*minor1, det*minor2, det*minor3);
8341 #else
8342  return matrix44 (m_mat.inverse());
8343 #endif
8344 }
8345 
8346 
8347 inline std::ostream& operator<< (std::ostream& cout, const matrix44 &M) {
8348  const float *m = (const float *)&M;
8349  cout << m[0];
8350  for (int i = 1; i < 16; ++i)
8351  cout << ' ' << m[i];
8352  return cout;
8353 }
8354 
8355 
8356 
8358  return M.transformp (V);
8359 }
8360 
8362 {
8363 #if OIIO_SIMD
8364  return matrix44(M).transformp (V);
8365 #else
8366  Imath::V3f R;
8367  M.multVecMatrix (*(const Imath::V3f *)&V, R);
8368  return vfloat3(R);
8369 #endif
8370 }
8371 
8372 
8374  return M.transformv (V);
8375 }
8376 
8378 {
8379 #if OIIO_SIMD
8380  return matrix44(M).transformv (V);
8381 #else
8382  Imath::V3f R;
8383  M.multDirMatrix (*(const Imath::V3f *)&V, R);
8384  return vfloat3(R);
8385 #endif
8386 }
8387 
8389 {
8390  return M.transformvT (V);
8391 }
8392 
8394 {
8395 #if OIIO_SIMD
8396  return matrix44(M).transformvT(V);
8397 #else
8398  return transformv (M.transposed(), V);
8399 #endif
8400 }
8401 
8402 
8403 
8404 //////////////////////////////////////////////////////////////////////
8405 // vfloat8 implementation
8406 
8409  return m_val[i];
8410 }
8411 
8414  return m_val[i];
8415 }
8416 
8417 
8418 inline std::ostream& operator<< (std::ostream& cout, const vfloat8& val) {
8419  cout << val[0];
8420  for (int i = 1; i < val.elements; ++i)
8421  cout << ' ' << val[i];
8422  return cout;
8423 }
8424 
8425 
8427 #if OIIO_SIMD_AVX
8428  return _mm256_castps256_ps128 (simd());
8429 #else
8430  return m_4[0];
8431 #endif
8432 }
8433 
8435 #if OIIO_SIMD_AVX
8436  return _mm256_extractf128_ps (simd(), 1);
8437 #else
8438  return m_4[1];
8439 #endif
8440 }
8441 
8442 
8444 #if OIIO_SIMD_AVX
8445  __m256 r = _mm256_castps128_ps256 (lo);
8446  m_simd = _mm256_insertf128_ps (r, hi, 1);
8447  // N.B. equivalent, if available: m_simd = _mm256_set_m128 (hi, lo);
8448  // FIXME: when would that not be available?
8449 #else
8450  m_4[0] = lo;
8451  m_4[1] = hi;
8452 #endif
8453 }
8454 
8455 
8457 #if OIIO_SIMD_AVX
8458  m_simd = _mm256_cvtepi32_ps (ival);
8459 #else
8460  SIMD_CONSTRUCT (float(ival[i]));
8461 #endif
8462 }
8463 
8464 
8466 #if OIIO_SIMD_AVX
8467  return _mm256_setzero_ps();
8468 #else
8469  return vfloat8(0.0f);
8470 #endif
8471 }
8472 
8474  return vfloat8(1.0f);
8475 }
8476 
8477 OIIO_FORCEINLINE const vfloat8 vfloat8::Iota (float start, float step) {
8478  return vfloat8 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
8479  start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step);
8480 }
8481 
8482 /// Set all components to 0.0
8484 #if OIIO_SIMD_AVX
8485  m_simd = _mm256_setzero_ps();
8486 #else
8487  load (0.0f);
8488 #endif
8489 }
8490 
8491 
8492 
8494 #if OIIO_SIMD_AVX
8495  m_simd = _mm256_set1_ps (val);
8496 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8497  m_4[0].load(val);
8498  m_4[1].load(val);
8499 #else
8500  SIMD_CONSTRUCT (val);
8501 #endif
8502 }
8503 
8504 OIIO_FORCEINLINE void vfloat8::load (float a, float b, float c, float d,
8505  float e, float f, float g, float h) {
8506 #if OIIO_SIMD_AVX
8507  m_simd = _mm256_set_ps (h, g, f, e, d, c, b, a);
8508 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8509  m_4[0].load(a, b, c, d);
8510  m_4[1].load(e, f, g, h);
8511 #else
8512  m_val[0] = a;
8513  m_val[1] = b;
8514  m_val[2] = c;
8515  m_val[3] = d;
8516  m_val[4] = e;
8517  m_val[5] = f;
8518  m_val[6] = g;
8519  m_val[7] = h;
8520 #endif
8521 }
8522 
8523 
8524 OIIO_FORCEINLINE void vfloat8::load (const float *values) {
8525 #if OIIO_SIMD_AVX
8526  m_simd = _mm256_loadu_ps (values);
8527 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8528  m_4[0].load(values);
8529  m_4[1].load(values+4);
8530 #else
8531  SIMD_CONSTRUCT (values[i]);
8532 #endif
8533 }
8534 
8535 
8536 OIIO_FORCEINLINE void vfloat8::load (const float *values, int n) {
8537  OIIO_DASSERT (n >= 0 && n <= elements);
8538 #if 0 && OIIO_AVX512VL_ENABLED
8539  // This SHOULD be fast, but in my benchmarks, it is slower!
8540  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
8541  // Re-test this periodically with new Intel hardware.
8542  m_simd = _mm256_maskz_loadu_ps ((~(0xff << n)), values);
8543 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8544  if (n > 4) {
8545  vfloat4 lo, hi;
8546  lo.load (values);
8547  hi.load (values+4, n-4);
8548  m_4[0] = lo;
8549  m_4[1] = hi;
8550  } else {
8551  vfloat4 lo, hi;
8552  lo.load (values, n);
8553  hi.clear();
8554  m_4[0] = lo;
8555  m_4[1] = hi;
8556  }
8557 #else
8558  for (int i = 0; i < n; ++i)
8559  m_val[i] = values[i];
8560  for (int i = n; i < paddedelements; ++i)
8561  m_val[i] = 0;
8562 #endif
8563 }
8564 
8565 
8566 OIIO_FORCEINLINE void vfloat8::load (const unsigned short *values) {
8567 #if OIIO_SIMD_AVX
8568  // Rely on the ushort->int conversion, then convert to float
8569  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8570 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8571  m_4[0].load(values);
8572  m_4[1].load(values+4);
8573 #else
8574  SIMD_CONSTRUCT (values[i]);
8575 #endif
8576 }
8577 
8578 
8579 OIIO_FORCEINLINE void vfloat8::load (const short *values) {
8580 #if OIIO_SIMD_AVX
8581  // Rely on the short->int conversion, then convert to float
8582  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8583 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8584  m_4[0].load(values);
8585  m_4[1].load(values+4);
8586 #else
8587  SIMD_CONSTRUCT (values[i]);
8588 #endif
8589 }
8590 
8591 
8592 OIIO_FORCEINLINE void vfloat8::load (const unsigned char *values) {
8593 #if OIIO_SIMD_AVX
8594  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8595 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8596  m_4[0].load(values);
8597  m_4[1].load(values+4);
8598 #else
8599  SIMD_CONSTRUCT (values[i]);
8600 #endif
8601 }
8602 
8603 
8604 OIIO_FORCEINLINE void vfloat8::load (const char *values) {
8605 #if OIIO_SIMD_AVX
8606  m_simd = _mm256_cvtepi32_ps (vint8(values).simd());
8607 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8608  m_4[0].load(values);
8609  m_4[1].load(values+4);
8610 #else
8611  SIMD_CONSTRUCT (values[i]);
8612 #endif
8613 }
8614 
#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Load 8 half-precision floats, converting to float32.
OIIO_FORCEINLINE void vfloat8::load (const half *values) {
#if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
    /* Enabled 16 bit float instructions! */
    vint4 a ((const int *)values);   // 8 halfs fit in 4 ints
    m_simd = _mm256_cvtph_ps (a);
#elif OIIO_SIMD_SSE >= 2
    m_4[0] = vfloat4(values);
    m_4[1] = vfloat4(values+4);
#else /* No SIMD defined: */
    SIMD_CONSTRUCT (values[i]);
#endif
}
#endif /* _HALF_H_ or _IMATH_H_ */
8629 
8630 
8631 OIIO_FORCEINLINE void vfloat8::store (float *values) const {
8632 #if OIIO_SIMD_AVX
8633  // Use an unaligned store -- it's just as fast when the memory turns
8634  // out to be aligned, nearly as fast even when unaligned. Not worth
8635  // the headache of using stores that require alignment.
8636  _mm256_storeu_ps (values, m_simd);
8637 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8638  m_4[0].store(values);
8639  m_4[1].store(values+4);
8640 #else
8641  SIMD_DO (values[i] = m_val[i]);
8642 #endif
8643 }
8644 
8645 
8646 OIIO_FORCEINLINE void vfloat8::store (float *values, int n) const {
8647  OIIO_DASSERT (n >= 0 && n <= elements);
8648 #if 0 && OIIO_AVX512VL_ENABLED
8649  // This SHOULD be fast, but in my benchmarks, it is slower!
8650  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
8651  // Re-test this periodically with new Intel hardware.
8652  _mm256_mask_storeu_ps (values, __mmask8(~(0xff << n)), m_simd);
8653 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8654  if (n <= 4) {
8655  lo().store (values, n);
8656  } else if (n <= 8) {
8657  lo().store (values);
8658  hi().store (values+4, n-4);
8659  }
8660 #else
8661  for (int i = 0; i < n; ++i)
8662  values[i] = m_val[i];
8663 #endif
8664 }
8665 
#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Store 8 floats as half-precision, rounding to nearest.
OIIO_FORCEINLINE void vfloat8::store (half *values) const {
#if OIIO_SIMD_AVX && OIIO_F16C_ENABLED
    __m128i h = _mm256_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
    _mm_storeu_si128 ((__m128i *)values, h);
#elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
    m_4[0].store(values);
    m_4[1].store(values+4);
#else
    SIMD_DO (values[i] = m_val[i]);
#endif
}
#endif
8679 
8680 
8681 OIIO_FORCEINLINE void vfloat8::load_mask (int mask, const float *values) {
8682 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8683  m_simd = _mm256_maskz_loadu_ps (__mmask8(mask), (const simd_t *)values);
8684 #elif OIIO_SIMD_AVX
8685  m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)));
8686 #else
8687  SIMD_CONSTRUCT ((mask>>i) & 1 ? values[i] : 0.0f);
8688 #endif
8689 }
8690 
8691 
8692 OIIO_FORCEINLINE void vfloat8::load_mask (const vbool8& mask, const float *values) {
8693 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8694  m_simd = _mm256_maskz_loadu_ps (__mmask8(mask.bitmask()), (const simd_t *)values);
8695 #elif OIIO_SIMD_AVX
8696  m_simd = _mm256_maskload_ps (values, _mm256_castps_si256(mask));
8697 #else
8698  SIMD_CONSTRUCT (mask[i] ? values[i] : 0.0f);
8699 #endif
8700 }
8701 
8702 
8703 OIIO_FORCEINLINE void vfloat8::store_mask (int mask, float *values) const {
8704 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8705  _mm256_mask_storeu_ps (values, __mmask8(mask), m_simd);
8706 #elif OIIO_SIMD_AVX
8707  _mm256_maskstore_ps (values, _mm256_castps_si256(vbool8::from_bitmask(mask)), m_simd);
8708 #else
8709  SIMD_DO (if ((mask>>i) & 1) values[i] = (*this)[i]);
8710 #endif
8711 }
8712 
8713 
8714 OIIO_FORCEINLINE void vfloat8::store_mask (const vbool8& mask, float *values) const {
8715 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8716  _mm256_mask_storeu_ps (values, __mmask8(mask.bitmask()), m_simd);
8717 #elif OIIO_SIMD_AVX
8718  _mm256_maskstore_ps (values, _mm256_castps_si256(mask.simd()), m_simd);
8719 #else
8720  SIMD_DO (if (mask[i]) values[i] = (*this)[i]);
8721 #endif
8722 }
8723 
8724 
8725 template <int scale>
8726 OIIO_FORCEINLINE void
8727 vfloat8::gather (const value_t *baseptr, const vint_t& vindex)
8728 {
8729 #if OIIO_SIMD_AVX >= 2
8730  m_simd = _mm256_i32gather_ps (baseptr, vindex, scale);
8731 #else
8732  SIMD_CONSTRUCT (*(const value_t *)((const char *)baseptr + vindex[i]*scale));
8733 #endif
8734 }
8735 
8736 template<int scale>
8737 OIIO_FORCEINLINE void
8738 vfloat8::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
8739 {
8740 #if OIIO_SIMD_AVX >= 2
8741  m_simd = _mm256_mask_i32gather_ps (m_simd, baseptr, vindex, mask, scale);
8742 #else
8743  SIMD_CONSTRUCT (mask[i] ? *(const value_t *)((const char *)baseptr + vindex[i]*scale) : 0);
8744 #endif
8745 }
8746 
8747 template<int scale>
8748 OIIO_FORCEINLINE void
8749 vfloat8::scatter (value_t *baseptr, const vint_t& vindex) const
8750 {
8751 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8752  _mm256_i32scatter_ps (baseptr, vindex, m_simd, scale);
8753 #else
8754  SIMD_DO (*(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
8755 #endif
8756 }
8757 
8758 template<int scale>
8759 OIIO_FORCEINLINE void
8761  const vint_t& vindex) const
8762 {
8763 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
8764  _mm256_mask_i32scatter_ps (baseptr, mask.bitmask(), vindex, m_simd, scale);
8765 #else
8766  SIMD_DO (if (mask[i]) *(value_t *)((char *)baseptr + vindex[i]*scale) = m_val[i]);
8767 #endif
8768 }
8769 
8770 
8771 
8773 #if OIIO_SIMD_AVX
8774  return _mm256_add_ps (a, b);
8775 #else
8776  return vfloat8 (a.lo()+b.lo(), a.hi()+b.hi());
8777 #endif
8778 }
8779 
8781  return a = a + b;
8782 }
8783 
8785 #if OIIO_SIMD_AVX
8786  return _mm256_sub_ps (_mm256_setzero_ps(), a);
8787 #else
8788  return vfloat8 (-a.lo(), -a.hi());
8789 #endif
8790 }
8791 
8793 #if OIIO_SIMD_AVX
8794  return _mm256_sub_ps (a, b);
8795 #else
8796  return vfloat8 (a.lo()-b.lo(), a.hi()-b.hi());
8797 #endif
8798 }
8799 
8801  return a = a - b;
8802 }
8803 
8805 #if OIIO_SIMD_AVX
8806  return _mm256_mul_ps (a.m_simd, _mm256_set1_ps(b));
8807 #else
8808  return vfloat8 (a.lo()*b, a.hi()*b);
8809 #endif
8810 }
8811 
8813  return b * a;
8814 }
8815 
8817 #if OIIO_SIMD_AVX
8818  return _mm256_mul_ps (a, b);
8819 #else
8820  return vfloat8 (a.lo()*b.lo(), a.hi()*b.hi());
8821 #endif
8822 }
8823 
8825  return a = a * b;
8826 }
8827 
8829 #if OIIO_SIMD_AVX
8830  return _mm256_div_ps (a, b);
8831 #else
8832  return vfloat8 (a.lo()/b.lo(), a.hi()/b.hi());
8833 #endif
8834 }
8835 
8837  return a = a / b;
8838 }
8839 
8841 #if OIIO_SIMD_AVX
8842  return _mm256_cmp_ps (a, b, _CMP_EQ_OQ);
8843 #else
8844  return vbool8 (a.lo() == b.lo(), a.hi() == b.hi());
8845 #endif
8846 }
8847 
8849 #if OIIO_SIMD_AVX
8850  return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ);
8851 #else
8852  return vbool8 (a.lo() != b.lo(), a.hi() != b.hi());
8853 #endif
8854 }
8855 
8857 #if OIIO_SIMD_AVX
8858  return _mm256_cmp_ps (a, b, _CMP_LT_OQ);
8859 #else
8860  return vbool8 (a.lo() < b.lo(), a.hi() < b.hi());
8861 #endif
8862 }
8863 
8865 #if OIIO_SIMD_AVX
8866  return _mm256_cmp_ps (a, b, _CMP_GT_OQ);
8867 #else
8868  return vbool8 (a.lo() > b.lo(), a.hi() > b.hi());
8869 #endif
8870 }
8871 
8873 #if OIIO_SIMD_AVX
8874  return _mm256_cmp_ps (a, b, _CMP_GE_OQ);
8875 #else
8876  return vbool8 (a.lo() >= b.lo(), a.hi() >= b.hi());
8877 #endif
8878 }
8879 
8881 #if OIIO_SIMD_AVX
8882  return _mm256_cmp_ps (a, b, _CMP_LE_OQ);
8883 #else
8884  return vbool8 (a.lo() <= b.lo(), a.hi() <= b.hi());
8885 #endif
8886 }
8887 
8888 
8889 // Implementation had to be after the definition of vfloat8.
8891 {
8892 #if OIIO_SIMD_AVX
8893  m_simd = _mm256_cvttps_epi32(f);
8894 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
8895  *this = vint8 (vint4(f.lo()), vint4(f.hi()));
8896 #else
8897  SIMD_CONSTRUCT ((int) f[i]);
8898 #endif
8899 }
8900 
8901 
8902 template<int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
8904 #if OIIO_SIMD_AVX >= 2
8905  vint8 index (i0, i1, i2, i3, i4, i5, i6, i7);
8906  return _mm256_permutevar8x32_ps (a, index);
8907 #else
8908  return vfloat8 (a[i0], a[i1], a[i2], a[i3], a[i4], a[i5], a[i6], a[i7]);
8909 #endif
8910 }
8911 
8912 template<int i> OIIO_FORCEINLINE vfloat8 shuffle (const vfloat8& a) {
8913 #if OIIO_SIMD_AVX >= 2
8914  return _mm256_permutevar8x32_ps (a, vint8(i));
8915 #else
8916  return shuffle<i,i,i,i,i,i,i,i>(a);
8917 #endif
8918 }
8919 
8920 
8921 template<int i>
8923 #if OIIO_SIMD_AVX_NO_FIXME
8924  // Looks like the fastest we can do it is to extract a vfloat4,
8925  // shuffle its one element everywhere, then extract element 0.
8926  _m128 f4 = _mm256_extractf128_ps (i >> 2);
8927  int j = i & 3;
8928  return _mm_cvtss_f32(shuffle_sse<j,j,j,j>(a.simd()));
8929 #else
8930  return v[i];
8931 #endif
8932 }
8933 
8934 
8935 template<int i>
8937 #if OIIO_SIMD_AVX_NO_FIXME
8938  return _mm256_insert_epi32 (a, val, i);
8939 #else
8940  vfloat8 tmp = a;
8941  tmp[i] = val;
8942  return tmp;
8943 #endif
8944 }
8945 
8946 
8947 OIIO_FORCEINLINE float vfloat8::x () const { return extract<0>(*this); }
8948 OIIO_FORCEINLINE float vfloat8::y () const { return extract<1>(*this); }
8949 OIIO_FORCEINLINE float vfloat8::z () const { return extract<2>(*this); }
8950 OIIO_FORCEINLINE float vfloat8::w () const { return extract<3>(*this); }
8951 OIIO_FORCEINLINE void vfloat8::set_x (float val) { *this = insert<0>(*this, val); }
8952 OIIO_FORCEINLINE void vfloat8::set_y (float val) { *this = insert<1>(*this, val); }
8953 OIIO_FORCEINLINE void vfloat8::set_z (float val) { *this = insert<2>(*this, val); }
8954 OIIO_FORCEINLINE void vfloat8::set_w (float val) { *this = insert<3>(*this, val); }
8955 
8956 
8958 {
8959 #if OIIO_SIMD_AVX
8960  return _mm256_castps_si256 (x.simd());
8961 #else
8962  return *(vint8 *)&x;
8963 #endif
8964 }
8965 
8967 {
8968 #if OIIO_SIMD_AVX
8969  return _mm256_castsi256_ps (x.simd());
8970 #else
8971  return *(vfloat8 *)&x;
8972 #endif
8973 }
8974 
8975 
8977 #if OIIO_SIMD_AVX
8978  // From Syrah:
8979  vfloat8 ab_cd_0_0_ef_gh_0_0 = _mm256_hadd_ps(v.simd(), _mm256_setzero_ps());
8980  vfloat8 abcd_0_0_0_efgh_0_0_0 = _mm256_hadd_ps(ab_cd_0_0_ef_gh_0_0, _mm256_setzero_ps());
8981  // get efgh in the 0-idx slot
8982  vfloat8 efgh = shuffle<4>(abcd_0_0_0_efgh_0_0_0);
8983  vfloat8 final_sum = abcd_0_0_0_efgh_0_0_0 + efgh;
8984  return shuffle<0>(final_sum);
8985 #else
8986  vfloat4 hadd4 = vreduce_add(v.lo()) + vreduce_add(v.hi());
8987  return vfloat8(hadd4, hadd4);
8988 #endif
8989 }
8990 
8991 
8993 #if OIIO_SIMD_AVX >= 2
8994  return extract<0>(vreduce_add(v));
8995 #else
8996  return reduce_add(v.lo()) + reduce_add(v.hi());
8997 #endif
8998 }
8999 
9000 
9002 {
9003 #if OIIO_SIMD_AVX
9004  return _mm256_blendv_ps (a, b, mask);
9005 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9006  return vfloat8 (blend (a.lo(), b.lo(), mask.lo()),
9007  blend (a.hi(), b.hi(), mask.hi()));
9008 #else
9009  SIMD_RETURN (vfloat8, mask[i] ? b[i] : a[i]);
9010 #endif
9011 }
9012 
9013 
9015 {
9016 #if OIIO_SIMD_AVX
9017  return _mm256_and_ps(mask, a);
9018 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9019  return vfloat8 (blend0 (a.lo(), mask.lo()),
9020  blend0 (a.hi(), mask.hi()));
9021 #else
9022  SIMD_RETURN (vfloat8, mask[i] ? a[i] : 0.0f);
9023 #endif
9024 }
9025 
9026 
9028 {
9029 #if OIIO_SIMD_AVX
9030  return _mm256_andnot_ps(mask, a);
9031 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9032  return vfloat8 (blend0not (a.lo(), mask.lo()),
9033  blend0not (a.hi(), mask.hi()));
9034 #else
9035  SIMD_RETURN (vfloat8, mask[i] ? 0.0f : a[i]);
9036 #endif
9037 }
9038 
9039 
9041 {
9042  return blend (b, a, mask);
9043 }
9044 
9045 
9047 #if OIIO_SIMD_SSE
9048  return blend0not (a/b, b == vfloat8::Zero());
9049 #else
9050  SIMD_RETURN (vfloat8, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
9051 #endif
9052 }
9053 
9054 
9056 {
9057 #if OIIO_SIMD_AVX
9058  // Just clear the sign bit for cheap fabsf
9059  return _mm256_and_ps (a.simd(), _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
9060 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9061  return vfloat8(abs(a.lo()), abs(a.hi()));
9062 #else
9063  SIMD_RETURN (vfloat8, fabsf(a[i]));
9064 #endif
9065 }
9066 
9067 
9069 {
9070  vfloat8 one(1.0f);
9071  return blend (one, -one, a < vfloat8::Zero());
9072 }
9073 
9074 
9076 {
9077 #if OIIO_SIMD_AVX
9078  return _mm256_ceil_ps (a);
9079 #else
9080  SIMD_RETURN (vfloat8, ceilf(a[i]));
9081 #endif
9082 }
9083 
9085 {
9086 #if OIIO_SIMD_AVX
9087  return _mm256_floor_ps (a);
9088 #else
9089  SIMD_RETURN (vfloat8, floorf(a[i]));
9090 #endif
9091 }
9092 
9094 {
9095 #if OIIO_SIMD_AVX
9096  return _mm256_round_ps (a, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
9097 #else
9098  SIMD_RETURN (vfloat8, roundf(a[i]));
9099 #endif
9100 }
9101 
9103 {
9104  // FIXME: look into this, versus the method of quick_floor in texturesys.cpp
9105 #if OIIO_SIMD_AVX
9106  return vint8(floor(a));
9107 #elif OIIO_SIMD_SSE /* SSE2/3 */
9108  return vint8 (ifloor(a.lo()), ifloor(a.hi()));
9109 #else
9110  SIMD_RETURN (vint8, (int)floorf(a[i]));
9111 #endif
9112 }
9113 
9114 
9116 {
9117  return vint8 (round(a));
9118 }
9119 
9120 
9121 
9123 {
9124 #if OIIO_SIMD_AVX512 && OIIO_AVX512VL_ENABLED
9125  vfloat8 r = _mm256_rcp14_ps(a);
9126  return r * nmadd(r,a,vfloat8(2.0f));
9127 #elif OIIO_SIMD_AVX
9128  vfloat8 r = _mm256_rcp_ps(a);
9129  return r * nmadd(r,a,vfloat8(2.0f));
9130 #else
9131  return vfloat8(rcp_fast(a.lo()), rcp_fast(a.hi()));
9132 #endif
9133 }
9134 
9135 
9137 {
9138 #if OIIO_SIMD_AVX
9139  return _mm256_sqrt_ps (a.simd());
9140 #else
9141  SIMD_RETURN (vfloat8, sqrtf(a[i]));
9142 #endif
9143 }
9144 
9145 
9146 
9148 {
9149 #if OIIO_SIMD_AVX
9150  return _mm256_div_ps (_mm256_set1_ps(1.0f), _mm256_sqrt_ps (a.simd()));
9151 #else
9152  SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
9153 #endif
9154 }
9155 
9156 
9157 
9159 {
9160 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9161  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
9162  return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
9163 #elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED
9164  // Trickery: in and out of the 512 bit registers to use fast approx rsqrt
9165  return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
9166 #elif OIIO_SIMD_AVX
9167  return _mm256_rsqrt_ps (a.simd());
9168 #elif OIIO_SIMD_SSE
9169  return vfloat8 (rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
9170 #else
9171  SIMD_RETURN (vfloat8, 1.0f/sqrtf(a[i]));
9172 #endif
9173 }
9174 
9175 
9176 
9178 {
9179 #if OIIO_SIMD_AVX
9180  return _mm256_min_ps (a, b);
9181 #else
9182  return vfloat8 (min(a.lo(), b.lo()), min(a.hi(), b.hi()));
9183 #endif
9184 }
9185 
9187 {
9188 #if OIIO_SIMD_AVX
9189  return _mm256_max_ps (a, b);
9190 #else
9191  return vfloat8 (max(a.lo(), b.lo()), max(a.hi(), b.hi()));
9192 #endif
9193 }
9194 
9195 
9197 #if OIIO_SIMD_AVX
9198  return _mm256_andnot_ps (a.simd(), b.simd());
9199 #else
9200  const int *ai = (const int *)&a;
9201  const int *bi = (const int *)&b;
9202  return bitcast_to_float (vint8(~(ai[0]) & bi[0],
9203  ~(ai[1]) & bi[1],
9204  ~(ai[2]) & bi[2],
9205  ~(ai[3]) & bi[3],
9206  ~(ai[4]) & bi[4],
9207  ~(ai[5]) & bi[5],
9208  ~(ai[6]) & bi[6],
9209  ~(ai[7]) & bi[7]));
9210 #endif
9211 }
9212 
9213 
9215  const simd::vfloat8& c)
9216 {
9217 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9218  // If we are sure _mm256_fmadd_ps intrinsic is available, use it.
9219  return _mm256_fmadd_ps (a, b, c);
9220 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9221  return vfloat8 (madd(a.lo(), b.lo(), c.lo()),
9222  madd(a.hi(), b.hi(), c.hi()));
9223 #else
9224  // Fallback: just use regular math and hope for the best.
9225  return a * b + c;
9226 #endif
9227 }
9228 
9229 
9231  const simd::vfloat8& c)
9232 {
9233 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9234  // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
9235  return _mm256_fmsub_ps (a, b, c);
9236 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9237  return vfloat8 (msub(a.lo(), b.lo(), c.lo()),
9238  msub(a.hi(), b.hi(), c.hi()));
9239 #else
9240  // Fallback: just use regular math and hope for the best.
9241  return a * b - c;
9242 #endif
9243 }
9244 
9245 
9246 
9248  const simd::vfloat8& c)
9249 {
9250 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9251  // If we are sure _mm256_fnmadd_ps intrinsic is available, use it.
9252  return _mm256_fnmadd_ps (a, b, c);
9253 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9254  return vfloat8 (nmadd(a.lo(), b.lo(), c.lo()),
9255  nmadd(a.hi(), b.hi(), c.hi()));
9256 #else
9257  // Fallback: just use regular math and hope for the best.
9258  return c - a * b;
9259 #endif
9260 }
9261 
9262 
9263 
9265  const simd::vfloat8& c)
9266 {
9267 #if OIIO_SIMD_AVX && OIIO_FMA_ENABLED
9268  // If we are sure _mm256_fnmsub_ps intrinsic is available, use it.
9269  return _mm256_fnmsub_ps (a, b, c);
9270 #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON
9271  return vfloat8 (nmsub(a.lo(), b.lo(), c.lo()),
9272  nmsub(a.hi(), b.hi(), c.hi()));
9273 #else
9274  // Fallback: just use regular math and hope for the best.
9275  return -(a * b) - c;
9276 #endif
9277 }
9278 
9279 
9280 
9281 
9282 //////////////////////////////////////////////////////////////////////
9283 // vfloat16 implementation
9284 
9287  return m_val[i];
9288 }
9289 
9292  return m_val[i];
9293 }
9294 
9295 
9296 inline std::ostream& operator<< (std::ostream& cout, const vfloat16& val) {
9297  cout << val[0];
9298  for (int i = 1; i < val.elements; ++i)
9299  cout << ' ' << val[i];
9300  return cout;
9301 }
9302 
9303 
9305 #if OIIO_SIMD_AVX >= 512
9306  return _mm512_castps512_ps256 (simd());
9307 #else
9308  return m_8[0];
9309 #endif
9310 }
9311 
9313 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512DQ_ENABLED
9314  return _mm512_extractf32x8_ps (simd(), 1);
9315 #else
9316  return m_8[1];
9317 #endif
9318 }
9319 
9320 
9321 OIIO_FORCEINLINE vfloat16::vfloat16 (float v0, float v1, float v2, float v3,
9322  float v4, float v5, float v6, float v7,
9323  float v8, float v9, float v10, float v11,
9324  float v12, float v13, float v14, float v15) {
9325  load (v0, v1, v2, v3, v4, v5, v6, v7,
9326  v8, v9, v10, v11, v12, v13, v14, v15);
9327 }
9328 
9330 #if OIIO_SIMD_AVX >= 512
9331  __m512 r = _mm512_castps256_ps512 (lo);
9332  m_simd = _mm512_insertf32x8 (r, hi, 1);
9333 #else
9334  m_8[0] = lo;
9335  m_8[1] = hi;
9336 #endif
9337 }
9338 
9339 OIIO_FORCEINLINE vfloat16::vfloat16 (const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d) {
9340 #if OIIO_SIMD_AVX >= 512
9341  m_simd = _mm512_broadcast_f32x4(a);
9342  m_simd = _mm512_insertf32x4 (m_simd, b, 1);
9343  m_simd = _mm512_insertf32x4 (m_simd, c, 2);
9344  m_simd = _mm512_insertf32x4 (m_simd, d, 3);
9345 #else
9346  m_8[0] = vfloat8(a,b);
9347  m_8[1] = vfloat8(c,d);
9348 #endif
9349 }
9350 
9351 
9353 #if OIIO_SIMD_AVX >= 512
9354  m_simd = _mm512_cvtepi32_ps (ival);
9355 #else
9356  SIMD_CONSTRUCT (float(ival[i]));
9357 #endif
9358 }
9359 
9360 
9362 #if OIIO_SIMD_AVX >= 512
9363  return _mm512_setzero_ps();
9364 #else
9365  return vfloat16(0.0f);
9366 #endif
9367 }
9368 
9370  return vfloat16(1.0f);
9371 }
9372 
9373 OIIO_FORCEINLINE const vfloat16 vfloat16::Iota (float start, float step) {
9374  return vfloat16 (start+0.0f*step, start+1.0f*step, start+2.0f*step, start+3.0f*step,
9375  start+4.0f*step, start+5.0f*step, start+6.0f*step, start+7.0f*step,
9376  start+8.0f*step, start+9.0f*step, start+10.0f*step, start+11.0f*step,
9377  start+12.0f*step, start+13.0f*step, start+14.0f*step, start+15.0f*step);
9378 }
9379 
9380 /// Set all components to 0.0
9382 #if OIIO_SIMD_AVX >= 512
9383  m_simd = _mm512_setzero_ps();
9384 #else
9385  load (0.0f);
9386 #endif
9387 }
9388 
9389 
9391 #if OIIO_SIMD_AVX >= 512
9392  m_simd = _mm512_set1_ps (a);
9393 #else
9394  m_8[0].load (a);
9395  m_8[1].load (a);
9396 #endif
9397 }
9398 
9399 
9400 OIIO_FORCEINLINE void vfloat16::load (float v0, float v1, float v2, float v3,
9401  float v4, float v5, float v6, float v7,
9402  float v8, float v9, float v10, float v11,
9403  float v12, float v13, float v14, float v15) {
9404 #if OIIO_SIMD_AVX >= 512
9405  m_simd = _mm512_setr_ps (v0, v1, v2, v3, v4, v5, v6, v7,
9406  v8, v9, v10, v11, v12, v13, v14, v15);
9407 #else
9408  m_val[ 0] = v0;
9409  m_val[ 1] = v1;
9410  m_val[ 2] = v2;
9411  m_val[ 3] = v3;
9412  m_val[ 4] = v4;
9413  m_val[ 5] = v5;
9414  m_val[ 6] = v6;
9415  m_val[ 7] = v7;
9416  m_val[ 8] = v8;
9417  m_val[ 9] = v9;
9418  m_val[10] = v10;
9419  m_val[11] = v11;
9420  m_val[12] = v12;
9421  m_val[13] = v13;
9422  m_val[14] = v14;
9423  m_val[15] = v15;
9424 #endif
9425 }
9426 
9427 
9428 OIIO_FORCEINLINE void vfloat16::load (const float *values) {
9429 #if OIIO_SIMD_AVX >= 512
9430  m_simd = _mm512_loadu_ps (values);
9431 #else
9432  m_8[0].load (values);
9433  m_8[1].load (values+8);
9434 #endif
9435 }
9436 
9437 
9438 OIIO_FORCEINLINE void vfloat16::load (const float *values, int n)
9439 {
9440  OIIO_DASSERT (n >= 0 && n <= elements);
9441 #if OIIO_SIMD_AVX >= 512
9442  m_simd = _mm512_maskz_loadu_ps (__mmask16(~(0xffff << n)), values);
9443 #else
9444  if (n > 8) {
9445  m_8[0].load (values);
9446  m_8[1].load (values+8, n-8);
9447  } else {
9448  m_8[0].load (values, n);
9449  m_8[1].clear ();
9450  }
9451 #endif
9452 }
9453 
9454 
9455 OIIO_FORCEINLINE void vfloat16::load (const unsigned short *values) {
9456 #if OIIO_SIMD_AVX >= 512
9457  // Rely on the ushort->int conversion, then convert to float
9458  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9459 #else
9460  m_8[0].load (values);
9461  m_8[1].load (values+8);
9462 #endif
9463 }
9464 
9465 
9466 OIIO_FORCEINLINE void vfloat16::load (const short *values) {
9467 #if OIIO_SIMD_AVX >= 512
9468  // Rely on the short->int conversion, then convert to float
9469  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9470 #else
9471  m_8[0].load (values);
9472  m_8[1].load (values+8);
9473 #endif
9474 }
9475 
9476 
9477 OIIO_FORCEINLINE void vfloat16::load (const unsigned char *values) {
9478 #if OIIO_SIMD_AVX >= 512
9479  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9480 #else
9481  m_8[0].load (values);
9482  m_8[1].load (values+8);
9483 #endif
9484 }
9485 
9486 
9487 OIIO_FORCEINLINE void vfloat16::load (const char *values) {
9488 #if OIIO_SIMD_AVX >= 512
9489  m_simd = _mm512_cvtepi32_ps (vint16(values).simd());
9490 #else
9491  m_8[0].load (values);
9492  m_8[1].load (values+8);
9493 #endif
9494 }
9495 
9496 
#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Load 16 half-precision values, converting each to float.
/// Only compiled when the Imath half type is available.
OIIO_FORCEINLINE void vfloat16::load (const half *values) {
#if OIIO_SIMD_AVX >= 512
    // Reinterpret the 16 halfs as raw bits in a vint8, then use the
    // hardware half->float conversion.
    vint8 bits ((const int *)values);
    m_simd = _mm512_cvtph_ps (bits);
#else
    m_8[1].load (values+8);
    m_8[0].load (values);
#endif
}
#endif /* _HALF_H_ or _IMATH_H_ */
9509 
9510 
9511 
9512 OIIO_FORCEINLINE void vfloat16::store (float *values) const {
9513 #if OIIO_SIMD_AVX >= 512
9514  // Use an unaligned store -- it's just as fast when the memory turns
9515  // out to be aligned, nearly as fast even when unaligned. Not worth
9516  // the headache of using stores that require alignment.
9517  _mm512_storeu_ps (values, m_simd);
9518 #else
9519  m_8[0].store (values);
9520  m_8[1].store (values+8);
9521 #endif
9522 }
9523 
9524 
9525 OIIO_FORCEINLINE void vfloat16::store (float *values, int n) const {
9526  OIIO_DASSERT (n >= 0 && n <= elements);
9527  // FIXME: is this faster with AVX masked stores?
9528 #if 0 && OIIO_SIMD_AVX >= 512
9529  // This SHOULD be fast, but in my benchmarks, it is slower!
9530  // (At least on the AVX512 hardware I have, Xeon Silver 4110.)
9531  // Re-test this periodically with new Intel hardware.
9532  _mm512_mask_storeu_ps (values, __mmask16(~(0xffff << n)), m_simd);
9533 #else
9534  if (n <= 8) {
9535  lo().store (values, n);
9536  } else if (n < 16) {
9537  lo().store (values);
9538  hi().store (values+8, n-8);
9539  } else {
9540  store (values);
9541  }
9542 #endif
9543 }
9544 
#if defined(_HALF_H_) || defined(IMATH_HALF_H_)
/// Store all 16 lanes as half-precision values.
/// Only compiled when the Imath half type is available.
OIIO_FORCEINLINE void vfloat16::store (half *values) const {
#if OIIO_SIMD_AVX >= 512
    // Convert float->half (round to nearest), then store the 256 bits.
    __m256i h = _mm512_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
    _mm256_storeu_si256 ((__m256i *)values, h);
#else
    // Disjoint destination ranges; order is irrelevant.
    m_8[1].store (values+8);
    m_8[0].store (values);
#endif
}
#endif
9556 
9557 
9558 OIIO_FORCEINLINE void vfloat16::load_mask (const vbool16 &mask, const float *values) {
9559 #if OIIO_SIMD_AVX >= 512
9560  m_simd = _mm512_maskz_loadu_ps (mask, (const simd_t *)values);
9561 #else
9562  m_8[0].load_mask (mask.lo(), values);
9563  m_8[1].load_mask (mask.hi(), values+8);
9564 #endif
9565 }
9566 
9567 
9568 OIIO_FORCEINLINE void vfloat16::store_mask (const vbool16 &mask, float *values) const {
9569 #if OIIO_SIMD_AVX >= 512
9570  _mm512_mask_storeu_ps (values, mask.bitmask(), m_simd);
9571 #else
9572  lo().store_mask (mask.lo(), values);
9573  hi().store_mask (mask.hi(), values+8);
9574 #endif
9575 }
9576 
9577 
9578 
9579 template <int scale>
9580 OIIO_FORCEINLINE void
9581 vfloat16::gather (const value_t *baseptr, const vint_t& vindex)
9582 {
9583 #if OIIO_SIMD_AVX >= 512
9584  m_simd = _mm512_i32gather_ps (vindex, baseptr, scale);
9585 #else
9586  m_8[0].gather<scale> (baseptr, vindex.lo());
9587  m_8[1].gather<scale> (baseptr, vindex.hi());
9588 #endif
9589 }
9590 
9591 template<int scale>
9592 OIIO_FORCEINLINE void
9593 vfloat16::gather_mask (const vbool_t& mask, const value_t *baseptr, const vint_t& vindex)
9594 {
9595 #if OIIO_SIMD_AVX >= 512
9596  m_simd = _mm512_mask_i32gather_ps (m_simd, mask, vindex, baseptr, scale);
9597 #else
9598  m_8[0].gather_mask<scale> (mask.lo(), baseptr, vindex.lo());
9599  m_8[1].gather_mask<scale> (mask.hi(), baseptr, vindex.hi());
9600 #endif
9601 }
9602 
9603 template<int scale>
9604 OIIO_FORCEINLINE void
9605 vfloat16::scatter (value_t *baseptr, const vint_t& vindex) const
9606 {
9607 #if OIIO_SIMD_AVX >= 512
9608  _mm512_i32scatter_ps (baseptr, vindex, m_simd, scale);
9609 #else
9610  lo().scatter<scale> (baseptr, vindex.lo());
9611  hi().scatter<scale> (baseptr, vindex.hi());
9612 #endif
9613 }
9614 
9615 template<int scale>
9616 OIIO_FORCEINLINE void
9618  const vint_t& vindex) const
9619 {
9620 #if OIIO_SIMD_AVX >= 512
9621  _mm512_mask_i32scatter_ps (baseptr, mask, vindex, m_simd, scale);
9622 #else
9623  lo().scatter_mask<scale> (mask.lo(), baseptr, vindex.lo());
9624  hi().scatter_mask<scale> (mask.hi(), baseptr, vindex.hi());
9625 #endif
9626 }
9627 
9628 
9629 
9631 #if OIIO_SIMD_AVX >= 512
9632  return _mm512_add_ps (a.m_simd, b.m_simd);
9633 #else
9634  return vfloat16 (a.lo()+b.lo(), a.hi()+b.hi());
9635 #endif
9636 }
9637 
9639  return a = a + b;
9640 }
9641 
9643 #if OIIO_SIMD_AVX >= 512
9644  return _mm512_sub_ps (_mm512_setzero_ps(), a.simd());
9645 #else
9646  return vfloat16 (-a.lo(), -a.hi());
9647 #endif
9648 }
9649 
9651 #if OIIO_SIMD_AVX >= 512
9652  return _mm512_sub_ps (a.m_simd, b.m_simd);
9653 #else
9654  return vfloat16 (a.lo()-b.lo(), a.hi()-b.hi());
9655 #endif
9656 }
9657 
9659  return a = a - b;
9660 }
9661 
9662 
9664 #if OIIO_SIMD_AVX >= 512
9665  return _mm512_mul_ps (a.m_simd, _mm512_set1_ps(b));
9666 #else
9667  return vfloat16 (a.lo()*b, a.hi()*b);
9668 #endif
9669 }
9670 
9672  return b * a;
9673 }
9674 
9676 #if OIIO_SIMD_AVX >= 512
9677  return _mm512_mul_ps (a.m_simd, b.m_simd);
9678 #else
9679  return vfloat16 (a.lo()*b.lo(), a.hi()*b.hi());
9680 #endif
9681 }
9682 
9684  return a = a * b;
9685 }
9686 
9688 #if OIIO_SIMD_AVX >= 512
9689  return _mm512_div_ps (a.m_simd, b.m_simd);
9690 #else
9691  return vfloat16 (a.lo()/b.lo(), a.hi()/b.hi());
9692 #endif
9693 }
9694 
9696  return a = a / b;
9697 }
9698 
9699 
9701 #if OIIO_SIMD_AVX >= 512
9702  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_EQ_OQ);
9703 #else /* Fall back to 8-wide */
9704  return vbool16 (a.lo() == b.lo(), a.hi() == b.hi());
9705 #endif
9706 }
9707 
9708 
9710 #if OIIO_SIMD_AVX >= 512
9711  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_NEQ_OQ);
9712 #else /* Fall back to 8-wide */
9713  return vbool16 (a.lo() != b.lo(), a.hi() != b.hi());
9714 #endif
9715 }
9716 
9717 
9719 #if OIIO_SIMD_AVX >= 512
9720  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LT_OQ);
9721 #else /* Fall back to 8-wide */
9722  return vbool16 (a.lo() < b.lo(), a.hi() < b.hi());
9723 #endif
9724 }
9725 
9726 
9728 #if OIIO_SIMD_AVX >= 512
9729  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GT_OQ);
9730 #else /* Fall back to 8-wide */
9731  return vbool16 (a.lo() > b.lo(), a.hi() > b.hi());
9732 #endif
9733 }
9734 
9735 
9737 #if OIIO_SIMD_AVX >= 512
9738  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_GE_OQ);
9739 #else /* Fall back to 8-wide */
9740  return vbool16 (a.lo() >= b.lo(), a.hi() >= b.hi());
9741 #endif
9742 }
9743 
9744 
9746 #if OIIO_SIMD_AVX >= 512
9747  return _mm512_cmp_ps_mask (a.simd(), b.simd(), _CMP_LE_OQ);
9748 #else /* Fall back to 8-wide */
9749  return vbool16 (a.lo() <= b.lo(), a.hi() <= b.hi());
9750 #endif
9751 }
9752 
9753 
9754 // Implementation had to be after the definition of vfloat16.
9756 {
9757 #if OIIO_SIMD_AVX >= 512
9758  m_simd = _mm512_cvttps_epi32(f);
9759 #else
9760  *this = vint16 (vint8(f.lo()), vint8(f.hi()));
9761 #endif
9762 }
9763 
9764 
9765 
9766 // Shuffle groups of 4
9767 template<int i0, int i1, int i2, int i3>
9769 #if OIIO_SIMD_AVX >= 512
9770  return _mm512_shuffle_f32x4(a,a,_MM_SHUFFLE(i3,i2,i1,i0));
9771 #else
9772  vfloat4 x[4];
9773  a.store ((float *)x);
9774  return vfloat16 (x[i0], x[i1], x[i2], x[i3]);
9775 #endif
9776 }
9777 
9778 template<int i> vfloat16 shuffle4 (const vfloat16& a) {
9779  return shuffle4<i,i,i,i> (a);
9780 }
9781 
9782 template<int i0, int i1, int i2, int i3>
9784 #if OIIO_SIMD_AVX >= 512
9785  return _mm512_permute_ps(a,_MM_SHUFFLE(i3,i2,i1,i0));
9786 #else
9787  vfloat4 x[4];
9788  a.store ((float *)x);
9789  return vfloat16 (shuffle<i0,i1,i2,i3>(x[0]), shuffle<i0,i1,i2,i3>(x[1]),
9790  shuffle<i0,i1,i2,i3>(x[2]), shuffle<i0,i1,i2,i3>(x[3]));
9791 #endif
9792 }
9793 
9794 template<int i> vfloat16 shuffle (const vfloat16& a) {
9795  return shuffle<i,i,i,i> (a);
9796 }
9797 
9798 
9799 template<int i>
9801  return a[i];
9802 }
9803 
9804 
9805 template<int i>
9807  vfloat16 tmp = a;
9808  tmp[i] = val;
9809  return tmp;
9810 }
9811 
9812 
9814 #if OIIO_SIMD_AVX >= 512
9815  return _mm_cvtss_f32(_mm512_castps512_ps128(m_simd));
9816 #else
9817  return m_val[0];
9818 #endif
9819 }
9820 
9821 OIIO_FORCEINLINE float vfloat16::y () const { return m_val[1]; }
9822 OIIO_FORCEINLINE float vfloat16::z () const { return m_val[2]; }
9823 OIIO_FORCEINLINE float vfloat16::w () const { return m_val[3]; }
9828 
9829 
9831 {
9832 #if OIIO_SIMD_AVX >= 512
9833  return _mm512_castps_si512 (x.simd());
9834 #else
9835  return *(vint16 *)&x;
9836 #endif
9837 }
9838 
9840 {
9841 #if OIIO_SIMD_AVX >= 512
9842  return _mm512_castsi512_ps (x.simd());
9843 #else
9844  return *(vfloat16 *)&x;
9845 #endif
9846 }
9847 
9848 
9850 #if OIIO_SIMD_AVX >= 512
9851  // Nomenclature: ABCD are the vint4's comprising v
9852  // First, add the vint4's and make them all the same
9853  vfloat16 AB_AB_CD_CD = v + shuffle4<1,0,3,2>(v); // each adjacent vint4 is summed
9854  vfloat16 w = AB_AB_CD_CD + shuffle4<2,3,0,1>(AB_AB_CD_CD);
9855  // Now, add within each vint4
9856  vfloat16 ab_ab_cd_cd = w + shuffle<1,0,3,2>(w); // each adjacent int is summed
9857  return ab_ab_cd_cd + shuffle<2,3,0,1>(ab_ab_cd_cd);
9858 #else
9859  vfloat8 sum = vreduce_add(v.lo()) + vreduce_add(v.hi());
9860  return vfloat16 (sum, sum);
9861 #endif
9862 }
9863 
9864 
9866 #if OIIO_SIMD_AVX >= 512
9867  return vreduce_add(v).x();
9868 #else
9869  return reduce_add(v.lo()) + reduce_add(v.hi());
9870 #endif
9871 }
9872 
9873 
9875 {
9876 #if OIIO_SIMD_AVX >= 512
9877  return _mm512_mask_blend_ps (mask, a, b);
9878 #else
9879  return vfloat16 (blend (a.lo(), b.lo(), mask.lo()),
9880  blend (a.hi(), b.hi(), mask.hi()));
9881 #endif
9882 }
9883 
9884 
9886 {
9887 #if OIIO_SIMD_AVX >= 512
9888  return _mm512_maskz_mov_ps (mask, a);
9889 #else
9890  return vfloat16 (blend0 (a.lo(), mask.lo()),
9891  blend0 (a.hi(), mask.hi()));
9892 #endif
9893 }
9894 
9895 
9897 {
9898 #if OIIO_SIMD_AVX >= 512
9899  return _mm512_maskz_mov_ps (!mask, a);
9900 #else
9901  return vfloat16 (blend0not (a.lo(), mask.lo()),
9902  blend0not (a.hi(), mask.hi()));
9903 #endif
9904 }
9905 
9906 
9908 {
9909  return blend (b, a, mask);
9910 }
9911 
9912 
9914 #if OIIO_SIMD_SSE
9915  return blend0not (a/b, b == vfloat16::Zero());
9916 #else
9917  SIMD_RETURN (vfloat16, b[i] == 0.0f ? 0.0f : a[i] / b[i]);
9918 #endif
9919 }
9920 
9921 
9923 {
9924 #if OIIO_SIMD_AVX >= 512
9925  // Not available? return _mm512_abs_ps (a.simd());
9926  // Just clear the sign bit for cheap fabsf
9927  return _mm512_castsi512_ps (_mm512_and_epi32 (_mm512_castps_si512(a.simd()),
9928  _mm512_set1_epi32(0x7fffffff)));
9929 #else
9930  return vfloat16(abs(a.lo()), abs(a.hi()));
9931 #endif
9932 }
9933 
9934 
9936 {
9937  vfloat16 one(1.0f);
9938  return blend (one, -one, a < vfloat16::Zero());
9939 }
9940 
9941 
9943 {
9944 #if OIIO_SIMD_AVX >= 512
9945  return _mm512_ceil_ps (a);
9946 #else
9947  return vfloat16(ceil(a.lo()), ceil(a.hi()));
9948 #endif
9949 }
9950 
9952 {
9953 #if OIIO_SIMD_AVX >= 512
9954  return _mm512_floor_ps (a);
9955 #else
9956  return vfloat16(floor(a.lo()), floor(a.hi()));
9957 #endif
9958 }
9959 
9960 
9962 {
9963 #if OIIO_SIMD_AVX >= 512
9964  return _mm512_roundscale_ps (a, (1<<4) | 3); // scale=1, round to nearest smaller mag int
9965 #else
9966  return vfloat16(round(a.lo()), round(a.hi()));
9967 #endif
9968 }
9969 
9971 {
9972 #if OIIO_SIMD_AVX >= 512
9973  return _mm512_cvt_roundps_epi32 (a, (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC));
9974 #else
9975  return vint16(floor(a));
9976 #endif
9977 }
9978 
9979 
9981 {
9982  return vint16(round(a));
9983 }
9984 
9985 
9987 {
9988 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
9989  return _mm512_rcp28_ps(a);
9990 #elif OIIO_SIMD_AVX >= 512
9991  vfloat16 r = _mm512_rcp14_ps(a);
9992  return r * nmadd (r, a, vfloat16(2.0f));
9993 #else
9994  return vfloat16(rcp_fast(a.lo()), rcp_fast(a.hi()));
9995 #endif
9996 }
9997 
9998 
10000 {
10001 #if OIIO_SIMD_AVX >= 512
10002  return _mm512_sqrt_ps (a);
10003 #else
10004  return vfloat16(sqrt(a.lo()), sqrt(a.hi()));
10005 #endif
10006 }
10007 
10008 
10010 {
10011 #if OIIO_SIMD_AVX >= 512
10012  return _mm512_div_ps (_mm512_set1_ps(1.0f), _mm512_sqrt_ps (a));
10013 #else
10014  return vfloat16(rsqrt(a.lo()), rsqrt(a.hi()));
10015 #endif
10016 }
10017 
10018 
10020 {
10021 #if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED
10022  return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
10023 #elif OIIO_SIMD_AVX >= 512
10024  return _mm512_rsqrt14_ps (a);
10025 #else
10026  return vfloat16(rsqrt_fast(a.lo()), rsqrt_fast(a.hi()));
10027 #endif
10028 }
10029 
10030 
10032 {
10033 #if OIIO_SIMD_AVX >= 512
10034  return _mm512_min_ps (a, b);
10035 #else
10036  return vfloat16(min(a.lo(),b.lo()), min(a.hi(),b.hi()));
10037 #endif
10038 }
10039 
10041 {
10042 #if OIIO_SIMD_AVX >= 512
10043  return _mm512_max_ps (a, b);
10044 #else
10045  return vfloat16(max(a.lo(),b.lo()), max(a.hi(),b.hi()));
10046 #endif
10047 }
10048 
10049 
10051 #if OIIO_SIMD_AVX >= 512 && defined(__AVX512DQ__)
10052  return _mm512_andnot_ps (a, b);
10053 #else
10054  return vfloat16(andnot(a.lo(),b.lo()), andnot(a.hi(),b.hi()));
10055 #endif
10056 }
10057 
10058 
10060  const simd::vfloat16& c)
10061 {
10062 #if OIIO_SIMD_AVX >= 512
10063  return _mm512_fmadd_ps (a, b, c);
10064 #else
10065  return vfloat16 (madd(a.lo(), b.lo(), c.lo()),
10066  madd(a.hi(), b.hi(), c.hi()));
10067 #endif
10068 }
10069 
10070 
10072  const simd::vfloat16& c)
10073 {
10074 #if OIIO_SIMD_AVX >= 512
10075  return _mm512_fmsub_ps (a, b, c);
10076 #else
10077  return vfloat16 (msub(a.lo(), b.lo(), c.lo()),
10078  msub(a.hi(), b.hi(), c.hi()));
10079 #endif
10080 }
10081 
10082 
10083 
10085  const simd::vfloat16& c)
10086 {
10087 #if OIIO_SIMD_AVX >= 512
10088  return _mm512_fnmadd_ps (a, b, c);
10089 #else
10090  return vfloat16 (nmadd(a.lo(), b.lo(), c.lo()),
10091  nmadd(a.hi(), b.hi(), c.hi()));
10092 #endif
10093 }
10094 
10095 
10096 
10098  const simd::vfloat16& c)
10099 {
10100 #if OIIO_SIMD_AVX >= 512
10101  return _mm512_fnmsub_ps (a, b, c);
10102 #else
10103  return vfloat16 (nmsub(a.lo(), b.lo(), c.lo()),
10104  nmsub(a.hi(), b.hi(), c.hi()));
10105 #endif
10106 }
10107 
10108 
10109 
10110 
10111 } // end namespace simd
10112 
10114 
10115 
10116 #undef SIMD_DO
10117 #undef SIMD_CONSTRUCT
10118 #undef SIMD_CONSTRUCT_PAD
10119 #undef SIMD_RETURN
10120 #undef SIMD_RETURN_REDUCE
friend const vfloat8 & operator/=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8836
friend vfloat8 operator+(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8772
static const char * type_name()
Definition: simd.h:459
static const char * type_name()
Definition: simd.h:2439
static const vbool4 True()
Return a vbool4 the is 'true' for all values.
Definition: simd.h:3251
simd_t simd() const
Definition: simd.h:2805
vint16()
Default constructor (contents undefined)
Definition: simd.h:1495
void set_x(value_t val)
Definition: simd.h:7218
friend vint4 operator|(const vint4 &a, const vint4 &b)
Definition: simd.h:4474
simd_t & simd()
Definition: simd.h:945
void set_x(value_t val)
Definition: simd.h:4704
vfloat16(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:2777
friend const vint8 & operator%=(vint8 &a, const vint8 &b)
Definition: simd.h:5294
static const vint4 NegOne()
Return an vint4 with all components set to -1 (aka 0xffffffff)
Definition: simd.h:4334
friend vbool8 operator!(const vbool8 &a)
Logical/bitwise operators, component-by-component.
Definition: simd.h:3654
friend const vint4 & operator%=(vint4 &a, const vint4 &b)
Definition: simd.h:4447
int operator[](int i) const
Component access (get)
Definition: simd.h:3795
friend const vfloat16 & operator*=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9683
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:4845
friend vfloat3 operator*(const vfloat3 &a, const vfloat3 &b)
Definition: simd.h:7959
simd_t & simd()
Definition: simd.h:791
static const char * type_name()
Definition: simd.h:2150
void set_y(value_t val)
Definition: simd.h:9825
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
vfloat4(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:1809
static vbool4 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool4.
Definition: simd.h:3228
bool none(const vbool4 &v)
Definition: simd.h:3469
void clear()
Set all components to 0.0.
Definition: simd.h:9381
vbool4(bool a)
Construct from a single value (store it in all slots)
Definition: simd.h:470
friend const vbool16 & operator|=(vbool16 &a, const vbool16 &b)
Definition: simd.h:3980
vfloat3 operator-() const
Definition: simd.h:7947
friend vbool8 operator!=(const vint8 &a, const vint8 &b)
Definition: simd.h:5395
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
const vfloat3 & operator-=(const vfloat3 &a)
Definition: simd.h:7955
vfloat4(const Imath::V3f &v)
Construct from a Imath::V3f.
Definition: simd.h:1836
void store(float *values) const
Definition: simd.h:9512
static const vfloat8 One()
Return a vfloat8 with all components set to 1.0.
Definition: simd.h:8473
friend vfloat4 operator*(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:6980
simd_t m_simd
Definition: simd.h:1700
vint16 shuffle4(const vint16 &a)
Shuffle groups of 4.
Definition: simd.h:6288
friend vbool16 operator<=(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9745
OIIO_FORCEINLINE const vint4 & operator/=(vint4 &a, const vint4 &b)
Definition: simd.h:4438
friend vint8 operator~(const vint8 &a)
Definition: simd.h:5335
void store_mask(int mask, value_t *values) const
Definition: simd.h:4243
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vfloat16 operator-(const vfloat16 &a)
Definition: simd.h:9642
value_t z() const
Definition: simd.h:5534
vfloat8(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:2457
void set_x(value_t val)
Definition: simd.h:8951
OIIO_FORCEINLINE vbool4 shuffle(const vbool4 &a)
Definition: simd.h:3409
friend vfloat3 operator+(const vfloat3 &a, const vfloat3 &b)
Definition: simd.h:7939
friend vbool16 operator!=(const vint16 &a, const vint16 &b)
Definition: simd.h:6189
friend const vbool8 & operator|=(vbool8 &a, const vbool8 &b)
Definition: simd.h:3691
value_t x() const
Definition: simd.h:9813
static const char * name()
Definition: simd.h:351
SYS_API float expf(float x)
Matrix44< float > M44f
4x4 matrix of float
Definition: ImathMatrix.h:1137
simd_t simd() const
Definition: simd.h:2480
value_t m_val[elements]
Definition: simd.h:1397
friend vfloat16 operator%(const vfloat16 &a, const vfloat16 &b)
#define OIIO_FORCEINLINE
Definition: platform.h:395
friend std::ostream & operator<<(std::ostream &cout, const vfloat4 &val)
Stream output.
Definition: simd.h:7129
friend vint8 operator/(const vint8 &a, const vint8 &b)
Definition: simd.h:5281
vfloat4 bitcast_to_float(const vint4 &x)
Definition: simd.h:7233
vfloat4 m_4[2]
Definition: simd.h:2649
static const vint4 Giota()
Return an vint4 with "geometric" iota: (1, 2, 4, 8).
Definition: simd.h:4356
value_t y() const
Definition: simd.h:6343
OIIO_FORCEINLINE const vint4 & operator>>=(vint4 &a, const unsigned int bits)
Definition: simd.h:4532
vfloat4 float4
Definition: simd.h:273
friend const vint8 & operator>>=(vint8 &a, unsigned int bits)
Definition: simd.h:5369
void load_mask(const vbool_t &mask, const value_t *values)
Definition: simd.h:5872
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
Definition: simd.h:1658
friend vint4 operator&(const vint4 &a, const vint4 &b)
Definition: simd.h:4459
int operator[](int i) const
Component access (get)
Definition: simd.h:4050
void set_w(value_t val)
Definition: simd.h:4707
friend vbool8 operator>(const vint8 &a, const vint8 &b)
Definition: simd.h:5401
void set_x(value_t val)
Definition: simd.h:6346
static const vfloat3 Zero()
Return a vfloat3 with all components set to 0.0.
Definition: simd.h:7882
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7105
Vec4< float > V4f
Vec4 of float.
Definition: ImathVec.h:864
static const char * type_name()
Definition: simd.h:748
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
Definition: half.h:266
void store_mask(int mask, value_t *values) const
Definition: simd.h:8703
vint4 srl(const vint4 &val, const unsigned int bits)
Definition: simd.h:4537
OIIO_FORCEINLINE vint4 operator%(const vint4 &a, const vint4 &b)
Definition: simd.h:4440
vint4 bitcast_to_int4(const vfloat4 &x)
Definition: simd.h:7244
friend const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3318
friend const vint4 & operator>>=(vint4 &a, unsigned int bits)
Definition: simd.h:4532
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
friend vint8 operator|(const vint8 &a, const vint8 &b)
Definition: simd.h:5314
vfloat4 vfloat_t
SIMD int type.
Definition: simd.h:1797
friend vfloat16 operator*(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9675
int operator[](int i) const
Component access (get)
Definition: simd.h:4910
friend const vfloat8 & operator+=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8780
int bitmask() const
Extract the bitmask.
Definition: simd.h:3566
vfloat16 vfloat_t
float type of the same length
Definition: simd.h:1487
vfloat3(const float *f)
Construct from a pointer to 4 values.
Definition: simd.h:2164
value_t x() const
Definition: simd.h:8947
const GLdouble * v
Definition: glcorearb.h:837
friend vfloat4 operator/(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7012
vfloat16(const unsigned short *vals)
Construct from a pointer to unsigned short values.
Definition: simd.h:2813
int value_t
Underlying equivalent scalar value type.
Definition: simd.h:1183
OIIO_FORCEINLINE const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3326
GLuint start
Definition: glcorearb.h:475
void clear()
Sset all components to 0.
Definition: simd.h:4313
vint8()
Default constructor (contents undefined)
Definition: simd.h:1197
friend vbool16 operator==(const vbool16 &a, const vbool16 &b)
Comparison operators, component by component.
Definition: simd.h:3994
friend vint4 operator/(const vint4 &a, const vint4 &b)
Definition: simd.h:4432
vfloat8(const short *vals)
Construct from a pointer to short values.
Definition: simd.h:2491
vfloat3 transformv(const vfloat3 &V) const
Transform 3-vector V by 4x4 matrix M.
Definition: simd.h:8166
void set_w(value_t val)
Definition: simd.h:5539
int bitmask() const
Extract the bitmask.
Definition: simd.h:3214
void setcomp(int i, bool value)
Component access (set).
Definition: simd.h:3804
void clear()
Set all components to false.
Definition: simd.h:3900
value_t m_val[elements]
Definition: simd.h:1098
static vbool16 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool16.
Definition: simd.h:796
vfloat8 float8
Definition: simd.h:274
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:8493
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7481
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:848
static const vfloat16 One()
Return a vfloat16 with all components set to 1.0.
Definition: simd.h:9369
GLboolean GLboolean g
Definition: glcorearb.h:1222
simd_t m_simd
Definition: simd.h:2021
static const char * name()
Definition: simd.h:354
vfloat8()
Default constructor (contents undefined)
Definition: simd.h:2454
vbool8 vbool_t
bool type of the same length
Definition: simd.h:1188
OIIO_FORCEINLINE vbool4 operator!(const vbool4 &a)
Definition: simd.h:3277
const vfloat4 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:1865
const vfloat4 & operator/=(const vfloat4 &a)
Definition: simd.h:7022
friend vint16 operator+(const vint16 &a, const vint16 &b)
Definition: simd.h:6028
friend const vint8 & operator*=(vint8 &a, const vint8 &b)
Definition: simd.h:5277
value_t z() const
Definition: simd.h:8949
friend const vint4 & operator*=(vint4 &a, const vint4 &b)
Definition: simd.h:4428
friend vint8 operator<<(const vint8 &a, unsigned int bits)
Definition: simd.h:5344
static const char * name()
Definition: simd.h:352
vfloat8(const float *f)
Construct from a pointer to 8 values.
Definition: simd.h:2464
vfloat4(const vfloat4 &other)
Copy construct from another vfloat4.
Definition: simd.h:1818
friend vbool8 operator!=(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8848
friend vbool8 operator>=(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8872
void clear()
Set all components to false.
Definition: simd.h:3234
GLint GLint i2
Definition: glad.h:2724
friend const vint16 & operator&=(vint16 &a, const vint16 &b)
Definition: simd.h:6109
simd_t simd() const
Definition: simd.h:1237
vfloat4 rsqrt_fast(const vfloat4 &a)
Fast, approximate 1/sqrt.
Definition: simd.h:7501
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
GLdouble s
Definition: glad.h:3009
vint16(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1532
friend const vbool16 & operator&=(vbool16 &a, const vbool16 &b)
Definition: simd.h:3976
static const vfloat4 Zero()
Return a vfloat4 with all components set to 0.0.
Definition: simd.h:6537
vbool4(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:489
float operator[](int i) const
Component access (get)
Definition: simd.h:6577
vfloat16 min(const vfloat16 &a, const vfloat16 &b)
Per-element min.
Definition: simd.h:10031
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:1792
vfloat3 transformvT(const vfloat3 &V) const
Transform 3-vector V by the transpose of 4x4 matrix M.
Definition: simd.h:8178
void set_z(value_t val)
Definition: simd.h:5538
int operator[](int i) const
Component access (get)
Definition: simd.h:3477
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:7891
vbool4 bool4
Definition: simd.h:268
simd_t simd() const
Definition: simd.h:1828
#define SIMD_CONSTRUCT_PAD(x)
Definition: simd.h:429
friend const vint16 & operator<<=(vint16 &a, unsigned int bits)
Definition: simd.h:6152
value_t z() const
Definition: simd.h:9822
OIIO_FORCEINLINE const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3322
friend vint16 operator*(const vint16 &a, const vint16 &b)
Definition: simd.h:6065
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 max(const vfloat16 &a, const vfloat16 &b)
Per-element max.
Definition: simd.h:10040
vfloat3()
Default constructor (contents undefined)
Definition: simd.h:2155
bool value_t
Underlying equivalent scalar value type.
Definition: simd.h:460
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Definition: simd.h:1187
static const vint8 Zero()
Return an vint8 with all components set to 0.
Definition: simd.h:5177
vbool8 vbool_t
SIMD bool type.
Definition: simd.h:2447
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
GLint y
Definition: glcorearb.h:103
vint16 vint_t
SIMD int type.
Definition: simd.h:2766
vfloat4(float a, float b, float c, float d=0.0f)
Construct from 3 or 4 values.
Definition: simd.h:1812
simd_t m_simd
Definition: simd.h:2647
#define OIIO_SIMD_UINT4_CONST(name, val)
Definition: simd.h:373
static const vint8 NegOne()
Return an vint8 with all components set to -1 (aka 0xffffffff)
Definition: simd.h:5187
value_t * data()
Definition: simd.h:949
static const char * type_name()
Definition: simd.h:2759
bool reduce_or(const vbool4 &v)
Definition: simd.h:3457
friend vfloat8 operator-(const vfloat8 &a)
Definition: simd.h:8784
friend vint16 operator/(const vint16 &a, const vint16 &b)
Definition: simd.h:6078
**But if you need a result
Definition: thread.h:613
static const vint16 Giota()
Return an vint16 with "geometric" iota: (1, 2, 4, 8, ...).
Definition: simd.h:5981
vfloat4()
Default constructor (contents undefined)
Definition: simd.h:1806
value_t * data()
Definition: simd.h:2485
void load_mask(int mask, const value_t *values)
Definition: simd.h:6822
GLfloat GLfloat GLfloat v2
Definition: glcorearb.h:818
value_t x() const
Definition: simd.h:6335
friend const vint16 & operator^=(vint16 &a, const vint16 &b)
Definition: simd.h:6129
const vfloat8 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:2505
uint16_t m_bits
Definition: simd.h:863
Integer 8-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1180
void clear()
Set all components to 0.0.
Definition: simd.h:6554
vfloat4(const char *vals)
Construct from a pointer to 4 char values.
Definition: simd.h:1857
simd_t simd() const
Definition: simd.h:1536
static const char * name()
Definition: simd.h:359
friend const vint4 & operator<<=(vint4 &a, unsigned int bits)
Definition: simd.h:4519
GLfloat GLfloat GLfloat GLfloat v3
Definition: glcorearb.h:819
OIIO_FORCEINLINE vbool4 insert(const vbool4 &a, bool val)
Helper: substitute val for a[i].
Definition: simd.h:3436
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Definition: simd.h:1485
vfloat3 hdiv(const vfloat4 &a)
Homogeneous divide to turn a vfloat4 into a vfloat3.
Definition: simd.h:7380
vfloat3 transformv(const matrix44 &M, const vfloat3 &V)
Transform 3-vector V by 4x4 matrix M.
Definition: simd.h:8373
const vbool8 & operator=(bool a)
Assign one value to all components.
Definition: simd.h:3556
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
vbool8 lo() const
Extract the lower precision vbool8.
Definition: simd.h:3926
void load_bitmask(int a)
Helper: load all components from a bitmask in an int.
Definition: simd.h:3826
friend vbool4 operator!=(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3351
vint8(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1233
void set_y(value_t val)
Definition: simd.h:5537
void clear()
Sset all components to 0.
Definition: simd.h:5168
value_t y() const
Definition: simd.h:5533
friend vbool4 operator<(const vint4 &a, const vint4 &b)
Definition: simd.h:4569
vbool8(bool a)
Construct from a single value (store it in all slots)
Definition: simd.h:611
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1540
friend const vint4 & operator^=(vint4 &a, const vint4 &b)
Definition: simd.h:4498
OIIO_FORCEINLINE vbool4 operator>=(const vint4 &a, const vint4 &b)
Definition: simd.h:4577
int operator[](int i) const
Component access (get)
Definition: simd.h:5735
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vfloat16 operator/(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9687
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
OIIO_FORCEINLINE vbool4 operator~(const vbool4 &a)
Definition: simd.h:3330
float dot3(const vfloat4 &a, const vfloat4 &b)
Return the float 3-component dot (inner) product of a and b.
Definition: simd.h:7313
value_t m_val[paddedelements]
Definition: simd.h:2648
void load_mask(int mask, const value_t *values)
Definition: simd.h:5078
friend const vfloat16 & operator+=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9638
vint8 vint_t
SIMD int type.
Definition: simd.h:2446
friend vbool4 operator|(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3297
friend const vint16 & operator%=(vint16 &a, const vint16 &b)
Definition: simd.h:6091
simd_bool_t< 8 >::type simd_t
the native SIMD type used
Definition: simd.h:605
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
static const char * type_name()
Definition: simd.h:1480
Template giving a printable name for each type.
Definition: simd.h:350
vint4 abs(const vint4 &a)
Definition: simd.h:4822
vfloat4 safe_div(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7368
void store(int *values) const
Store the values into memory.
Definition: simd.h:5063
friend vbool8 operator<=(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8880
friend std::ostream & operator<<(std::ostream &cout, const vfloat8 &val)
Stream output.
Definition: simd.h:8418
vbool16(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:786
vfloat8 lo() const
Extract the lower precision vfloat8.
Definition: simd.h:9304
value_t w() const
Definition: simd.h:4703
simd_t & simd()
Definition: simd.h:1238
vbool4(const vbool4 &other)
Copy construct from another vbool4.
Definition: simd.h:478
static const vbool4 False()
Return a vbool4 the is 'false' for all values.
Definition: simd.h:3243
value_t y() const
Definition: simd.h:4701
vbool4 lo() const
Extract the lower precision vbool4.
Definition: simd.h:3625
static const vbool16 False()
Return a vbool16 the is 'false' for all values.
Definition: simd.h:3904
void setcomp(int i, bool value)
Component access (set).
Definition: simd.h:3486
value_t w() const
Definition: simd.h:6345
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vint4 operator~(const vint4 &a)
Definition: simd.h:4501
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
Definition: simd.h:2933
friend vbool4 operator==(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7044
simd_t & simd()
Definition: simd.h:2481
OIIO_FORCEINLINE bool extract(const vbool4 &a)
Definition: simd.h:3426
vfloat3 normalized() const
Return a normalized version of the vector.
Definition: simd.h:8098
vfloat4 floor(const vfloat4 &a)
Definition: simd.h:7427
void gather_mask(int mask, const value_t *baseptr, const vint_t &vindex)
Definition: simd.h:1647
OIIO_FORCEINLINE matrix44(const float *f)
Construct from a float array.
Definition: simd.h:2320
static const vint4 One()
Return an vint4 with all components set to 1.
Definition: simd.h:4332
vint4 blend(const vint4 &a, const vint4 &b, const vbool4 &mask)
Definition: simd.h:4784
int value_t
Underlying equivalent scalar value type.
Definition: simd.h:893
vbool16(const vbool16 &other)
Copy construct from another vbool16.
Definition: simd.h:770
vfloat8(const vfloat8 &other)
Copy construct from another vfloat8.
Definition: simd.h:2467
vint4 blend0not(const vint4 &a, const vbool4 &mask)
Definition: simd.h:4807
vint4 vint_t
SIMD int type.
Definition: simd.h:1798
GA_API const UT_StringHolder scale
friend const vbool16 & operator^=(vbool16 &a, const vbool16 &b)
Definition: simd.h:3984
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
value_t w() const
Definition: simd.h:7217
vbool4 hi() const
Extract the higher precision vbool4.
Definition: simd.h:3633
value_t * data()
Definition: simd.h:2810
OIIO_FORCEINLINE vbool4 operator>(const vint4 &a, const vint4 &b)
Definition: simd.h:4561
friend vbool16 operator!=(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9709
vfloat16(const float *f)
Construct from a pointer to 16 values.
Definition: simd.h:2786
simd_t & simd()
Definition: simd.h:1537
GLdouble n
Definition: glcorearb.h:2008
vbool8(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:631
bool operator!=(const matrix44 &m) const
Definition: simd.h:8244
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
Definition: simd.h:3277
vfloat8 hi() const
Extract the higher precision vfloat8.
Definition: simd.h:9312
OIIO_FORCEINLINE vbool4 operator<=(const vint4 &a, const vint4 &b)
Definition: simd.h:4581
friend const vint8 & operator+=(vint8 &a, const vint8 &b)
Definition: simd.h:5240
GLfloat f
Definition: glcorearb.h:1926
friend vfloat8 operator/(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8828
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:2484
value_t x() const
Definition: simd.h:7214
vfloat4 vdot3(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7305
friend const vint8 & operator-=(vint8 &a, const vint8 &b)
Definition: simd.h:5263
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:2440
vint8(const vint8 &other)
Copy construct from another vint8.
Definition: simd.h:1224
void set_y(value_t val)
Definition: simd.h:4705
simd_raw_t< int, elements >::type simd_t
the native SIMD type used
Definition: simd.h:897
Integer 4-vector, accelerated by SIMD instructions when available.
Definition: simd.h:890
#define SIMD_RETURN_REDUCE(T, init, op)
Definition: simd.h:432
friend vbool8 operator<(const vint8 &a, const vint8 &b)
Definition: simd.h:5413
vint8 vint_t
int type of the same length
Definition: simd.h:1190
OIIO_FORCEINLINE vbool4 operator==(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3341
friend vfloat3 operator/(const vfloat3 &a, const vfloat3 &b)
Definition: simd.h:7979
OIIO_DEPRECATED("use bitcast_to_int() (1.8)") inline vint4 bitcast_to_int4(const vbool4 &x)
Definition: simd.h:4720
vfloat4 lo() const
Extract the lower precision vfloat4.
Definition: simd.h:8426
vint16(const vint16 &other)
Copy construct from another vint16.
Definition: simd.h:1520
simd_raw_t< float, 8 >::type simd_t
the native SIMD type used
Definition: simd.h:2444
IMATH_NAMESPACE::V2f float
int value_t
Underlying equivalent scalar value type.
Definition: simd.h:1481
float length() const
Length of the vector.
Definition: simd.h:8092
bool any(const vbool4 &v)
Definition: simd.h:3468
const vfloat4 & operator-=(const vfloat4 &a)
Definition: simd.h:6955
void load(bool a)
Helper: load a single value into all components.
Definition: simd.h:3505
bool reduce_and(const vbool4 &v)
Logical reduction across all components.
Definition: simd.h:3447
vint16 vint_t
int type of the same length
Definition: simd.h:1488
vfloat8(const unsigned short *vals)
Construct from a pointer to unsigned short values.
Definition: simd.h:2488
friend vbool16 operator<=(const vint16 &a, const vint16 &b)
Definition: simd.h:6225
vfloat3 float3
Definition: simd.h:272
vfloat3 transformp(const matrix44 &M, const vfloat3 &V)
Transform 3-point V by 4x4 matrix M.
Definition: simd.h:8357
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
friend const vint4 & operator+=(vint4 &a, const vint4 &b)
Definition: simd.h:4369
vfloat8 vfloat_t
float type of the same length
Definition: simd.h:1189
OIIO_FORCEINLINE const vint4 & operator+=(vint4 &a, const vint4 &b)
Definition: simd.h:4369
void transpose(vint4 &a, vint4 &b, vint4 &c, vint4 &d)
Definition: simd.h:7791
vint4 select(const vbool4 &mask, const vint4 &a, const vint4 &b)
Definition: simd.h:4816
const Imath::V3f & V3f() const
Cast to a Imath::V3f.
Definition: simd.h:2185
void set_w(value_t val)
Definition: simd.h:9827
void load(bool a)
Helper: load a single value into all components.
Definition: simd.h:3177
vfloat4(const short *vals)
Construct from a pointer to 4 short values.
Definition: simd.h:1851
static vbool8 from_bitmask(int bitmask)
Convert from integer bitmask to a true vbool8.
Definition: simd.h:3576
value_t y() const
Definition: simd.h:7215
vint4 rotl(const vint4 &x, const int s)
Circular bit rotate by s bits, for N values at once.
Definition: simd.h:4856
vint4 m_4[2]
Definition: simd.h:1398
vbool16(int bitmask)
Definition: simd.h:761
friend vbool16 operator>=(const vint16 &a, const vint16 &b)
Definition: simd.h:6216
value_t z() const
Definition: simd.h:6344
const vfloat4 & operator*=(const vfloat4 &a)
Definition: simd.h:6990
#define OIIO_DASSERT
Definition: dassert.h:55
vbool4 m_4[2]
Definition: simd.h:710
vfloat4 msub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7574
vbool16 vbool_t
bool type of the same length
Definition: simd.h:1486
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7554
friend vint8 operator&(const vint8 &a, const vint8 &b)
Definition: simd.h:5304
friend vbool16 operator^(const vbool16 &a, const vbool16 &b)
Definition: simd.h:3967
friend vbool8 operator<(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8856
value_t z() const
Definition: simd.h:7216
void store_mask(int mask, value_t *values) const
Definition: simd.h:1638
float operator[](int i) const
Component access (get)
Definition: simd.h:9290
OIIO_FORCEINLINE matrix44(const Imath::M44f &M)
Construct from a reference to an Imath::M44f.
Definition: simd.h:2308
simd_t m_simd
Definition: simd.h:1097
vfloat3 normalized_fast() const
Return a fast, approximate normalized version of the vector.
Definition: simd.h:8110
void load_mask(int mask, const value_t *values)
Definition: simd.h:2919
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1832
friend vbool4 operator<(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7065
static const char * type_name()
Definition: simd.h:1182
void setcomp(int i, int value)
Component access (set).
Definition: simd.h:4060
OIIO_FORCEINLINE T exp(const T &v)
Definition: simd.h:7633
void load_mask(int mask, const value_t *values)
Definition: simd.h:1633
static const vfloat3 One()
Return a vfloat3 with all components set to 1.0.
Definition: simd.h:7884
vfloat16(const vfloat16 &other)
Copy construct from another vfloat16.
Definition: simd.h:2789
void load(int a)
Helper: load a single int into all components.
Definition: simd.h:4926
bool set_denorms_zero_mode(bool on)
Definition: simd.h:3094
static const char * name()
Definition: simd.h:350
vbool16()
Default constructor (contents undefined)
Definition: simd.h:756
simd_t simd() const
Definition: simd.h:493
simd_bool_t< 16 >::type simd_t
the native SIMD type used
Definition: simd.h:753
friend vint16 operator<<(const vint16 &a, unsigned int bits)
Definition: simd.h:6141
void store_mask(int mask, value_t *values) const
Definition: simd.h:2924
vbool4(bool a, bool b, bool c, bool d)
Construct from 4 bool values.
Definition: simd.h:475
OIIO_FORCEINLINE std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
Definition: simd.h:3169
static const vfloat16 Zero()
Return a vfloat16 with all components set to 0.0.
Definition: simd.h:9361
vfloat4 operator-() const
Definition: simd.h:6935
simd_t & simd()
Definition: simd.h:494
value_t m_val[paddedelements]
Definition: simd.h:2979
vfloat4(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
Definition: simd.h:1848
friend vint4 operator%(const vint4 &a, const vint4 &b)
Definition: simd.h:4440
GLint GLuint mask
Definition: glcorearb.h:124
friend vbool8 operator!=(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3720
void set_x(value_t val)
Definition: simd.h:5536
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
vfloat16 vfloat_t
SIMD int type.
Definition: simd.h:2765
friend const vfloat16 & operator/=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9695
value_t y() const
Definition: simd.h:8948
static const vbool16 True()
Return a vbool16 the is 'true' for all values.
Definition: simd.h:3909
const vbool16 & operator=(bool a)
Assign one value to all components.
Definition: simd.h:3880
vfloat4 nmsub(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7612
void set_y(value_t val)
Definition: simd.h:7219
vbool4()
Default constructor (contents undefined)
Definition: simd.h:467
static const vbool8 True()
Return a vbool8 the is 'true' for all values.
Definition: simd.h:3599
static const char * name()
Definition: simd.h:353
friend vbool8 operator==(const vbool8 &a, const vbool8 &b)
Comparison operators, component by component.
Definition: simd.h:3710
vbool8(const vbool8 &other)
Copy construct from another vbool8.
Definition: simd.h:619
friend vint4 operator^(const vint4 &a, const vint4 &b)
Definition: simd.h:4487
OIIO_FORCEINLINE vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7113
simd_t simd() const
Definition: simd.h:635
vint4 blend0(const vint4 &a, const vbool4 &mask)
Definition: simd.h:4798
vfloat3(const short *vals)
Construct from a pointer to 4 short values.
Definition: simd.h:2191
void store(float *values) const
Definition: simd.h:6740
friend vint8 operator*(const vint8 &a, const vint8 &b)
Definition: simd.h:5268
friend vfloat16 operator+(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9630
vfloat4(const float *f)
Construct from a pointer to 4 values.
Definition: simd.h:1815
const vint4 & operator=(int a)
Assign one value to all components.
Definition: simd.h:4206
GLint i1
Definition: glad.h:2724
void set_y(value_t val)
Definition: simd.h:8952
void store(bool *values) const
Helper: store the values into memory as bools.
Definition: simd.h:3914
static const vint4 Zero()
Return an vint4 with all components set to 0.
Definition: simd.h:4323
simd_t m_simd
Definition: simd.h:1396
friend vbool16 operator<(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9718
friend const vbool4 & operator^=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3326
vfloat3(float a)
Construct from a single value (store it in all slots)
Definition: simd.h:2158
friend vint8 operator-(const vint8 &a)
Definition: simd.h:5245
static const char * name()
Definition: simd.h:355
friend const vint16 & operator+=(vint16 &a, const vint16 &b)
Definition: simd.h:6037
bool set_flush_zero_mode(bool on)
Definition: simd.h:3084
friend vint8 operator^(const vint8 &a, const vint8 &b)
Definition: simd.h:5324
void clear()
Set all components to 0.0.
Definition: simd.h:8483
SYS_API float logf(float x)
float length2() const
Square of the length of the vector.
Definition: simd.h:8086
friend vbool4 operator!=(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7054
void set_w(value_t val)
Definition: simd.h:7221
friend vbool16 operator<(const vint16 &a, const vint16 &b)
Definition: simd.h:6207
vbool4(int a, int b, int c, int d)
Construct from 4 int values.
Definition: simd.h:481
vfloat3(float a, float b, float c)
Construct from 3 or 4 values.
Definition: simd.h:2161
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
void store(bool *values) const
Helper: store the values into memory as bools.
Definition: simd.h:3265
friend std::ostream & operator<<(std::ostream &cout, const matrix44 &M)
Stream output.
Definition: simd.h:8347
matrix44 transposed() const
Return the transposed matrix.
Definition: simd.h:8142
OIIO_FORCEINLINE vint4 operator>>(const vint4 &a, const unsigned int bits)
Definition: simd.h:4524
vfloat4 round(const vfloat4 &a)
Definition: simd.h:7436
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
friend vbool4 operator!(const vbool4 &a)
Logical/bitwise operators, component-by-component.
Definition: simd.h:3277
static const vfloat4 One()
Return a vfloat4 with all components set to 1.0.
Definition: simd.h:6545
GLint GLenum GLint x
Definition: glcorearb.h:409
void store_mask(int mask, value_t *values) const
Definition: simd.h:6844
#define SIMD_RETURN(T, x)
Definition: simd.h:431
friend vbool16 operator>(const vint16 &a, const vint16 &b)
Definition: simd.h:6198
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:948
const vint16 & operator=(int a)
Assign one value to all components.
Definition: simd.h:5869
vfloat8(const char *vals)
Construct from a pointer to char values.
Definition: simd.h:2497
vint8 m_8[2]
Definition: simd.h:1702
bool get_denorms_zero_mode()
Definition: simd.h:3111
friend vbool8 operator~(const vbool8 &a)
Definition: simd.h:3700
friend vbool4 operator!=(const vint4 &a, const vint4 &b)
Definition: simd.h:4556
vfloat3 transformvT(const matrix44 &M, const vfloat3 &V)
Definition: simd.h:8388
const vfloat3 & operator/=(const vfloat3 &a)
Definition: simd.h:7983
value_t * data()
Definition: simd.h:1833
friend vbool4 operator>(const vint4 &a, const vint4 &b)
Definition: simd.h:4561
friend const vint4 & operator-=(vint4 &a, const vint4 &b)
Definition: simd.h:4392
vbool4 mask4
Definition: simd.h:264
bool get_flush_zero_mode()
Definition: simd.h:3103
void set_x(value_t val)
Definition: simd.h:9824
vfloat4 operator[](int i) const
Return one row.
Definition: simd.h:8133
float operator[](int i) const
Component access (get)
Definition: simd.h:8412
const Imath::M44f & M44f() const
Present as an Imath::M44f.
Definition: simd.h:8128
GLdouble t
Definition: glad.h:2397
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:9390
static const vfloat8 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:8477
void set_w(value_t val)
Definition: simd.h:6349
value_t w() const
Definition: simd.h:9823
friend const vint4 & operator&=(vint4 &a, const vint4 &b)
Definition: simd.h:4470
value_t w() const
Definition: simd.h:8950
void set_y(value_t val)
Definition: simd.h:6347
simd_raw_t< float, 4 >::type simd_t
the native SIMD type used
Definition: simd.h:1796
vbool8()
Default constructor (contents undefined)
Definition: simd.h:608
friend vbool16 operator|(const vbool16 &a, const vbool16 &b)
Definition: simd.h:3959
vfloat16()
Default constructor (contents undefined)
Definition: simd.h:2774
GLfloat v0
Definition: glcorearb.h:816
vint8 lo() const
Extract the lower precision vint8.
Definition: simd.h:5987
vfloat4 vdot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b in every component.
Definition: simd.h:7285
vint4 int4
Definition: simd.h:270
static const vfloat8 Zero()
Return a vfloat8 with all components set to 0.0.
Definition: simd.h:8465
friend vint16 operator|(const vint16 &a, const vint16 &b)
Definition: simd.h:6111
vfloat4 rcp_fast(const vfloat4 &a)
Fast, approximate 1/a.
Definition: simd.h:7462
friend vint4 operator+(const vint4 &a, const vint4 &b)
Definition: simd.h:4361
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:1241
friend vint16 operator>>(const vint16 &a, unsigned int bits)
Definition: simd.h:6156
Integer 16-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1478
vfloat16(const unsigned char *vals)
Construct from a pointer to unsigned char values.
Definition: simd.h:2819
static const char * type_name()
Definition: simd.h:600
void set_w(value_t val)
Definition: simd.h:8954
friend vbool4 operator^(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3307
GLint j
Definition: glad.h:2733
const vfloat3 & operator*=(const vfloat3 &a)
Definition: simd.h:7971
static const char * type_name()
Definition: simd.h:1791
OIIO_FORCEINLINE vint4 operator/(const vint4 &a, const vint4 &b)
Definition: simd.h:4432
vfloat16(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:2801
vfloat3(const Imath::V3f &v)
Construct from a Imath::V3f.
Definition: simd.h:2182
OIIO_FORCEINLINE vbool4 operator^(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3307
static const vint16 One()
Return an vint16 with all components set to 1.
Definition: simd.h:5968
vfloat16(const char *vals)
Construct from a pointer to char values.
Definition: simd.h:2822
vbool16 vbool_t
SIMD bool type.
Definition: simd.h:2767
vint4 hi() const
Extract the higher precision vint4.
Definition: simd.h:5209
vfloat3 transformp(const vfloat3 &V) const
Transform 3-point V by 4x4 matrix M.
Definition: simd.h:8153
OIIO_FORCEINLINE matrix44(const float *a, const float *b, const float *c, const float *d)
Construct from 4 float[4] rows.
Definition: simd.h:2344
friend vbool8 operator>(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8864
OIIO_FORCEINLINE vbool4 operator|(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3297
Vec3< float > V3f
Vec3 of float.
Definition: ImathVec.h:849
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
GLsizeiptr size
Definition: glcorearb.h:664
GLfloat GLfloat GLfloat GLfloat h
Definition: glcorearb.h:2002
void store_mask(const vbool_t &mask, value_t *values) const
Definition: simd.h:5882
static const vfloat3 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:7886
vint4 safe_mod(const vint4 &a, const vint4 &b)
Definition: simd.h:4890
friend const vint8 & operator/=(vint8 &a, const vint8 &b)
Definition: simd.h:5286
static const vfloat4 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:6549
friend const vfloat8 & operator-=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8800
void load(int a)
Helper: load a single int into all components.
Definition: simd.h:5751
void store(bool *values) const
Helper: store the values into memory as bools.
Definition: simd.h:3614
vint4 vreduce_add(const vint4 &v)
The sum of all components, returned in all components.
Definition: simd.h:4724
OIIO_FORCEINLINE const vint4 & operator<<=(vint4 &a, const unsigned int bits)
Definition: simd.h:4519
void scatter_mask(int mask, value_t *baseptr, const vint_t &vindex) const
Definition: simd.h:2944
float value_t
Underlying equivalent scalar value type.
Definition: simd.h:2760
OIIO_FORCEINLINE vbool4 operator<(const vint4 &a, const vint4 &b)
Definition: simd.h:4569
friend vbool8 operator|(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3670
friend const vint16 & operator*=(vint16 &a, const vint16 &b)
Definition: simd.h:6074
#define SIMD_DO(x)
Definition: simd.h:427
OIIO_FORCEINLINE vbool4 operator!=(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3351
vfloat4 ceil(const vfloat4 &a)
Definition: simd.h:7418
friend vbool16 operator==(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9700
vbool4 vbool_t
SIMD bool type.
Definition: simd.h:1799
void store(int *values) const
Store the values into memory.
Definition: simd.h:4209
void clear()
Sset all components to 0.
Definition: simd.h:5951
vfloat4 vfloat_t
float type of the same length
Definition: simd.h:899
GLenum GLsizei GLsizei GLint * values
Definition: glcorearb.h:1602
friend vint16 operator%(const vint16 &a, const vint16 &b)
Definition: simd.h:6086
value_t x() const
Definition: simd.h:5532
OIIO_FORCEINLINE vbool4 operator&(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3287
#define SIMD_CONSTRUCT(x)
Definition: simd.h:428
vfloat8 m_8[2]
Definition: simd.h:2980
vfloat4(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:1824
OIIO_FORCEINLINE matrix44()
Definition: simd.h:2301
value_t * data()
Definition: simd.h:1242
friend const vint16 & operator-=(vint16 &a, const vint16 &b)
Definition: simd.h:6060
vfloat8(float a, float b, float c, float d, float e, float f, float g, float h)
Construct from 8 values.
Definition: simd.h:2460
static const vint4 Iota(int start=0, int step=1)
Definition: simd.h:4351
void load_mask(int mask, const value_t *values)
Definition: simd.h:8681
friend vfloat4 operator+(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:6914
void clear()
Set all components to false.
Definition: simd.h:3582
int m_val[paddedelements]
Definition: simd.h:560
friend vbool8 operator&(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3662
static const vint16 NegOne()
Return an vint16 with all components set to -1 (aka 0xffffffff)
Definition: simd.h:5970
vfloat4 bitcast_to_float4(const vint4 &x)
Definition: simd.h:7245
simd_t m_simd
Definition: simd.h:708
GLuint index
Definition: glcorearb.h:786
int bitmask() const
Definition: simd.h:3891
vint4 lo() const
Extract the lower precision vint4.
Definition: simd.h:5201
friend vbool8 operator^(const vbool8 &a, const vbool8 &b)
Definition: simd.h:3678
friend vint4 operator-(const vint4 &a)
Definition: simd.h:4374
matrix44 inverse() const
Return the inverse of the matrix.
Definition: simd.h:8264
vfloat4 nmadd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7593
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
vfloat3(const unsigned short *vals)
Construct from a pointer to 4 unsigned short values.
Definition: simd.h:2188
void load_mask(int mask, const value_t *values)
Definition: simd.h:4221
friend vbool16 operator~(const vbool16 &a)
Definition: simd.h:3989
friend const vint16 & operator/=(vint16 &a, const vint16 &b)
Definition: simd.h:6083
vint4(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:940
friend vbool8 operator>=(const vint8 &a, const vint8 &b)
Definition: simd.h:5426
vfloat8 vfloat_t
SIMD int type.
Definition: simd.h:2445
void set_z(value_t val)
Definition: simd.h:7220
friend vint16 operator^(const vint16 &a, const vint16 &b)
Definition: simd.h:6121
friend vint8 operator>>(const vint8 &a, unsigned int bits)
Definition: simd.h:5359
GLfloat GLfloat v1
Definition: glcorearb.h:817
GLuint GLfloat * val
Definition: glcorearb.h:1608
int m_val[paddedelements]
Definition: simd.h:709
value_t m_val[paddedelements]
Definition: simd.h:2022
vint4()
Default constructor (contents undefined)
Definition: simd.h:907
const vbool4 & operator=(bool a)
Assign one value to all components.
Definition: simd.h:512
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
Definition: simd.h:4710
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vint4 operator<<(const vint4 &a, unsigned int bits)
Definition: simd.h:4511
vfloat4(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
Definition: simd.h:1854
GA_API const UT_StringHolder N
friend vint16 operator&(const vint16 &a, const vint16 &b)
Definition: simd.h:6101
simd_t m_simd
Definition: simd.h:559
simd_t simd() const
Definition: simd.h:790
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
const vint8 & operator=(int a)
Assign one value to all components.
Definition: simd.h:5060
void store_mask(int mask, value_t *values) const
Definition: simd.h:5100
static const vbool8 False()
Return a vbool8 the is 'false' for all values.
Definition: simd.h:3590
friend vint16 operator-(const vint16 &a)
Definition: simd.h:6042
value_t w() const
Definition: simd.h:5535
friend vbool4 operator&(const vbool4 &a, const vbool4 &b)
Definition: simd.h:3287
friend vbool16 operator&(const vbool16 &a, const vbool16 &b)
Definition: simd.h:3951
vbool4 vbool_t
bool type of the same length
Definition: simd.h:898
friend vfloat8 operator%(const vfloat8 &a, const vfloat8 &b)
vbool8 hi() const
Extract the higher precision vbool8.
Definition: simd.h:3934
friend vint8 operator+(const vint8 &a, const vint8 &b)
Definition: simd.h:5231
friend const vbool8 & operator^=(vbool8 &a, const vbool8 &b)
Definition: simd.h:3695
static const char * name()
Definition: simd.h:356
const Imath::V3f & V3f() const
Cast to a Imath::V3f.
Definition: simd.h:1839
value_t z() const
Definition: simd.h:4702
void scatter(value_t *baseptr, const vint_t &vindex) const
Store values at addresses (char*)basepatr + vindex[i]*scale.
friend const vfloat16 & operator-=(vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9658
void set_z(value_t val)
Definition: simd.h:6348
friend vbool16 operator!=(const vbool16 &a, const vbool16 &b)
Definition: simd.h:4002
const vfloat16 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:2830
vint4 rotl32(const vint4 &x, const unsigned int k)
Definition: simd.h:4869
friend const vbool8 & operator&=(vbool8 &a, const vbool8 &b)
Definition: simd.h:3687
vint4 floori(const vfloat4 &a)
Definition: simd.h:2094
OIIO_FORCEINLINE const vint4 & operator-=(vint4 &a, const vint4 &b)
Definition: simd.h:4392
friend const vint4 & operator|=(vint4 &a, const vint4 &b)
Definition: simd.h:4484
vint4(const vint4 &other)
Copy construct from another vint4.
Definition: simd.h:934
friend vbool16 operator>=(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9736
friend const vint16 & operator>>=(vint16 &a, unsigned int bits)
Definition: simd.h:6165
void setcomp(int i, int value)
Component access (set).
Definition: simd.h:4920
static const vint8 Iota(int start=0, int step=1)
Definition: simd.h:5190
friend vint4 operator>>(const vint4 &a, unsigned int bits)
Definition: simd.h:4524
OIIO_FORCEINLINE vint4 operator*(const vint4 &a, const vint4 &b)
Definition: simd.h:4419
OIIO_FORCEINLINE const vbool4 & operator&=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3318
const value_t * data() const
Return a pointer to the underlying scalar type.
Definition: simd.h:2809
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
GLubyte GLubyte GLubyte GLubyte w
Definition: glcorearb.h:857
friend vbool8 operator==(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8840
Definition: core.h:1131
static const vint8 Giota()
Return an vint8 with "geometric" iota: (1, 2, 4, 8, ...).
Definition: simd.h:5196
IMATH_INTERNAL_NAMESPACE_HEADER_ENTER IMATH_HOSTDEVICE constexpr T abs(T a) IMATH_NOEXCEPT
Definition: ImathFun.h:26
vfloat4(const Imath::V4f &v)
Construct from a Imath::V4f.
Definition: simd.h:1842
friend const vfloat8 & operator*=(vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8824
friend vfloat4 operator*(const vfloat4 &V, const matrix44 &M)
Definition: simd.h:8191
friend const vint8 & operator^=(vint8 &a, const vint8 &b)
Definition: simd.h:5332
vfloat3(const unsigned char *vals)
Construct from a pointer to 4 unsigned char values.
Definition: simd.h:2194
const vfloat4 & operator+=(const vfloat4 &a)
Definition: simd.h:6924
friend vint4 operator*(const vint4 &a, const vint4 &b)
Definition: simd.h:4419
void load(bool a)
Helper: load a single value into all components.
Definition: simd.h:3821
friend const vbool4 & operator|=(vbool4 &a, const vbool4 &b)
Definition: simd.h:3322
simd_t m_simd
Definition: simd.h:862
static const char * name()
Definition: simd.h:357
friend vbool4 operator~(const vbool4 &a)
Definition: simd.h:3330
void set_z(value_t val)
Definition: simd.h:4706
const vfloat3 & operator+=(const vfloat3 &a)
Definition: simd.h:7943
void store_mask(const vbool_t &mask, value_t *values) const
Definition: simd.h:9568
GLboolean r
Definition: glcorearb.h:1222
friend vbool4 operator<=(const vint4 &a, const vint4 &b)
Definition: simd.h:4581
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:94
friend const vint16 & operator|=(vint16 &a, const vint16 &b)
Definition: simd.h:6119
void load(float val)
Helper: load a single value into all components.
Definition: simd.h:6583
vint4 min(const vint4 &a, const vint4 &b)
Definition: simd.h:4834
friend const vint8 & operator&=(vint8 &a, const vint8 &b)
Definition: simd.h:5312
friend std::ostream & operator<<(std::ostream &cout, const vbool4 &a)
Stream output.
Definition: simd.h:3169
static const char * name()
Definition: simd.h:358
bool value_t
Underlying equivalent scalar value type.
Definition: simd.h:749
bool value_t
Underlying equivalent scalar value type.
Definition: simd.h:601
vfloat4 hi() const
Extract the higher precision vfloat4.
Definition: simd.h:8434
void load_mask(const vbool_t &mask, const value_t *values)
Definition: simd.h:9558
OIIO_FORCEINLINE T log(const T &v)
Definition: simd.h:7688
friend vfloat4 AxBxAyBy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7113
friend vfloat8 operator*(const vfloat8 &a, const vfloat8 &b)
Definition: simd.h:8816
vint8 int8
Definition: simd.h:271
friend vbool8 operator<=(const vint8 &a, const vint8 &b)
Definition: simd.h:5432
simd_t & simd()
Definition: simd.h:636
OIIO_FORCEINLINE vfloat4 AxyBxy(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7105
vfloat4 rsqrt(const vfloat4 &a)
Fully accurate 1/sqrt.
Definition: simd.h:7491
vbool16(bool a)
Construct from a single value (store it in all slots)
Definition: simd.h:759
value_t y() const
Definition: simd.h:9821
void set_z(value_t val)
Definition: simd.h:8953
friend vbool4 operator<=(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7095
static const vint8 One()
Return an vint8 with all components set to 1.
Definition: simd.h:5185
simd_t simd() const
Definition: simd.h:944
void load(int a)
Helper: load a single int into all components.
Definition: simd.h:4066
vint4 vint_t
int type of the same length
Definition: simd.h:900
vfloat16(const short *vals)
Construct from a pointer to short values.
Definition: simd.h:2816
OIIO_FORCEINLINE matrix44(float f00, float f01, float f02, float f03, float f10, float f11, float f12, float f13, float f20, float f21, float f22, float f23, float f30, float f31, float f32, float f33)
Construct from 16 floats.
Definition: simd.h:2357
ImageBuf OIIO_API zero(ROI roi, int nthreads=0)
vint4 rint(const vfloat4 &a)
Definition: simd.h:7456
const Imath::V4f & V4f() const
Cast to a Imath::V4f.
Definition: simd.h:1845
void setcomp(int i, float value)
Component access (set).
simd_raw_t< float, 16 >::type simd_t
the native SIMD type used
Definition: simd.h:2764
vfloat4 sign(const vfloat4 &a)
1.0 when value >= 0, -1 when negative
Definition: simd.h:7411
type
Definition: core.h:1059
simd_t & simd()
Definition: simd.h:2806
static const vint16 Zero()
Return an vint16 with all components set to 0.
Definition: simd.h:5960
value_t * data()
Definition: simd.h:1541
value_t m_val[elements]
Definition: simd.h:1701
void scatter_mask(const vbool_t &mask, value_t *baseptr, const vint_t &vindex) const
Scatter elements defined by the mask.
friend vbool16 operator>(const vfloat16 &a, const vfloat16 &b)
Definition: simd.h:9727
OIIO_FORCEINLINE const vint4 & operator*=(vint4 &a, const vint4 &b)
Definition: simd.h:4428
bool all(const vbool4 &v)
Definition: simd.h:3467
void set_z(value_t val)
Definition: simd.h:9826
void store(float *values) const
Definition: simd.h:8631
friend const vint4 & operator/=(vint4 &a, const vint4 &b)
Definition: simd.h:4438
void gather_mask(const vbool_t &mask, const value_t *baseptr, const vint_t &vindex)
Gather elements defined by the mask, leave others unchanged.
vfloat8(const simd_t &m)
Construct from the underlying SIMD type.
Definition: simd.h:2476
vint4 andnot(const vint4 &a, const vint4 &b)
andnot(a,b) returns ((~a) & b)
Definition: simd.h:4874
vfloat3(const char *vals)
Construct from a pointer to 4 char values.
Definition: simd.h:2197
void store(int *values) const
Store the values into memory.
Definition: simd.h:5938
OIIO_FORCEINLINE matrix44(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c, const vfloat4 &d)
Construct from 4 vfloat4 rows.
Definition: simd.h:2332
void normalize()
Normalize in place.
Definition: simd.h:2279
static const vint16 Iota(int start=0, int step=1)
Definition: simd.h:5973
friend vbool4 operator>=(const vint4 &a, const vint4 &b)
Definition: simd.h:4577
bool operator==(const matrix44 &m) const
Definition: simd.h:8224
simd_t & simd()
Definition: simd.h:1829
vfloat4 xyz1() const
Return xyz components, plus 1 for w.
Definition: simd.h:7125
const vfloat3 & operator=(float a)
Assign a single value to all components.
Definition: simd.h:2205
static const vfloat16 Iota(float start=0.0f, float step=1.0f)
Definition: simd.h:9373
OIIO_FORCEINLINE vint4 operator-(const vint4 &a)
Definition: simd.h:4374
friend vbool4 operator==(const vint4 &a, const vint4 &b)
Definition: simd.h:4546
friend std::ostream & operator<<(std::ostream &cout, const vfloat16 &val)
Stream output.
Definition: simd.h:9296
friend std::ostream & operator<<(std::ostream &cout, const vbool8 &a)
Stream output.
Definition: simd.h:3497
friend std::ostream & operator<<(std::ostream &cout, const vbool16 &a)
Stream output.
Definition: simd.h:3813
vfloat8(const unsigned char *vals)
Construct from a pointer to unsigned char values.
Definition: simd.h:2494
friend vbool16 operator==(const vint16 &a, const vint16 &b)
Definition: simd.h:6180
friend vint16 operator~(const vint16 &a)
Definition: simd.h:6132
int operator[](int i) const
Component access (get)
Definition: simd.h:3148
vint4 ifloor(const vfloat4 &a)
(int)floor
Definition: simd.h:7445
friend vbool4 operator==(const vbool4 &a, const vbool4 &b)
Comparison operators, component by component.
Definition: simd.h:3341
float dot(const vfloat4 &a, const vfloat4 &b)
Return the float dot (inner) product of a and b.
Definition: simd.h:7297
vbool8 bool8
Definition: simd.h:269
Definition: format.h:2459
void setcomp(int i, float value)
Component access (set).
void gather(const value_t *baseptr, const vint_t &vindex)
Load values from addresses (char*)basepatr + vindex[i]*scale.
friend vbool8 operator==(const vint8 &a, const vint8 &b)
Definition: simd.h:5383
OIIO_FORCEINLINE vint4 operator+(const vint4 &a, const vint4 &b)
Definition: simd.h:4361
OIIO_FORCEINLINE const vint4 & operator%=(vint4 &a, const vint4 &b)
Definition: simd.h:4447
value_t x() const
Definition: simd.h:4700
friend vbool4 operator>(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7075
vint4 AxBxCxDx(const vint4 &a, const vint4 &b, const vint4 &c, const vint4 &d)
Definition: simd.h:7848
vint8 hi() const
Extract the higher precision vint8.
Definition: simd.h:5995
friend vint8 operator%(const vint8 &a, const vint8 &b)
Definition: simd.h:5289
void setcomp(int i, bool value)
Component access (set).
Definition: simd.h:3163
vfloat4 xyz0() const
Return xyz components, plus 0 for w.
Definition: simd.h:7121
simd_bool_t< 4 >::type simd_t
the native SIMD type used
Definition: simd.h:464
simd_t m_simd
Definition: simd.h:2978
void setcomp(int i, int value)
Component access (set).
Definition: simd.h:5745
int reduce_add(const vint4 &v)
Definition: simd.h:4750
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:93
GLenum src
Definition: glcorearb.h:1793
friend std::ostream & operator<<(std::ostream &cout, const vfloat3 &val)
Stream output.
Definition: simd.h:7992
void store(float *values) const
Definition: simd.h:7921
friend const vint8 & operator<<=(vint8 &a, unsigned int bits)
Definition: simd.h:5355
static const char * type_name()
Definition: simd.h:892
friend vbool4 operator>=(const vfloat4 &a, const vfloat4 &b)
Definition: simd.h:7085
friend const vint8 & operator|=(vint8 &a, const vint8 &b)
Definition: simd.h:5322