HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fmath.h
Go to the documentation of this file.
1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio
4 
5 /*
6  A few bits here are based upon code from NVIDIA that was also released
7  under the same 3-Clause BSD license, and marked as:
8  Copyright 2004 NVIDIA Corporation. All Rights Reserved.
9 
10  Some parts of this file were first open-sourced in Open Shading Language,
11  also 3-Clause BSD license, then later moved here. The original copyright
12  notice was:
13  Copyright (c) 2009-2014 Sony Pictures Imageworks Inc., et al.
14 
15  Many of the math functions were copied from or inspired by other
16  public domain sources or open source packages with compatible licenses.
17  The individual functions give references where applicable.
18 */
19 
20 
21 // clang-format off
22 
23 /// \file
24 ///
25 /// A variety of floating-point math helper routines (and, slight
26 /// misnomer, some int stuff as well).
27 ///
28 
29 
30 #pragma once
31 #define OIIO_FMATH_H 1
32 
33 #include <algorithm>
34 #include <cmath>
35 #include <cstring>
36 #include <limits>
37 #include <typeinfo>
38 #include <type_traits>
39 
40 #include <OpenImageIO/Imath.h>
41 #include <OpenImageIO/span.h>
42 #include <OpenImageIO/dassert.h>
44 #include <OpenImageIO/platform.h>
45 #include <OpenImageIO/simd.h>
46 
47 
49 
50 /// Occasionally there may be a tradeoff where the best/fastest
51 /// implementation of a math function in an ordinary scalar context does
52 /// things that prevent autovectorization (or OMP pragma vectorization) of a
53 /// loop containing that operation, and the only way to make it SIMD
54 /// friendly slows it down as a scalar op.
55 ///
56 /// By default, we always prefer doing it as correctly and quickly as
57 /// possible in scalar mode, but if OIIO_FMATH_SIMD_FRIENDLY is defined to
58 /// be nonzero, we prefer ensuring that the SIMD loops are as efficient as
59 /// possible, even at the expense of scalar performance.
60 ///
61 /// Because everything here consists of inline functions, and it doesn't
62 /// really matter for OIIO internals themselves, this is primarily intended
63 /// for the sake of 3rd party apps that happen to have OIIO as a dependency
64 /// and use the fmath.h functions. Such a downstream dependency just needs
65 /// to ensure that OIIO_FMATH_SIMD_FRIENDLY is defined to 1 prior to
66 /// including fmath.h.
67 ///
68 #ifndef OIIO_FMATH_SIMD_FRIENDLY
69 # define OIIO_FMATH_SIMD_FRIENDLY 0
70 #endif
71 
72 
73 // Helper template to let us tell if two types are the same.
74 // C++11 defines this, keep in OIIO namespace for back compat.
75 // DEPRECATED(2.0) -- clients should switch OIIO::is_same -> std::is_same.
76 using std::is_same;
77 
78 
79 // For back compatibility: expose these in the OIIO namespace.
80 // DEPRECATED(2.0) -- clients should switch OIIO:: -> std:: for these.
81 using std::isfinite;
82 using std::isinf;
83 using std::isnan;
84 
85 
86 // Define math constants just in case they aren't included (Windows is a
87 // little finicky about this, only defining these if _USE_MATH_DEFINES is
88 // defined before <cmath> is included, which is hard to control).
89 #ifndef M_PI
90 # define M_PI 3.14159265358979323846264338327950288
91 #endif
92 #ifndef M_PI_2
93 # define M_PI_2 1.57079632679489661923132169163975144
94 #endif
95 #ifndef M_PI_4
96 # define M_PI_4 0.785398163397448309615660845819875721
97 #endif
98 #ifndef M_TWO_PI
99 # define M_TWO_PI (M_PI * 2.0)
100 #endif
101 #ifndef M_1_PI
102 # define M_1_PI 0.318309886183790671537767526745028724
103 #endif
104 #ifndef M_2_PI
105 # define M_2_PI 0.636619772367581343075535053490057448
106 #endif
107 #ifndef M_SQRT2
108 # define M_SQRT2 1.41421356237309504880168872420969808
109 #endif
110 #ifndef M_SQRT1_2
111 # define M_SQRT1_2 0.707106781186547524400844362104849039
112 #endif
113 #ifndef M_LN2
114 # define M_LN2 0.69314718055994530941723212145817656
115 #endif
116 #ifndef M_LN10
117 # define M_LN10 2.30258509299404568401799145468436421
118 #endif
119 #ifndef M_E
120 # define M_E 2.71828182845904523536028747135266250
121 #endif
122 #ifndef M_LOG2E
123 # define M_LOG2E 1.44269504088896340735992468100189214
124 #endif
125 
126 
127 
128 ////////////////////////////////////////////////////////////////////////////
129 ////////////////////////////////////////////////////////////////////////////
130 //
131 // INTEGER HELPER FUNCTIONS
132 //
133 // A variety of handy functions that operate on integers.
134 //
135 
136 /// Quick test for whether an integer is a power of 2.
137 ///
138 template<typename T>
140 ispow2(T x) noexcept
141 {
142  // Numerous references for this bit trick are on the web. The
143  // principle is that x is a power of 2 <=> x == 1<<b <=> x-1 is
144  // all 1 bits for bits < b.
145  return (x & (x - 1)) == 0 && (x >= 0);
146 }
147 
148 
149 
150 /// Round up to next higher power of 2 (return x if it's already a power
151 /// of 2).
153 ceil2(int x) noexcept
154 {
155  // Here's a version with no loops.
156  if (x < 0)
157  return 0;
158  // Subtract 1, then round up to a power of 2, that way if we are
159  // already a power of 2, we end up with the same number.
160  --x;
161  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
162  x |= x >> 1;
163  x |= x >> 2;
164  x |= x >> 4;
165  x |= x >> 8;
166  x |= x >> 16;
167  // Now we have 2^n-1, by adding 1, we make it a power of 2 again
168  return x + 1;
169 }
170 
171 
172 
173 /// Round down to next lower power of 2 (return x if it's already a power
174 /// of 2).
176 floor2(int x) noexcept
177 {
178  // Make all bits past the first 1 also be 1, i.e. 0001xxxx -> 00011111
179  x |= x >> 1;
180  x |= x >> 2;
181  x |= x >> 4;
182  x |= x >> 8;
183  x |= x >> 16;
184  // Strip off all but the high bit, i.e. 00011111 -> 00010000
185  // That's the power of two <= the original x
186  return x & ~(x >> 1);
187 }
188 
189 
190 // Old names -- DEPRECATED(2.1)
191 inline OIIO_HOSTDEVICE int pow2roundup(int x) { return ceil2(x); }
192 inline OIIO_HOSTDEVICE int pow2rounddown(int x) { return floor2(x); }
193 
194 
195 
196 /// Round value up to the next whole multiple. For example,
197 /// `round_to_multiple(10,10) == 10`, `round_to_multiple(17,10) == 20`, and
198 /// `round_to_multiple(-17,10) == -10`.
199 template <typename V, typename M>
200 inline OIIO_HOSTDEVICE V round_to_multiple (V value, M multiple)
201 {
202  if (value >= 0)
203  value += V(multiple) - 1;
204  return value - (value % V(multiple));
205 }
206 
207 
208 
209 /// Round up to the next whole multiple of m, for the special case where
210 /// m is definitely a power of 2 (somewhat simpler than the more general
211 /// round_to_multiple). This is a template that should work for any
212 /// integer type.
213 template<typename T>
214 inline OIIO_HOSTDEVICE T
216 {
217  OIIO_DASSERT(ispow2(m));
218  return (x + m - 1) & (~(m - 1));
219 }
220 
221 
222 
223 /// Round `value` down to a whole multiple of `multiple`. For example,
224 /// `round_down_to_multiple(10,10) == 10`,
225 /// `round_down_to_multiple(17,10) == 10`, and
226 /// `round_down_to_multiple(-17,10) == -20`.
227 template <typename V, typename M>
229 {
230  if (value < 0)
231  value -= V(multiple - 1);
232  return value - (value % V(multiple));
233 }
234 
235 
236 
237 /// Multiply two unsigned 32-bit ints safely, carefully checking for
238 /// overflow, and clamping to uint32_t's maximum value.
239 inline OIIO_HOSTDEVICE uint32_t
240 clamped_mult32(uint32_t a, uint32_t b)
241 {
242  const uint32_t Err = std::numeric_limits<uint32_t>::max();
243  uint64_t r = (uint64_t)a * (uint64_t)b; // Multiply into a bigger int
244  return r < Err ? (uint32_t)r : Err;
245 }
246 
247 
248 
249 /// Multiply two unsigned 64-bit ints safely, carefully checking for
250 /// overflow, and clamping to uint64_t's maximum value.
251 inline OIIO_HOSTDEVICE uint64_t
252 clamped_mult64(uint64_t a, uint64_t b)
253 {
254  uint64_t ab = a * b;
255  if (b && ab / b != a)
257  else
258  return ab;
259 }
260 
261 
262 
263 /// Bitwise circular rotation left by `s` bits (for any unsigned integer
264 /// type). For info on the C++20 std::rotl(), see
265 /// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0553r4.html
266 // FIXME: this should be constexpr, but we're leaving that out for now
267 // because the Cuda specialization uses an intrinsic that isn't constexpr.
268 // Come back to this later when more of the Cuda library is properly
269 // constexpr.
270 template<class T>
272 // constexpr
273 T rotl(T x, int s) noexcept
274 {
276  "rotl only works for unsigned integer types");
277  return (x << s) | (x >> ((sizeof(T) * 8) - s));
278 }
279 
280 
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
// Cuda has an intrinsic for 32 bit unsigned int rotation
// FIXME: This should be constexpr, but __funnelshift_lc seems not to be
// marked as such.
template<>
OIIO_HOSTDEVICE OIIO_FORCEINLINE
// constexpr
uint32_t rotl(uint32_t x, int s) noexcept
{
    // Funnel-shift left of (x:x) by s is exactly a 32-bit rotate left.
    return __funnelshift_lc(x, x, s);
}
#endif
293 
294 
295 
296 // Old names -- DEPRECATED(2.1)
298 rotl32(uint32_t x, int k)
299 {
300 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
301  return __funnelshift_lc(x, x, k);
302 #else
303  return (x << k) | (x >> (32 - k));
304 #endif
305 }
306 
308 rotl64(uint64_t x, int k)
309 {
310  return (x << k) | (x >> (64 - k));
311 }
312 
313 
314 
315 // safe_mod is like integer a%b, but safely returns 0 when b==0.
316 template <class T>
318 safe_mod(T a, T b)
319 {
320  return b ? (a % b) : T(0);
321 }
322 
323 // (end of integer helper functions)
324 ////////////////////////////////////////////////////////////////////////////
325 
326 
327 
328 ////////////////////////////////////////////////////////////////////////////
329 ////////////////////////////////////////////////////////////////////////////
330 //
331 // FLOAT UTILITY FUNCTIONS
332 //
333 // Helper/utility functions: clamps, blends, interpolations...
334 //
335 
336 
337 /// clamp a to bounds [low,high].
338 template <class T>
340 clamp (const T& a, const T& low, const T& high)
341 {
342 #if 1
343  // This looks clunky, but it generates minimal code. For float, it
344  // should result in just a max and min instruction, thats it.
345  // This implementation is courtesy of Alex Wells, Intel, via OSL.
346  T val = a;
347  if (!(low <= val)) // Forces clamp(NaN,low,high) to return low
348  val = low;
349  if (val > high)
350  val = high;
351  return val;
352 #else
353  // The naive implementation we originally had, below, only does the 2nd
354  // comparison in the else block, which will generate extra code in a
355  // SIMD masking scenario. I (LG) can confirm that benchmarks show that
356  // the above implementation is indeed faster than the one below, even
357  // for non-SIMD use.
358  return (a >= low) ? ((a <= high) ? a : high) : low;
359 #endif
360 }
361 
362 
363 #ifndef __CUDA_ARCH__
364 // Specialization of clamp for vfloat4
366 clamp (const simd::vfloat4& a, const simd::vfloat4& low, const simd::vfloat4& high)
367 {
368  return simd::min (high, simd::max (low, a));
369 }
370 
372 clamp (const simd::vfloat8& a, const simd::vfloat8& low, const simd::vfloat8& high)
373 {
374  return simd::min (high, simd::max (low, a));
375 }
376 
378 clamp (const simd::vfloat16& a, const simd::vfloat16& low, const simd::vfloat16& high)
379 {
380  return simd::min (high, simd::max (low, a));
381 }
382 
383 // Specialization of clamp for vint4
384 template<> OIIO_FORCEINLINE simd::vint4
385 clamp (const simd::vint4& a, const simd::vint4& low, const simd::vint4& high)
386 {
387  return simd::min (high, simd::max (low, a));
388 }
389 
390 template<> OIIO_FORCEINLINE simd::vint8
391 clamp (const simd::vint8& a, const simd::vint8& low, const simd::vint8& high)
392 {
393  return simd::min (high, simd::max (low, a));
394 }
395 
397 clamp (const simd::vint16& a, const simd::vint16& low, const simd::vint16& high)
398 {
399  return simd::min (high, simd::max (low, a));
400 }
401 #endif
402 
403 
404 
405 // For the multiply+add (or sub) operations below, note that the results may
406 // differ slightly on different hardware, depending on whether true fused
407 // multiply and add is available or if the code generated just does an old
408 // fashioned multiply followed by a separate add. So please interpret these
409 // as "do a multiply and add as fast as possible for this hardware, with at
410 // least as much precision as a multiply followed by a separate add."
411 
/// Fused multiply and add: (a*b + c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd (float a, float b, float c) {
    // NOTE: GCC/ICC will turn this (for float) into a FMA unless
    // explicitly asked not to, clang will do so if -ffp-contract=fast.
    // Keeping this as a single expression is deliberate: splitting it
    // into separate statements could prevent the contraction.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b + c;
}
419 
420 
/// Fused multiply and subtract: (a*b - c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub (float a, float b, float c) {
    // Allow the compiler to contract the multiply and subtract into a
    // single FMS-style instruction where the hardware supports one.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return a * b - c;
}
426 
427 
428 
/// Fused negative multiply and add: -(a*b) + c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd (float a, float b, float c) {
    // Written as c - (a*b) so the compiler can contract it into a single
    // negated-FMA instruction where available.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return c - (a * b);
}
434 
435 
436 
/// Negative fused multiply and subtract: -(a*b) - c
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub (float a, float b, float c) {
    // Allow contraction into a single negated-FMS instruction where
    // the hardware supports one.
    OIIO_CLANG_PRAGMA(clang fp contract(fast))
    return -(a * b) - c;
}
442 
443 
444 
445 /// Linearly interpolate values v0-v1 at x: v0*(1-x) + v1*x.
446 /// This is a template, and so should work for any types.
447 template <class T, class Q>
449 lerp (const T& v0, const T& v1, const Q& x)
450 {
451  // NOTE: a*(1-x) + b*x is much more numerically stable than a+x*(b-a)
452  return v0*(Q(1)-x) + v1*x;
453 }
454 
455 
456 
457 /// Bilinearly interoplate values v0-v3 (v0 upper left, v1 upper right,
458 /// v2 lower left, v3 lower right) at coordinates (s,t) and return the
459 /// result. This is a template, and so should work for any types.
460 template <class T, class Q>
462 bilerp(const T& v0, const T& v1, const T& v2, const T& v3, const Q& s, const Q& t)
463 {
464  // NOTE: a*(t-1) + b*t is much more numerically stable than a+t*(b-a)
465  Q s1 = Q(1) - s;
466  return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
467 }
468 
469 
470 
471 /// Bilinearly interoplate arrays of values v0-v3 (v0 upper left, v1
472 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
473 /// storing the results in 'result'. These are all vectors, so do it
474 /// for each of 'n' contiguous values (using the same s,t interpolants).
475 template <class T, class Q>
476 inline OIIO_HOSTDEVICE void
477 bilerp (const T *v0, const T *v1,
478  const T *v2, const T *v3,
479  Q s, Q t, int n, T *result)
480 {
481  Q s1 = Q(1) - s;
482  Q t1 = Q(1) - t;
483  for (int i = 0; i < n; ++i)
484  result[i] = T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
485 }
486 
487 
488 
489 /// Bilinearly interoplate arrays of values v0-v3 (v0 upper left, v1
490 /// upper right, v2 lower left, v3 lower right) at coordinates (s,t),
491 /// SCALING the interpolated value by 'scale' and then ADDING to
492 /// 'result'. These are all vectors, so do it for each of 'n'
493 /// contiguous values (using the same s,t interpolants).
494 template <class T, class Q>
495 inline OIIO_HOSTDEVICE void
496 bilerp_mad (const T *v0, const T *v1,
497  const T *v2, const T *v3,
498  Q s, Q t, Q scale, int n, T *result)
499 {
500  Q s1 = Q(1) - s;
501  Q t1 = Q(1) - t;
502  for (int i = 0; i < n; ++i)
503  result[i] += T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
504  t*(v2[i]*s1 + v3[i]*s)));
505 }
506 
507 
508 
509 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
510 /// upper right top, ...) at coordinates (s,t,r), and return the
511 /// result. This is a template, and so should work for any types.
512 template <class T, class Q>
514 trilerp (T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
515 {
516  // NOTE: a*(t-1) + b*t is much more numerically stable than a+t*(b-a)
517  Q s1 = Q(1) - s;
518  Q t1 = Q(1) - t;
519  Q r1 = Q(1) - r;
520  return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
521  r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
522 }
523 
524 
525 
526 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
527 /// upper right top, ...) at coordinates (s,t,r),
528 /// storing the results in 'result'. These are all vectors, so do it
529 /// for each of 'n' contiguous values (using the same s,t,r interpolants).
530 template <class T, class Q>
531 inline OIIO_HOSTDEVICE void
532 trilerp (const T *v0, const T *v1, const T *v2, const T *v3,
533  const T *v4, const T *v5, const T *v6, const T *v7,
534  Q s, Q t, Q r, int n, T *result)
535 {
536  Q s1 = Q(1) - s;
537  Q t1 = Q(1) - t;
538  Q r1 = Q(1) - r;
539  for (int i = 0; i < n; ++i)
540  result[i] = T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
541  r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
542 }
543 
544 
545 
546 /// Trilinearly interoplate arrays of values v0-v7 (v0 upper left top, v1
547 /// upper right top, ...) at coordinates (s,t,r),
548 /// SCALING the interpolated value by 'scale' and then ADDING to
549 /// 'result'. These are all vectors, so do it for each of 'n'
550 /// contiguous values (using the same s,t,r interpolants).
551 template <class T, class Q>
552 inline OIIO_HOSTDEVICE void
553 trilerp_mad (const T *v0, const T *v1, const T *v2, const T *v3,
554  const T *v4, const T *v5, const T *v6, const T *v7,
555  Q s, Q t, Q r, Q scale, int n, T *result)
556 {
557  Q r1 = Q(1) - r;
558  bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
559  bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
560 }
561 
562 
563 
/// Evaluate B-spline weights in w[4] for the given fraction. This
/// is an important component of performing a cubic interpolation.
/// The four weights are the uniform cubic B-spline basis functions
/// evaluated at `fraction` (expected in [0,1]).
template <typename T>
inline OIIO_HOSTDEVICE void evalBSplineWeights (T w[4], T fraction)
{
    T one_frac = 1 - fraction;
    // Standard uniform cubic B-spline basis, factored to minimize the
    // number of multiplies. w[2] mirrors w[1] with (1-fraction).
    w[0] = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
    w[1] = T(2.0 / 3.0) - T(0.5) * fraction * fraction * (2 - fraction);
    w[2] = T(2.0 / 3.0) - T(0.5) * one_frac * one_frac * (2 - one_frac);
    w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
}
575 
576 
/// Evaluate B-spline derivative weights in dw[4] for the given
/// fraction. This is an important component of performing a cubic
/// interpolation with derivatives. These are the derivatives (with
/// respect to fraction) of the weights from evalBSplineWeights.
template <typename T>
inline OIIO_HOSTDEVICE void evalBSplineWeightDerivs (T dw[4], T fraction)
{
    T one_frac = 1 - fraction;
    // dw[2] mirrors dw[1] with (1-fraction), with the sign flipped by
    // the chain rule.
    dw[0] = -T(0.5) * one_frac * one_frac;
    dw[1] =  T(0.5) * fraction * (3 * fraction - 4);
    dw[2] = -T(0.5) * one_frac * (3 * one_frac - 4);
    dw[3] =  T(0.5) * fraction * fraction;
}
589 
590 
591 
592 /// Bicubically interoplate arrays of pointers arranged in a 4x4 pattern
593 /// with val[0] pointing to the data in the upper left corner, val[15]
594 /// pointing to the lower right) at coordinates (s,t), storing the
595 /// results in 'result'. These are all vectors, so do it for each of
596 /// 'n' contiguous values (using the same s,t interpolants).
597 template <class T>
598 inline OIIO_HOSTDEVICE void
599 bicubic_interp (const T **val, T s, T t, int n, T *result)
600 {
601  for (int c = 0; c < n; ++c)
602  result[c] = T(0);
603  T wx[4]; evalBSplineWeights (wx, s);
604  T wy[4]; evalBSplineWeights (wy, t);
605  for (int j = 0; j < 4; ++j) {
606  for (int i = 0; i < 4; ++i) {
607  T w = wx[i] * wy[j];
608  for (int c = 0; c < n; ++c)
609  result[c] += w * val[j*4+i][c];
610  }
611  }
612 }
613 
614 
615 
616 /// Return floor(x) cast to an int.
618 ifloor (float x)
619 {
620  return (int)floorf(x);
621 }
622 
623 
624 
/// Return (x-floor(x)) and put (int)floor(x) in *xint. This is similar
/// to the built-in modf, but returns a true int, always rounds down
/// (compared to modf which rounds toward 0), and always returns
/// frac >= 0 (compared to modf which can return <0 if x<0).
inline OIIO_HOSTDEVICE float
floorfrac (float x, int *xint)
{
#if 1
    float f = std::floor(x);
    // f is already an integral value, so int(f)'s truncation is exact
    // (within int range).
    *xint = int(f);
    return x - f;
#else /* vvv This idiom is slower */
    int i = ifloor(x);
    *xint = i;
    return x - static_cast<float>(i); // Return the fraction left over
#endif
}
642 
643 
644 #ifndef __CUDA_ARCH__
647  *xint = simd::vint4(f);
648  return x - f;
649 }
650 
653  *xint = simd::vint8(f);
654  return x - f;
655 }
656 
659  *xint = simd::vint16(f);
660  return x - f;
661 }
662 #endif
663 
664 
665 
666 
/// Convert degrees to radians. The conversion factor is computed in
/// double precision, then cast to T.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians (T deg) { return deg * T(M_PI / 180.0); }
670 
/// Convert radians to degrees. The conversion factor is computed in
/// double precision, then cast to T.
template <typename T>
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees (T rad) { return rad * T(180.0 / M_PI); }
674 
675 
676 /// Faster floating point negation, in cases where you aren't too picky
677 /// about the difference between +0 and -0. (Courtesy of Alex Wells, Intel,
678 /// via code in OSL.)
679 ///
680 /// Beware: fast_neg(0.0f) returns 0.0f, NOT -0.0f. All other values,
681 /// including -0.0 and +/- Inf, work identically to `-x`. For many use
682 /// cases, that's fine. (When was the last time you wanted a -0.0f anyway?)
683 ///
684 /// The reason it's faster is that `-x` (for float x) is implemented by
685 /// compilers by xor'ing x against a bitmask that flips the sign bit, and
686 /// that bitmask had to come from somewhere -- memory! -- which can be
687 /// expensive, depending on whether/where it is in cache. But `0 - x`
688 /// generates a zero in a register (with xor, no memory access needed) and
689 /// then subtracts, so that's sometimes faster because there is no memory
690 /// read. This works for SIMD types as well!
691 ///
692 /// It's also safe (though pointless) to use fast_neg for integer types,
693 /// where both `-x` and `0-x` are implemented as a `neg` instruction.
694 template <typename T>
696 fast_neg(const T &x) {
697  return T(0) - x;
698 }
699 
700 
701 
/// Compute sin(x) and cos(x) together, storing them in *sine and
/// *cosine, using a combined platform routine where one is available.
inline OIIO_HOSTDEVICE void
sincos (float x, float* sine, float* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    // gcc on Linux provides a builtin that computes both in one call.
    __builtin_sincosf(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // Explicitly select the single-precision CUDA library function
    sincosf(x, sine, cosine);
#else
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}
715 
/// Double-precision overload of sincos: compute sin(x) and cos(x)
/// together, storing them in *sine and *cosine.
inline OIIO_HOSTDEVICE void
sincos (double x, double* sine, double* cosine)
{
#if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
    __builtin_sincos(x, sine, cosine);
#elif defined(__CUDA_ARCH__)
    // The explicit global-namespace qualification resolves to the CUDA
    // library function and avoids infinite recursion into this overload.
    ::sincos(x, sine, cosine);
#else
    *sine = std::sin(x);
    *cosine = std::cos(x);
#endif
}
730 
731 
732 inline OIIO_HOSTDEVICE float sign (float x)
733 {
734  return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
735 }
736 
737 
738 
739 
740 // (end of float helper functions)
741 ////////////////////////////////////////////////////////////////////////////
742 
743 
744 
745 ////////////////////////////////////////////////////////////////////////////
746 ////////////////////////////////////////////////////////////////////////////
747 //
748 // CONVERSION
749 //
750 // Type and range conversion helper functions and classes.
751 
752 
753 template <typename IN_TYPE, typename OUT_TYPE>
754 OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast (const IN_TYPE& in) {
755  // NOTE: this is the only standards compliant way of doing this type of casting,
756  // luckily the compilers we care about know how to optimize away this idiom.
757  static_assert(sizeof(IN_TYPE) == sizeof(OUT_TYPE),
758  "bit_cast must be between objects of the same size");
759  OUT_TYPE out;
760  memcpy ((void *)&out, &in, sizeof(IN_TYPE));
761  return out;
762 }
763 
#if defined(__INTEL_COMPILER)
    // On x86/x86_64 for certain compilers we can use Intel CPU intrinsics
    // for some common bit_cast cases that might be even more understandable
    // to the compiler and generate better code without its getting confused
    // about the memcpy in the general case.
    // FIXME: The intrinsics are not in clang <= 9 nor gcc <= 9.1. Check
    // future releases.
    // 32-bit float <-> int reinterpretations:
    template<> OIIO_FORCEINLINE uint32_t bit_cast<float, uint32_t>(const float& val) {
        return static_cast<uint32_t>(_castf32_u32(val));
    }
    template<> OIIO_FORCEINLINE int32_t bit_cast<float, int32_t>(const float& val) {
        return static_cast<int32_t>(_castf32_u32(val));
    }
    template<> OIIO_FORCEINLINE float bit_cast<uint32_t, float>(const uint32_t& val) {
        return _castu32_f32(val);
    }
    template<> OIIO_FORCEINLINE float bit_cast<int32_t, float>(const int32_t& val) {
        // int32_t implicitly converts to the intrinsic's unsigned argument.
        return _castu32_f32(val);
    }
    // 64-bit double <-> int reinterpretations:
    template<> OIIO_FORCEINLINE uint64_t bit_cast<double, uint64_t>(const double& val) {
        return static_cast<uint64_t>(_castf64_u64(val));
    }
    template<> OIIO_FORCEINLINE int64_t bit_cast<double, int64_t>(const double& val) {
        return static_cast<int64_t>(_castf64_u64(val));
    }
    template<> OIIO_FORCEINLINE double bit_cast<uint64_t, double>(const uint64_t& val) {
        return _castu64_f64(val);
    }
    template<> OIIO_FORCEINLINE double bit_cast<int64_t, double>(const int64_t& val) {
        return _castu64_f64(val);
    }
#endif
796 
797 
799  return bit_cast<float,int>(x);
800 }
802  return bit_cast<int,float>(x);
803 }
804 
805 
806 
/// Change endian-ness of one or more data items that are each 2, 4,
/// or 8 bytes. This should work for any of short, unsigned short, int,
/// unsigned int, float, long long, pointers. Items whose size is not
/// 2, 4, or 8 bytes are left unchanged.
template<class T>
inline OIIO_HOSTDEVICE void
swap_endian (T *f, int len=1)
{
    // Reverse the bytes of each of the `len` items in place. The
    // sizeof(T) comparisons are compile-time constants, so the dead
    // branches are eliminated by the compiler.
    for (char *c = (char *) f; len--; c += sizeof(T)) {
        if (sizeof(T) == 2) {
            std::swap (c[0], c[1]);
        } else if (sizeof(T) == 4) {
            std::swap (c[0], c[3]);
            std::swap (c[1], c[2]);
        } else if (sizeof(T) == 8) {
            std::swap (c[0], c[7]);
            std::swap (c[1], c[6]);
            std::swap (c[2], c[5]);
            std::swap (c[3], c[4]);
        }
    }
}
828 
829 
#if (OIIO_GNUC_VERSION || OIIO_CLANG_VERSION || OIIO_APPLE_CLANG_VERSION || OIIO_INTEL_COMPILER_VERSION) && !defined(__CUDACC__)
// CPU gcc and compatible can use these intrinsics, 8-15x faster

template<> inline void swap_endian(uint16_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = __builtin_bswap16(f[i]);
}

template<> inline void swap_endian(uint32_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = __builtin_bswap32(f[i]);
}

template<> inline void swap_endian(uint64_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = __builtin_bswap64(f[i]);
}

// Signed variants: the byte swap is identical to the unsigned case.
template<> inline void swap_endian(int16_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = __builtin_bswap16(f[i]);
}

template<> inline void swap_endian(int32_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = __builtin_bswap32(f[i]);
}

template<> inline void swap_endian(int64_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = __builtin_bswap64(f[i]);
}

// Floating-point variants: swap the bytes through same-sized integer
// pointers; only the byte order changes, not the bit patterns.
template<> inline void swap_endian(float* f, int len) {
    swap_endian((uint32_t*)f, len);
}

template<> inline void swap_endian(double* f, int len) {
    swap_endian((uint64_t*)f, len);
}

#elif defined(_MSC_VER) && !defined(__CUDACC__)
// CPU MSVS can use these intrinsics

template<> inline void swap_endian(uint16_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = _byteswap_ushort(f[i]);
}

template<> inline void swap_endian(uint32_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = _byteswap_ulong(f[i]);
}

template<> inline void swap_endian(uint64_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = _byteswap_uint64(f[i]);
}

template<> inline void swap_endian(int16_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = _byteswap_ushort(f[i]);
}

template<> inline void swap_endian(int32_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = _byteswap_ulong(f[i]);
}

template<> inline void swap_endian(int64_t* f, int len) {
    for (int i = 0; i < len; ++i)
        f[i] = _byteswap_uint64(f[i]);
}
#endif
904 
905 
906 
// big_enough_float<T>::float_t is a floating-point type big enough to
// handle the range and precision of a <T>. It's a float, unless T is big.
template <typename T> struct big_enough_float    { typedef float float_t; };
// 32/64-bit integer types and double get double precision, since float's
// 24-bit mantissa cannot represent all their values.
template<> struct big_enough_float<int>          { typedef double float_t; };
template<> struct big_enough_float<unsigned int> { typedef double float_t; };
template<> struct big_enough_float<int64_t>      { typedef double float_t; };
template<> struct big_enough_float<uint64_t>     { typedef double float_t; };
template<> struct big_enough_float<double>       { typedef double float_t; };
915 
916 
917 /// Multiply src by scale, clamp to [min,max], and round to the nearest D
918 /// (presumed to be integer). This is just a helper for the convert_type
919 /// templates, it probably has no other use.
920 template<typename S, typename D, typename F>
921 inline OIIO_HOSTDEVICE D
922 scaled_conversion(const S& src, F scale, F min, F max)
923 {
925  F s = src * scale;
926  s += (s < 0 ? (F)-0.5 : (F)0.5);
927  return (D)clamp(s, min, max);
928  } else {
929  return (D)clamp((F)src * scale + (F)0.5, min, max);
930  }
931 }
932 
933 
934 
935 /// Convert n consecutive values from the type of S to the type of D.
936 /// The conversion is not a simple cast, but correctly remaps the
937 /// 0.0->1.0 range from and to the full positive range of integral
938 /// types. Take a memcpy shortcut if both types are the same and no
939 /// conversion is necessary. Optional arguments can give nonstandard
940 /// quantizations.
941 //
942 // FIXME: make table-based specializations for common types with only a
943 // few possible src values (like unsigned char -> float).
944 template<typename S, typename D>
945 void convert_type (const S *src, D *dst, size_t n, D _min, D _max)
946 {
948  // They must be the same type. Just memcpy.
949  memcpy (dst, src, n*sizeof(D));
950  return;
951  }
952  typedef typename big_enough_float<D>::float_t F;
954  (F(1)) / F(std::numeric_limits<S>::max()) : F(1);
956  // Converting to an integer-like type.
957  F min = (F)_min; // std::numeric_limits<D>::min();
958  F max = (F)_max; // std::numeric_limits<D>::max();
959  scale *= _max;
960  // Unroll loop for speed
961  for ( ; n >= 16; n -= 16) {
962  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
963  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
964  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
965  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
966  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
967  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
968  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
969  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
970  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
971  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
972  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
973  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
974  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
975  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
976  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
977  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
978  }
979  while (n--)
980  *dst++ = scaled_conversion<S,D,F> (*src++, scale, min, max);
981  } else {
982  // Converting to a float-like type, so we don't need to remap
983  // the range
984  // Unroll loop for speed
985  for ( ; n >= 16; n -= 16) {
986  *dst++ = (D)((*src++) * scale);
987  *dst++ = (D)((*src++) * scale);
988  *dst++ = (D)((*src++) * scale);
989  *dst++ = (D)((*src++) * scale);
990  *dst++ = (D)((*src++) * scale);
991  *dst++ = (D)((*src++) * scale);
992  *dst++ = (D)((*src++) * scale);
993  *dst++ = (D)((*src++) * scale);
994  *dst++ = (D)((*src++) * scale);
995  *dst++ = (D)((*src++) * scale);
996  *dst++ = (D)((*src++) * scale);
997  *dst++ = (D)((*src++) * scale);
998  *dst++ = (D)((*src++) * scale);
999  *dst++ = (D)((*src++) * scale);
1000  *dst++ = (D)((*src++) * scale);
1001  *dst++ = (D)((*src++) * scale);
1002  }
1003  while (n--)
1004  *dst++ = (D)((*src++) * scale);
1005  }
1006 }
1007 
1008 
1009 #ifndef __CUDA_ARCH__
1010 template<>
1011 inline void convert_type<uint8_t,float> (const uint8_t *src,
1012  float *dst, size_t n,
1013  float /*_min*/, float /*_max*/)
1014 {
1016  simd::vfloat4 scale_simd (scale);
1017  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1018  simd::vfloat4 s_simd (src);
1019  simd::vfloat4 d_simd = s_simd * scale_simd;
1020  d_simd.store (dst);
1021  }
1022  while (n--)
1023  *dst++ = (*src++) * scale;
1024 }
1025 
1026 
1027 
1028 template<>
1029 inline void convert_type<uint16_t,float> (const uint16_t *src,
1030  float *dst, size_t n,
1031  float /*_min*/, float /*_max*/)
1032 {
1034  simd::vfloat4 scale_simd (scale);
1035  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1036  simd::vfloat4 s_simd (src);
1037  simd::vfloat4 d_simd = s_simd * scale_simd;
1038  d_simd.store (dst);
1039  }
1040  while (n--)
1041  *dst++ = (*src++) * scale;
1042 }
1043 
1044 
1045 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1046 template<>
1047 inline void convert_type<half,float> (const half *src,
1048  float *dst, size_t n,
1049  float /*_min*/, float /*_max*/)
1050 {
1051 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1052  // If f16c ops are enabled, it's worth doing this by 8's
1053  for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1054  simd::vfloat8 s_simd (src);
1055  s_simd.store (dst);
1056  }
1057 #endif
1058 #if OIIO_SIMD >= 4
1059  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1060  simd::vfloat4 s_simd (src);
1061  s_simd.store (dst);
1062  }
1063 #endif
1064  while (n--)
1065  *dst++ = (*src++);
1066 }
1067 #endif
1068 
1069 
1070 
1071 template<>
1072 inline void
1073 convert_type<float,uint16_t> (const float *src, uint16_t *dst, size_t n,
1074  uint16_t /*_min*/, uint16_t /*_max*/)
1075 {
1078  float scale = max;
1079  simd::vfloat4 max_simd (max);
1080  simd::vfloat4 zero_simd (0.0f);
1081  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1082  simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
1083  simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
1084  simd::vint4 i (clamped);
1085  i.store (dst);
1086  }
1087  while (n--)
1088  *dst++ = scaled_conversion<float,uint16_t,float> (*src++, scale, min, max);
1089 }
1090 
1091 
1092 template<>
1093 inline void
1094 convert_type<float,uint8_t> (const float *src, uint8_t *dst, size_t n,
1095  uint8_t /*_min*/, uint8_t /*_max*/)
1096 {
1099  float scale = max;
1100  simd::vfloat4 max_simd (max);
1101  simd::vfloat4 zero_simd (0.0f);
1102  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1103  simd::vfloat4 scaled = simd::round (simd::vfloat4(src) * max_simd);
1104  simd::vfloat4 clamped = clamp (scaled, zero_simd, max_simd);
1105  simd::vint4 i (clamped);
1106  i.store (dst);
1107  }
1108  while (n--)
1109  *dst++ = scaled_conversion<float,uint8_t,float> (*src++, scale, min, max);
1110 }
1111 
1112 
1113 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1114 template<>
1115 inline void
1116 convert_type<float,half> (const float *src, half *dst, size_t n,
1117  half /*_min*/, half /*_max*/)
1118 {
1119 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1120  // If f16c ops are enabled, it's worth doing this by 8's
1121  for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1122  simd::vfloat8 s (src);
1123  s.store (dst);
1124  }
1125 #endif
1126 #if OIIO_SIMD >= 4
1127  for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1128  simd::vfloat4 s (src);
1129  s.store (dst);
1130  }
1131 #endif
1132  while (n--)
1133  *dst++ = *src++;
1134 }
1135 #endif
1136 #endif
1137 
1138 
1139 
template<typename S, typename D>
inline void convert_type (const S *src, D *dst, size_t n)
{
    // Default quantization: use the full positive range of D.
    convert_type<S,D> (src, dst, n,
                       std::numeric_limits<D>::min(),
                       std::numeric_limits<D>::max());
}
1147 
1148 
1149 
1150 
1151 /// Convert a single value from the type of S to the type of D.
1152 /// The conversion is not a simple cast, but correctly remaps the
1153 /// 0.0->1.0 range from and to the full positive range of integral
1154 /// types. Take a copy shortcut if both types are the same and no
1155 /// conversion is necessary.
1156 template<typename S, typename D>
1157 inline D
1158 convert_type (const S &src)
1159 {
1161  // They must be the same type. Just return it.
1162  return (D)src;
1163  }
1164  typedef typename big_enough_float<D>::float_t F;
1166  F(1) / F(std::numeric_limits<S>::max()) : F(1);
1168  // Converting to an integer-like type.
1169  F min = (F) std::numeric_limits<D>::min();
1170  F max = (F) std::numeric_limits<D>::max();
1171  scale *= max;
1172  return scaled_conversion<S,D,F> (src, scale, min, max);
1173  } else {
1174  // Converting to a float-like type, so we don't need to remap
1175  // the range
1176  return (D)((F)src * scale);
1177  }
1178 }
1179 
1180 
1181 
1182 /// Helper function to convert channel values between different bit depths.
1183 /// Roughly equivalent to:
1184 ///
1185 /// out = round (in * (pow (2, TO_BITS) - 1) / (pow (2, FROM_BITS) - 1));
1186 ///
1187 /// but utilizes an integer math trick for speed. It can be proven that the
1188 /// absolute error of this method is less or equal to 1, with an average error
1189 /// (with respect to the entire domain) below 0.2.
1190 ///
1191 /// It is assumed that the original value is a valid FROM_BITS integer, i.e.
1192 /// shifted fully to the right.
1193 template<unsigned int FROM_BITS, unsigned int TO_BITS>
1194 inline OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in) {
1195  unsigned int out = 0;
1196  int shift = TO_BITS - FROM_BITS;
1197  for (; shift > 0; shift -= FROM_BITS)
1198  out |= in << shift;
1199  out |= in >> -shift;
1200  return out;
1201 }
1202 
1203 
1204 
1205 // non-templated version. Slow but general
1206 inline OIIO_HOSTDEVICE unsigned int
1207 bit_range_convert(unsigned int in, unsigned int FROM_BITS, unsigned int TO_BITS)
1208 {
1209  unsigned int out = 0;
1210  int shift = TO_BITS - FROM_BITS;
1211  for (; shift > 0; shift -= FROM_BITS)
1212  out |= in << shift;
1213  out |= in >> -shift;
1214  return out;
1215 }
1216 
1217 
1218 
1219 /// Append the `n` LSB bits of `val` into a bit sting `T out[]`, where the
1220 /// `filled` MSB bits of `*out` are already filled in. Increment `out` and
1221 /// adjust `filled` as required. Type `T` should be uint8_t, uint16_t, or
1222 /// uint32_t.
1223 template<typename T>
1224 inline void
1225 bitstring_add_n_bits (T* &out, int& filled, uint32_t val, int n)
1226 {
1227  static_assert(std::is_same<T,uint8_t>::value ||
1230  "bitstring_add_n_bits must be unsigned int 8/16/32");
1231  const int Tbits = sizeof(T) * 8;
1232  // val: | don't care | copy me |
1233  // <- n bits ->
1234  //
1235  // *out: | don't touch | fill in here |
1236  // <- filled -> <- (Tbits - filled) ->
1237  while (n > 0) {
1238  // Make sure val doesn't have any cruft in bits > n
1239  val &= ~(0xffffffff << n);
1240  // Initialize any new byte we're filling in
1241  if (filled == 0)
1242  *out = 0;
1243  // How many bits can we fill in `*out` without having to increment
1244  // to the next byte?
1245  int bits_left_in_out = Tbits - filled;
1246  int b = 0; // bit pattern to 'or' with *out
1247  int nb = 0; // number of bits to add
1248  if (n <= bits_left_in_out) { // can fit completely in this byte
1249  b = val << (bits_left_in_out - n);
1250  nb = n;
1251  } else { // n > bits_left_in_out, will spill to next byte
1252  b = val >> (n - bits_left_in_out);
1253  nb = bits_left_in_out;
1254  }
1255  *out |= b;
1256  filled += nb;
1257  OIIO_DASSERT (filled <= Tbits);
1258  n -= nb;
1259  if (filled == Tbits) {
1260  ++out;
1261  filled = 0;
1262  }
1263  }
1264 }
1265 
1266 
1267 
1268 /// Pack values from `T in[0..n-1]` (where `T` is expected to be a uint8,
1269 /// uint16, or uint32, into successive raw outbits-bit pieces of `out[]`,
1270 /// where outbits is expected to be less than the number of bits in a `T`.
1271 template<typename T>
1272 inline void
1273 bit_pack(cspan<T> data, void* out, int outbits)
1274 {
1275  static_assert(std::is_same<T,uint8_t>::value ||
1278  "bit_pack must be unsigned int 8/16/32");
1279  unsigned char* outbuffer = (unsigned char*)out;
1280  int filled = 0;
1281  for (size_t i = 0, e = data.size(); i < e; ++i)
1282  bitstring_add_n_bits (outbuffer, filled, data[i], outbits);
1283 }
1284 
1285 
1286 
1287 /// Decode n packed inbits-bits values from in[...] into normal uint8,
1288 /// uint16, or uint32 representation of `T out[0..n-1]`. In other words,
1289 /// each successive `inbits` of `in` (allowing spanning of byte boundaries)
1290 /// will be stored in a successive out[i].
1291 template<typename T>
1292 inline void
1293 bit_unpack(int n, const unsigned char* in, int inbits, T* out)
1294 {
1295  static_assert(std::is_same<T,uint8_t>::value ||
1298  "bit_unpack must be unsigned int 8/16/32");
1299  OIIO_DASSERT(inbits >= 1 && inbits < 32); // surely bugs if not
1300  // int highest = (1 << inbits) - 1;
1301  int B = 0, b = 0;
1302  // Invariant:
1303  // So far, we have used in[0..B-1] and the high b bits of in[B].
1304  for (int i = 0; i < n; ++i) {
1305  long long val = 0;
1306  int valbits = 0; // bits so far we've accumulated in val
1307  while (valbits < inbits) {
1308  // Invariant: we have already accumulated valbits of the next
1309  // needed value (of a total of inbits), living in the valbits
1310  // low bits of val.
1311  int out_left = inbits - valbits; // How much more we still need
1312  int in_left = 8 - b; // Bits still available in in[B].
1313  if (in_left <= out_left) {
1314  // Eat the rest of this byte:
1315  // |---------|--------|
1316  // b in_left
1317  val <<= in_left;
1318  val |= in[B] & ~(0xffffffff << in_left);
1319  ++B;
1320  b = 0;
1321  valbits += in_left;
1322  } else {
1323  // Eat just the bits we need:
1324  // |--|---------|-----|
1325  // b out_left extra
1326  val <<= out_left;
1327  int extra = 8 - b - out_left;
1328  val |= (in[B] >> extra) & ~(0xffffffff << out_left);
1329  b += out_left;
1330  valbits = inbits;
1331  }
1332  }
1333  out[i] = val; //T((val * 0xff) / highest);
1334  }
1335 }
1336 
1337 
1338 
1339 /// A DataProxy<I,E> looks like an (E &), but it really holds an (I &)
1340 /// and does conversions (via convert_type) as it reads in and out.
1341 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1342 template<typename I, typename E>
1343 struct DataProxy {
1344  DataProxy (I &data) : m_data(data) { }
1345  E operator= (E newval) { m_data = convert_type<E,I>(newval); return newval; }
1346  operator E () const { return convert_type<I,E>(m_data); }
1347 private:
1348  DataProxy& operator = (const DataProxy&); // Do not implement
1349  I &m_data;
1350 };
1351 
1352 
1353 
1354 /// A ConstDataProxy<I,E> looks like a (const E &), but it really holds
1355 /// a (const I &) and does conversions (via convert_type) as it reads.
1356 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1357 template<typename I, typename E>
1359  ConstDataProxy (const I &data) : m_data(data) { }
1360  operator E () const { return convert_type<E,I>(*m_data); }
1361 private:
1362  const I &m_data;
1363 };
1364 
1365 
1366 
1367 /// A DataArrayProxy<I,E> looks like an (E *), but it really holds an (I *)
1368 /// and does conversions (via convert_type) as it reads in and out.
1369 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1370 template<typename I, typename E>
1372  DataArrayProxy (I *data=NULL) : m_data(data) { }
1373  E operator* () const { return convert_type<I,E>(*m_data); }
1374  E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
1375  DataProxy<I,E> operator[] (int i) { return DataProxy<I,E> (m_data[i]); }
1376  void set (I *data) { m_data = data; }
1377  I * get () const { return m_data; }
1379  m_data += i; return *this;
1380  }
1381 private:
1382  I *m_data;
1383 };
1384 
1385 
1386 
1387 /// A ConstDataArrayProxy<I,E> looks like an (E *), but it really holds an
1388 /// (I *) and does conversions (via convert_type) as it reads in and out.
1389 /// (I and E are for INTERNAL and EXTERNAL data types, respectively).
1390 template<typename I, typename E>
1392  ConstDataArrayProxy (const I *data=NULL) : m_data(data) { }
1393  E operator* () const { return convert_type<I,E>(*m_data); }
1394  E operator[] (int i) const { return convert_type<I,E>(m_data[i]); }
1395  void set (const I *data) { m_data = data; }
1396  const I * get () const { return m_data; }
1398  m_data += i; return *this;
1399  }
1400 private:
1401  const I *m_data;
1402 };
1403 
1404 
1405 
1406 /// Fast table-based conversion of 8-bit to other types. Declare this
1407 /// as static to avoid the expensive ctr being called all the time.
template <class T=float>
class EightBitConverter {
public:
    EightBitConverter () noexcept { init(); }
    T operator() (unsigned char c) const noexcept { return val[c]; }
private:
    T val[256];   // lookup table: val[i] == convert_type-style result for i
    void init () noexcept {
        float scale = 1.0f / 255.0f;
        // For integer destinations, remap to D's full positive range.
        if (std::numeric_limits<T>::is_integer)
            scale *= (float)std::numeric_limits<T>::max();
        for (int i = 0; i < 256; ++i)
            val[i] = (T)(i * scale);
    }
};
1423 
1424 
1425 
1426 /// Simple conversion of a (presumably non-negative) float into a
1427 /// rational. This does not attempt to find the simplest fraction
1428 /// that approximates the float, for example 52.83 will simply
1429 /// return 5283/100. This does not attempt to gracefully handle
1430 /// floats that are out of range that could be easily int/int.
1431 inline OIIO_HOSTDEVICE void
1432 float_to_rational (float f, unsigned int &num, unsigned int &den)
1433 {
1434  if (f <= 0) { // Trivial case of zero, and handle all negative values
1435  num = 0;
1436  den = 1;
1437  } else if ((int)(1.0/f) == (1.0/f)) { // Exact results for perfect inverses
1438  num = 1;
1439  den = (int)f;
1440  } else {
1441  num = (int)f;
1442  den = 1;
1443  while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1444  den *= 10;
1445  f *= 10;
1446  num = (int)f;
1447  }
1448  }
1449 }
1450 
1451 
1452 
1453 /// Simple conversion of a float into a rational. This does not attempt
1454 /// to find the simplest fraction that approximates the float, for
1455 /// example 52.83 will simply return 5283/100. This does not attempt to
1456 /// gracefully handle floats that are out of range that could be easily
1457 /// int/int.
1458 inline OIIO_HOSTDEVICE void
1459 float_to_rational (float f, int &num, int &den)
1460 {
1461  unsigned int n, d;
1462  float_to_rational (fabsf(f), n, d);
1463  num = (f >= 0) ? (int)n : -(int)n;
1464  den = (int) d;
1465 }
1466 
1467 
1468 // (end of conversion helpers)
1469 ////////////////////////////////////////////////////////////////////////////
1470 
1471 
1472 
1473 
1474 ////////////////////////////////////////////////////////////////////////////
1475 ////////////////////////////////////////////////////////////////////////////
1476 //
1477 // SAFE MATH
1478 //
1479 // The functions named "safe_*" are versions with various internal clamps
1480 // or other deviations from IEEE standards with the specific intent of
1481 // never producing NaN or Inf values or throwing exceptions. But within the
1482 // valid range, they should be full precision and match IEEE standards.
1483 //
1484 
1485 
1486 /// Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
1487 template <typename T>
1489  return x >= T(0) ? std::sqrt(x) : T(0);
1490 }
1491 
1492 /// Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
1493 template <typename T>
1495  return x > T(0) ? T(1) / std::sqrt(x) : T(0);
1496 }
1497 
1498 
1499 /// Safe (clamping) arcsine: clamp to the valid domain.
1500 template <typename T>
1502  if (x <= T(-1)) return T(-M_PI_2);
1503  if (x >= T(+1)) return T(+M_PI_2);
1504  return std::asin(x);
1505 }
1506 
1507 /// Safe (clamping) arccosine: clamp to the valid domain.
1508 template <typename T>
1510  if (x <= T(-1)) return T(M_PI);
1511  if (x >= T(+1)) return T(0);
1512  return std::acos(x);
1513 }
1514 
1515 
1516 /// Safe log2: clamp to valid domain.
1517 template <typename T>
1519  // match clamping from fast version
1522  return std::log2(x);
1523 }
1524 
1525 /// Safe log: clamp to valid domain.
1526 template <typename T>
1528  // slightly different than fast version since clamping happens before scaling
1531  return std::log(x);
1532 }
1533 
1534 /// Safe log10: clamp to valid domain.
1535 template <typename T>
1537  // slightly different than fast version since clamping happens before scaling
1540  return log10f(x);
1541 }
1542 
1543 /// Safe logb: clamp to valid domain.
1544 template <typename T>
1546  return (x != T(0)) ? std::logb(x) : -std::numeric_limits<T>::max();
1547 }
1548 
1549 /// Safe pow: clamp the domain so it never returns Inf or NaN or has divide
1550 /// by zero error.
1551 template <typename T>
1553  if (y == T(0)) return T(1);
1554  if (x == T(0)) return T(0);
1555  // if x is negative, only deal with integer powers
1556  if ((x < T(0)) && (y != floor(y))) return T(0);
1557  // FIXME: this does not match "fast" variant because clamping limits are different
1558  T r = std::pow(x, y);
1559  // Clamp to avoid returning Inf.
1560  const T big = std::numeric_limits<T>::max();
1561  return clamp (r, -big, big);
1562 }
1563 
1564 
1565 /// Safe fmod: guard against b==0.0 (in which case, return 0.0). Also, it
1566 /// seems that this is much faster than std::fmod!
1568 {
1569  if (OIIO_LIKELY(b != 0.0f)) {
1570 #if 0
1571  return std::fmod (a,b);
1572  // std::fmod was getting called serially instead of vectorizing, so
1573  // we will just do the calculation ourselves.
1574 #else
1575  // This formulation is equivalent but much faster in our benchmarks,
1576  // also vectorizes better.
1577  // The floating-point remainder of the division operation
1578  // a/b is a - N*b, where N = a/b with its fractional part truncated.
1579  int N = static_cast<int>(a/b);
1580  return a - N*b;
1581 #endif
1582  }
1583  return 0.0f;
1584 }
1585 
1586 #define OIIO_FMATH_HAS_SAFE_FMOD 1
1587 
1588 // (end of safe_* functions)
1589 ////////////////////////////////////////////////////////////////////////////
1590 
1591 
1592 
1593 ////////////////////////////////////////////////////////////////////////////
1594 ////////////////////////////////////////////////////////////////////////////
1595 //
1596 // FAST & APPROXIMATE MATH
1597 //
1598 // The functions named "fast_*" provide a set of replacements to libm that
1599 // are much faster at the expense of some accuracy and robust handling of
1600 // extreme values. One design goal for these approximation was to avoid
1601 // branches as much as possible and operate on single precision values only
1602 // so that SIMD versions should be straightforward ports. We also try to
1603 // implement "safe" semantics (ie: clamp to valid range where possible)
1604 // natively since wrapping these inline calls in another layer would be
1605 // wasteful.
1606 //
1607 // The "fast_*" functions are not only possibly differing in value
1608 // (slightly) from the std versions of these functions, but also we do not
1609 // guarantee that the results of "fast" will exactly match from platform to
1610 // platform. This is because if a particular platform (HW, library, or
1611 // compiler) provides an intrinsic that is even faster than our usual "fast"
1612 // implementation, we may substitute it.
1613 //
1614 // Some functions are fast_safe_*, which is both a faster approximation as
1615 // well as clamped input domain to ensure no NaN, Inf, or divide by zero.
1616 //
1617 // NB: When compiling for CUDA devices, selection of the 'fast' intrinsics
1618 // is influenced by compiler options (-ffast-math for clang/gcc,
1619 // --use-fast-math for NVCC).
1620 //
1621 // TODO: Quantify the performance and accuracy of the CUDA Math functions
1622 // relative to the definitions in this file. It may be better in some
1623 // cases to use the approximate versions defined below.
1624 
1625 
1626 /// Round to nearest integer, returning as an int.
1627 /// Note that this differs from std::rint, which returns a float; it's more
1628 /// akin to std::lrint, which returns a long (int). Sorry for the naming
1629 /// confusion.
1631  // used by sin/cos/tan range reduction
1632 #if OIIO_SIMD_SSE >= 4
1633  // single roundps instruction on SSE4.1+ (for gcc/clang at least)
1634  return static_cast<int>(std::rint(x));
1635 #else
1636  // emulate rounding by adding/substracting 0.5
1637  return static_cast<int>(x + copysignf(0.5f, x));
1638 #endif
1639 }
1640 
1641 #ifndef __CUDA_ARCH__
1643  return simd::rint (x);
1644 }
1645 #endif
1646 
1647 
1649 #ifndef __CUDA_ARCH__
1650  // very accurate argument reduction from SLEEF
1651  // starts failing around x=262000
1652  // Results on: [-2pi,2pi]
1653  // Examined 2173837240 values of sin: 0.00662760244 avg ulp diff, 2 max ulp, 1.19209e-07 max error
1654  int q = fast_rint (x * float(M_1_PI));
1655  float qf = float(q);
1656  x = madd(qf, -0.78515625f*4, x);
1657  x = madd(qf, -0.00024187564849853515625f*4, x);
1658  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1659  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1660  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1661  float s = x * x;
1662  if ((q & 1) != 0) x = -x;
1663  // this polynomial approximation has very low error on [-pi/2,+pi/2]
1664  // 1.19209e-07 max error in total over [-2pi,+2pi]
1665  float u = 2.6083159809786593541503e-06f;
1666  u = madd(u, s, -0.0001981069071916863322258f);
1667  u = madd(u, s, +0.00833307858556509017944336f);
1668  u = madd(u, s, -0.166666597127914428710938f);
1669  u = madd(s, u * x, x);
1670  // For large x, the argument reduction can fail and the polynomial can
1671  // be evaluated with arguments outside the valid internal. Just clamp
1672  // the bad values away.
1673  return clamp(u, -1.0f, 1.0f);
1674 #else
1675  return __sinf(x);
1676 #endif
1677 }
1678 
1679 
1681 #ifndef __CUDA_ARCH__
1682  // same argument reduction as fast_sin
1683  int q = fast_rint (x * float(M_1_PI));
1684  float qf = float(q);
1685  x = madd(qf, -0.78515625f*4, x);
1686  x = madd(qf, -0.00024187564849853515625f*4, x);
1687  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1688  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1689  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1690  float s = x * x;
1691  // polynomial from SLEEF's sincosf, max error is
1692  // 4.33127e-07 over [-2pi,2pi] (98% of values are "exact")
1693  float u = -2.71811842367242206819355e-07f;
1694  u = madd(u, s, +2.47990446951007470488548e-05f);
1695  u = madd(u, s, -0.00138888787478208541870117f);
1696  u = madd(u, s, +0.0416666641831398010253906f);
1697  u = madd(u, s, -0.5f);
1698  u = madd(u, s, +1.0f);
1699  if ((q & 1) != 0) u = -u;
1700  return clamp(u, -1.0f, 1.0f);
1701 #else
1702  return __cosf(x);
1703 #endif
1704 }
1705 
1706 
1707 OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos (float x, float* sine, float* cosine) {
1708 #ifndef __CUDA_ARCH__
1709  // same argument reduction as fast_sin
1710  int q = fast_rint (x * float(M_1_PI));
1711  float qf = float(q);
1712  x = madd(qf, -0.78515625f*4, x);
1713  x = madd(qf, -0.00024187564849853515625f*4, x);
1714  x = madd(qf, -3.7747668102383613586e-08f*4, x);
1715  x = madd(qf, -1.2816720341285448015e-12f*4, x);
1716  x = float(M_PI_2) - (float(M_PI_2) - x); // crush denormals
1717  float s = x * x;
1718  // NOTE: same exact polynomials as fast_sin and fast_cos above
1719  if ((q & 1) != 0) x = -x;
1720  float su = 2.6083159809786593541503e-06f;
1721  su = madd(su, s, -0.0001981069071916863322258f);
1722  su = madd(su, s, +0.00833307858556509017944336f);
1723  su = madd(su, s, -0.166666597127914428710938f);
1724  su = madd(s, su * x, x);
1725  float cu = -2.71811842367242206819355e-07f;
1726  cu = madd(cu, s, +2.47990446951007470488548e-05f);
1727  cu = madd(cu, s, -0.00138888787478208541870117f);
1728  cu = madd(cu, s, +0.0416666641831398010253906f);
1729  cu = madd(cu, s, -0.5f);
1730  cu = madd(cu, s, +1.0f);
1731  if ((q & 1) != 0) cu = -cu;
1732  *sine = clamp(su, -1.0f, 1.0f);;
1733  *cosine = clamp(cu, -1.0f, 1.0f);;
1734 #else
1735  __sincosf(x, sine, cosine);
1736 #endif
1737 }
1738 
1739 // NOTE: this approximation is only valid on [-8192.0,+8192.0], it starts becoming
1740 // really poor outside of this range because the reciprocal amplifies errors
1742 #ifndef __CUDA_ARCH__
1743  // derived from SLEEF implementation
1744  // note that we cannot apply the "denormal crush" trick everywhere because
1745  // we sometimes need to take the reciprocal of the polynomial
1746  int q = fast_rint (x * float(2 * M_1_PI));
1747  float qf = float(q);
1748  x = madd(qf, -0.78515625f*2, x);
1749  x = madd(qf, -0.00024187564849853515625f*2, x);
1750  x = madd(qf, -3.7747668102383613586e-08f*2, x);
1751  x = madd(qf, -1.2816720341285448015e-12f*2, x);
1752  if ((q & 1) == 0)
1753  x = float(M_PI_4) - (float(M_PI_4) - x); // crush denormals (only if we aren't inverting the result later)
1754  float s = x * x;
1755  float u = 0.00927245803177356719970703f;
1756  u = madd(u, s, 0.00331984995864331722259521f);
1757  u = madd(u, s, 0.0242998078465461730957031f);
1758  u = madd(u, s, 0.0534495301544666290283203f);
1759  u = madd(u, s, 0.133383005857467651367188f);
1760  u = madd(u, s, 0.333331853151321411132812f);
1761  u = madd(s, u * x, x);
1762  if ((q & 1) != 0)
1763  u = -1.0f / u;
1764  return u;
1765 #else
1766  return __tanf(x);
1767 #endif
1768 }
1769 
1770 /// Fast, approximate sin(x*M_PI) with maximum absolute error of 0.000918954611.
1771 /// Adapted from http://devmaster.net/posts/9648/fast-and-accurate-sine-cosine#comment-76773
1772 /// Note that this is MUCH faster, but much less accurate than fast_sin.
1774 {
1775 #ifndef __CUDA_ARCH__
1776  // Fast trick to strip the integral part off, so our domain is [-1,1]
1777  const float z = x - ((x + 25165824.0f) - 25165824.0f);
1778  const float y = z - z * fabsf(z);
1779  const float Q = 3.10396624f;
1780  const float P = 3.584135056f; // P = 16-4*Q
1781  return y * (Q + P * fabsf(y));
1782  /* The original article used used inferior constants for Q and P and
1783  * so had max error 1.091e-3.
1784  *
1785  * The optimal value for Q was determined by exhaustive search, minimizing
1786  * the absolute numerical error relative to float(std::sin(double(phi*M_PI)))
1787  * over the interval [0,2] (which is where most of the invocations happen).
1788  *
1789  * The basic idea of this approximation starts with the coarse approximation:
1790  * sin(pi*x) ~= f(x) = 4 * (x - x * abs(x))
1791  *
1792  * This approximation always _over_ estimates the target. On the otherhand, the
1793  * curve:
1794  * sin(pi*x) ~= f(x) * abs(f(x)) / 4
1795  *
1796  * always lies _under_ the target. Thus we can simply numerically search for the
1797  * optimal constant to LERP these curves into a more precise approximation.
1798  * After folding the constants together and simplifying the resulting math, we
1799  * end up with the compact implementation below.
1800  *
1801  * NOTE: this function actually computes sin(x * pi) which avoids one or two
1802  * mults in many cases and guarantees exact values at integer periods.
1803  */
1804 #else
1805  return sinpif(x);
1806 #endif
1807 }
1808 
1809 /// Fast approximate cos(x*M_PI) with ~0.1% absolute error.
1810 /// Note that this is MUCH faster, but much less accurate than fast_cos.
1812 {
1813 #ifndef __CUDA_ARCH__
1814  return fast_sinpi (x+0.5f);
1815 #else
1816  return cospif(x);
1817 #endif
1818 }
1819 
1821 #ifndef __CUDA_ARCH__
1822  const float f = fabsf(x);
1823  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1824  // based on http://www.pouet.net/topic.php?which=9132&page=2
1825  // 85% accurate (ulp 0)
1826  // Examined 2130706434 values of acos: 15.2000597 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // without "denormal crush"
1827  // Examined 2130706434 values of acos: 15.2007108 avg ulp diff, 4492 max ulp, 4.51803e-05 max error // with "denormal crush"
1828  const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1829  return x < 0 ? float(M_PI) - a : a;
1830 #else
1831  return acosf(x);
1832 #endif
1833 }
1834 
1836 #ifndef __CUDA_ARCH__
1837  // based on acosf approximation above
1838  // max error is 4.51133e-05 (ulps are higher because we are consistently off by a little amount)
1839  const float f = fabsf(x);
1840  const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f; // clamp and crush denormals
1841  const float a = float(M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1842  return copysignf(a, x);
1843 #else
1844  return asinf(x);
1845 #endif
1846 }
1847 
1849 #ifndef __CUDA_ARCH__
1850  const float a = fabsf(x);
1851  const float k = a > 1.0f ? 1 / a : a;
1852  const float s = 1.0f - (1.0f - k); // crush denormals
1853  const float t = s * s;
1854  // http://mathforum.org/library/drmath/view/62672.html
1855  // the coefficients were tuned in mathematica with the assumption that we want atan(1)=pi/4
1856  // (slightly higher error but no discontinuities)
1857  // Examined 4278190080 values of atan: 2.53989068 avg ulp diff, 315 max ulp, 9.17912e-06 max error // (with denormals)
1858  // Examined 4278190080 values of atan: 171160502 avg ulp diff, 855638016 max ulp, 9.17912e-06 max error // (crush denormals)
1859  float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1860  if (a > 1.0f) r = 1.570796326794896557998982f - r;
1861  return copysignf(r, x);
1862 #else
1863  return atanf(x);
1864 #endif
1865 }
1866 
1868 #ifndef __CUDA_ARCH__
1869  // based on atan approximation above
1870  // the special cases around 0 and infinity were tested explicitly
1871  // the only case not handled correctly is x=NaN,y=0 which returns 0 instead of nan
1872  const float a = fabsf(x);
1873  const float b = fabsf(y);
1874  bool b_is_greater_than_a = b > a;
1875 
1876 #if OIIO_FMATH_SIMD_FRIENDLY
1877  // When applying to all lanes in SIMD, we end up doing extra masking and
1878  // 2 divides. So lets just do 1 divide and swap the parameters instead.
1879  // And if we are going to do a doing a divide anyway, when a == b it
1880  // should be 1.0f anyway so lets not bother special casing it.
1881  float sa = b_is_greater_than_a ? b : a;
1882  float sb = b_is_greater_than_a ? a : b;
1883  const float k = (b == 0) ? 0.0f : sb/sa;
1884 #else
1885  const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b / a));
1886 #endif
1887 
1888  const float s = 1.0f - (1.0f - k); // crush denormals
1889  const float t = s * s;
1890 
1891  float r = s * madd(0.430165678f, t, 1.0f) / madd(madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1892 
1893  if (b_is_greater_than_a)
1894  r = 1.570796326794896557998982f - r; // account for arg reduction
1895  // TODO: investigate if testing x < 0.0f is more efficient
1896  if (bit_cast<float, unsigned>(x) & 0x80000000u) // test sign bit of x
1897  r = float(M_PI) - r;
1898  return copysignf(r, y);
1899 #else
1900  return atan2f(y, x);
1901 #endif
1902 }
1903 
1904 
1905 template<typename T>
1907  using namespace simd;
1908  typedef typename T::vint_t intN;
1909  // See float fast_log2 for explanations
1911  intN bits = bitcast_to_int(x);
1912  intN exponent = srl (bits, 23) - intN(127);
1913  T f = bitcast_to_float ((bits & intN(0x007FFFFF)) | intN(0x3f800000)) - T(1.0f);
1914  T f2 = f * f;
1915  T f4 = f2 * f2;
1916  T hi = madd(f, T(-0.00931049621349f), T( 0.05206469089414f));
1917  T lo = madd(f, T( 0.47868480909345f), T(-0.72116591947498f));
1918  hi = madd(f, hi, T(-0.13753123777116f));
1919  hi = madd(f, hi, T( 0.24187369696082f));
1920  hi = madd(f, hi, T(-0.34730547155299f));
1921  lo = madd(f, lo, T( 1.442689881667200f));
1922  return ((f4 * hi) + (f * lo)) + T(exponent);
1923 }
1924 
1925 
1926 template<>
1927 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log2 (const float& xval) {
1928 #ifndef __CUDA_ARCH__
1929  // NOTE: clamp to avoid special cases and make result "safe" from large negative values/nans
1931  // based on https://github.com/LiraNuna/glsl-sse2/blob/master/source/vec4.h
1932  unsigned bits = bit_cast<float, unsigned>(x);
1933  int exponent = int(bits >> 23) - 127;
1934  float f = bit_cast<unsigned, float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1935  // Examined 2130706432 values of log2 on [1.17549435e-38,3.40282347e+38]: 0.0797524457 avg ulp diff, 3713596 max ulp, 7.62939e-06 max error
1936  // ulp histogram:
1937  // 0 = 97.46%
1938  // 1 = 2.29%
1939  // 2 = 0.11%
1940  float f2 = f * f;
1941  float f4 = f2 * f2;
1942  float hi = madd(f, -0.00931049621349f, 0.05206469089414f);
1943  float lo = madd(f, 0.47868480909345f, -0.72116591947498f);
1944  hi = madd(f, hi, -0.13753123777116f);
1945  hi = madd(f, hi, 0.24187369696082f);
1946  hi = madd(f, hi, -0.34730547155299f);
1947  lo = madd(f, lo, 1.442689881667200f);
1948  return ((f4 * hi) + (f * lo)) + exponent;
1949 #else
1950  return __log2f(xval);
1951 #endif
1952 }
1953 
1954 
1955 
1956 template<typename T>
1958  // Examined 2130706432 values of logf on [1.17549435e-38,3.40282347e+38]: 0.313865375 avg ulp diff, 5148137 max ulp, 7.62939e-06 max error
1959  return fast_log2(x) * T(M_LN2);
1960 }
1961 
#ifdef __CUDA_ARCH__
/// CUDA specialization: defer to the fast hardware log intrinsic.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log(const float& x)
{
    return __logf(x);
}
#endif
1969 
1970 
1971 template<typename T>
1973  // Examined 2130706432 values of log10f on [1.17549435e-38,3.40282347e+38]: 0.631237033 avg ulp diff, 4471615 max ulp, 3.8147e-06 max error
1974  return fast_log2(x) * T(M_LN2 / M_LN10);
1975 }
1976 
#ifdef __CUDA_ARCH__
/// CUDA specialization: defer to the fast hardware log10 intrinsic.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log10(const float& x)
{
    return __log10f(x);
}
#endif
1984 
1986 #ifndef __CUDA_ARCH__
1987  // don't bother with denormals
1988  x = fabsf(x);
1991  unsigned bits = bit_cast<float, unsigned>(x);
1992  return float (int(bits >> 23) - 127);
1993 #else
1994  return logbf(x);
1995 #endif
1996 }
1997 
1999 #ifndef __CUDA_ARCH__
2000  if (fabsf(x) < 0.01f) {
2001  float y = 1.0f - (1.0f - x); // crush denormals
2002  return copysignf(madd(-0.5f, y * y, y), x);
2003  } else {
2004  return fast_log(x + 1);
2005  }
2006 #else
2007  return log1pf(x);
2008 #endif
2009 }
2010 
2011 
2012 
2013 template<typename T>
2015  using namespace simd;
2016  typedef typename T::vint_t intN;
2017 #if OIIO_SIMD_SSE
2018  // See float specialization for explanations
2019  T x = clamp (xval, T(-126.0f), T(126.0f));
2020  intN m (x); x -= T(m);
2021  T one (1.0f);
2022  x = one - (one - x); // crush denormals (does not affect max ulps!)
2023  const T kA (1.33336498402e-3f);
2024  const T kB (9.810352697968e-3f);
2025  const T kC (5.551834031939e-2f);
2026  const T kD (0.2401793301105f);
2027  const T kE (0.693144857883f);
2028  T r (kA);
2029  r = madd(x, r, kB);
2030  r = madd(x, r, kC);
2031  r = madd(x, r, kD);
2032  r = madd(x, r, kE);
2033  r = madd(x, r, one);
2034  return bitcast_to_float (bitcast_to_int(r) + (m << 23));
2035 #else
2036  T r;
2037  for (int i = 0; i < r.elements; ++i)
2038  r[i] = fast_exp2(xval[i]);
2039  for (int i = r.elements; i < r.paddedelements; ++i)
2040  r[i] = 0.0f;
2041  return r;
2042 #endif
2043 }
2044 
2045 
2046 template<>
2047 OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp2 (const float& xval) {
2048 #if OIIO_ANY_CLANG && !OIIO_INTEL_LLVM_COMPILER && OIIO_FMATH_SIMD_FRIENDLY
2049  // Clang was unhappy using the bitcast/memcpy/reinter_cast/union inside
2050  // an explicit SIMD loop, so revert to calling the standard version.
2051  return std::exp2(xval);
2052 #elif !defined(__CUDA_ARCH__)
2053  // clamp to safe range for final addition
2054  float x = clamp (xval, -126.0f, 126.0f);
2055  // range reduction
2056  int m = int(x); x -= m;
2057  x = 1.0f - (1.0f - x); // crush denormals (does not affect max ulps!)
2058  // 5th degree polynomial generated with sollya
2059  // Examined 2247622658 values of exp2 on [-126,126]: 2.75764912 avg ulp diff, 232 max ulp
2060  // ulp histogram:
2061  // 0 = 87.81%
2062  // 1 = 4.18%
2063  float r = 1.33336498402e-3f;
2064  r = madd(x, r, 9.810352697968e-3f);
2065  r = madd(x, r, 5.551834031939e-2f);
2066  r = madd(x, r, 0.2401793301105f);
2067  r = madd(x, r, 0.693144857883f);
2068  r = madd(x, r, 1.0f);
2069  // multiply by 2 ^ m by adding in the exponent
2070  // NOTE: left-shift of negative number is undefined behavior
2071  return bit_cast<unsigned, float>(bit_cast<float, unsigned>(r) + (unsigned(m) << 23));
2072  // Clang: loop not vectorized: unsafe dependent memory operations in loop.
2073  // This is why we special case the OIIO_FMATH_SIMD_FRIENDLY above.
2074  // FIXME: as clang releases continue to improve, periodically check if
2075  // this is still the case.
2076 #else
2077  return exp2f(xval);
2078 #endif
2079 }
2080 
2081 
2082 
2083 
2084 template <typename T>
2086  // Examined 2237485550 values of exp on [-87.3300018,87.3300018]: 2.6666452 avg ulp diff, 230 max ulp
2087  return fast_exp2(x * T(1 / M_LN2));
2088 }
2089 
#ifdef __CUDA_ARCH__
/// CUDA specialization: defer to the fast hardware exp intrinsic.
template<>
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp (const float& x) {
    return __expf(x);
}
#endif
2096 
2097 
2098 
2099 /// Faster float exp than is in libm, but still 100% accurate
2101 {
2102 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2103  // On x86_64, versions of glibc < 2.16 have an issue where expf is
2104  // much slower than the double version. This was fixed in glibc 2.16.
2105  return static_cast<float>(std::exp(static_cast<double>(x)));
2106 #else
2107  return std::exp(x);
2108 #endif
2109 }
2110 
2111 
2113 #ifndef __CUDA_ARCH__
2114  // Examined 2217701018 values of exp10 on [-37.9290009,37.9290009]: 2.71732409 avg ulp diff, 232 max ulp
2115  return fast_exp2(x * float(M_LN10 / M_LN2));
2116 #else
2117  return __exp10f(x);
2118 #endif
2119 }
2120 
2122 #ifndef __CUDA_ARCH__
2123  if (fabsf(x) < 0.03f) {
2124  float y = 1.0f - (1.0f - x); // crush denormals
2125  return copysignf(madd(0.5f, y * y, y), x);
2126  } else
2127  return fast_exp(x) - 1.0f;
2128 #else
2129  return expm1f(x);
2130 #endif
2131 }
2132 
2134 #ifndef __CUDA_ARCH__
2135  float a = fabsf(x);
2136  if (a > 1.0f) {
2137  // Examined 53389559 values of sinh on [1,87.3300018]: 33.6886442 avg ulp diff, 178 max ulp
2138  float e = fast_exp(a);
2139  return copysignf(0.5f * e - 0.5f / e, x);
2140  } else {
2141  a = 1.0f - (1.0f - a); // crush denorms
2142  float a2 = a * a;
2143  // degree 7 polynomial generated with sollya
2144  // Examined 2130706434 values of sinh on [-1,1]: 1.19209e-07 max error
2145  float r = 2.03945513931e-4f;
2146  r = madd(r, a2, 8.32990277558e-3f);
2147  r = madd(r, a2, 0.1666673421859f);
2148  r = madd(r * a, a2, a);
2149  return copysignf(r, x);
2150  }
2151 #else
2152  return sinhf(x);
2153 #endif
2154 }
2155 
2157 #ifndef __CUDA_ARCH__
2158  // Examined 2237485550 values of cosh on [-87.3300018,87.3300018]: 1.78256726 avg ulp diff, 178 max ulp
2159  float e = fast_exp(fabsf(x));
2160  return 0.5f * e + 0.5f / e;
2161 #else
2162  return coshf(x);
2163 #endif
2164 }
2165 
2167 #ifndef __CUDA_ARCH__
2168  // Examined 4278190080 values of tanh on [-3.40282347e+38,3.40282347e+38]: 3.12924e-06 max error
2169  // NOTE: ulp error is high because of sub-optimal handling around the origin
2170  float e = fast_exp(2.0f * fabsf(x));
2171  return copysignf(1 - 2 / (1 + e), x);
2172 #else
2173  return tanhf(x);
2174 #endif
2175 }
2176 
2178  if (y == 0) return 1.0f; // x^0=1
2179  if (x == 0) return 0.0f; // 0^y=0
2180  // be cheap & exact for special case of squaring and identity
2181  if (y == 1.0f)
2182  return x;
2183  if (y == 2.0f) {
2184 #ifndef __CUDA_ARCH__
2185  return std::min (x*x, std::numeric_limits<float>::max());
2186 #else
2187  return fminf (x*x, std::numeric_limits<float>::max());
2188 #endif
2189  }
2190  float sign = 1.0f;
2191  if (OIIO_UNLIKELY(x < 0.0f)) {
2192  // if x is negative, only deal with integer powers
2193  // powf returns NaN for non-integers, we will return 0 instead
2194  int ybits = bit_cast<float, int>(y) & 0x7fffffff;
2195  if (ybits >= 0x4b800000) {
2196  // always even int, keep positive
2197  } else if (ybits >= 0x3f800000) {
2198  // bigger than 1, check
2199  int k = (ybits >> 23) - 127; // get exponent
2200  int j = ybits >> (23 - k); // shift out possible fractional bits
2201  if ((j << (23 - k)) == ybits) // rebuild number and check for a match
2202  sign = bit_cast<int, float>(0x3f800000 | (j << 31)); // +1 for even, -1 for odd
2203  else
2204  return 0.0f; // not integer
2205  } else {
2206  return 0.0f; // not integer
2207  }
2208  }
2209  return sign * fast_exp2(y * fast_log2(fabsf(x)));
2210 }
2211 
2212 
2213 // Fast simd pow that only needs to work for positive x
2214 template<typename T, typename U>
2216  return fast_exp2(y * fast_log2(x));
2217 }
2218 
2219 
2220 // Fast cube root (performs better that using fast_pow's above with y=1/3)
2222 #ifndef __CUDA_ARCH__
2223  float x0 = fabsf(x);
2224  // from hacker's delight
2225  float a = bit_cast<int, float>(0x2a5137a0 + bit_cast<float, int>(x0) / 3); // Initial guess.
2226  // Examined 14272478 values of cbrt on [-9.99999935e-39,9.99999935e-39]: 8.14687e-14 max error
2227  // Examined 2131958802 values of cbrt on [9.99999935e-39,3.40282347e+38]: 2.46930719 avg ulp diff, 12 max ulp
2228  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step.
2229  a = 0.333333333f * (2.0f * a + x0 / (a * a)); // Newton step again.
2230  a = (x0 == 0) ? 0 : a; // Fix 0 case
2231  return copysignf(a, x);
2232 #else
2233  return cbrtf(x);
2234 #endif
2235 }
2236 
2237 
2239 {
2240 #ifndef __CUDA_ARCH__
2241  // Examined 1082130433 values of erff on [0,4]: 1.93715e-06 max error
2242  // Abramowitz and Stegun, 7.1.28
2243  const float a1 = 0.0705230784f;
2244  const float a2 = 0.0422820123f;
2245  const float a3 = 0.0092705272f;
2246  const float a4 = 0.0001520143f;
2247  const float a5 = 0.0002765672f;
2248  const float a6 = 0.0000430638f;
2249  const float a = fabsf(x);
2250  const float b = 1.0f - (1.0f - a); // crush denormals
2251  const float r = madd(madd(madd(madd(madd(madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
2252  const float s = r * r; // ^2
2253  const float t = s * s; // ^4
2254  const float u = t * t; // ^8
2255  const float v = u * u; // ^16
2256  return copysignf(1.0f - 1.0f / v, x);
2257 #else
2258  return erff(x);
2259 #endif
2260 }
2261 
2263 {
2264 #ifndef __CUDA_ARCH__
2265  // Examined 2164260866 values of erfcf on [-4,4]: 1.90735e-06 max error
2266  // ulp histogram:
2267  // 0 = 80.30%
2268  return 1.0f - fast_erf(x);
2269 #else
2270  return erfcf(x);
2271 #endif
2272 }
2273 
2275 {
2276  // from: Approximating the erfinv function by Mike Giles
2277  // to avoid trouble at the limit, clamp input to 1-eps
2278  float a = fabsf(x); if (a > 0.99999994f) a = 0.99999994f;
2279  float w = -fast_log((1.0f - a) * (1.0f + a)), p;
2280  if (w < 5.0f) {
2281  w = w - 2.5f;
2282  p = 2.81022636e-08f;
2283  p = madd(p, w, 3.43273939e-07f);
2284  p = madd(p, w, -3.5233877e-06f );
2285  p = madd(p, w, -4.39150654e-06f);
2286  p = madd(p, w, 0.00021858087f );
2287  p = madd(p, w, -0.00125372503f );
2288  p = madd(p, w, -0.00417768164f );
2289  p = madd(p, w, 0.246640727f );
2290  p = madd(p, w, 1.50140941f );
2291  } else {
2292  w = sqrtf(w) - 3.0f;
2293  p = -0.000200214257f;
2294  p = madd(p, w, 0.000100950558f);
2295  p = madd(p, w, 0.00134934322f );
2296  p = madd(p, w, -0.00367342844f );
2297  p = madd(p, w, 0.00573950773f );
2298  p = madd(p, w, -0.0076224613f );
2299  p = madd(p, w, 0.00943887047f );
2300  p = madd(p, w, 1.00167406f );
2301  p = madd(p, w, 2.83297682f );
2302  }
2303  return p * x;
2304 }
2305 
2306 // (end of fast* functions)
2307 ////////////////////////////////////////////////////////////////////////////
2308 
2309 
2310 
2311 
2312 ////////////////////////////////////////////////////////////////////////////
2313 ////////////////////////////////////////////////////////////////////////////
2314 //
2315 // MISCELLANEOUS NUMERICAL METHODS
2316 //
2317 
2318 
2319 /// Solve for the x for which func(x) == y on the interval [xmin,xmax].
2320 /// Use a maximum of maxiter iterations, and stop any time the remaining
2321 /// search interval or the function evaluations <= eps. If brack is
2322 /// non-NULL, set it to true if y is in [f(xmin), f(xmax)], otherwise
2323 /// false (in which case the caller should know that the results may be
2324 /// unreliable. Results are undefined if the function is not monotonic
2325 /// on that interval or if there are multiple roots in the interval (it
2326 /// may not converge, or may converge to any of the roots without
2327 /// telling you that there are more than one).
2328 template<class T, class Func> OIIO_HOSTDEVICE
2329 T invert (Func &func, T y, T xmin=0.0, T xmax=1.0,
2330  int maxiters=32, T eps=1.0e-6, bool *brack=0)
2331 {
2332  // Use the Regula Falsi method, falling back to bisection if it
2333  // hasn't converged after 3/4 of the maximum number of iterations.
2334  // See, e.g., Numerical Recipes for the basic ideas behind both
2335  // methods.
2336  T v0 = func(xmin), v1 = func(xmax);
2337  T x = xmin, v = v0;
2338  bool increasing = (v0 < v1);
2339  T vmin = increasing ? v0 : v1;
2340  T vmax = increasing ? v1 : v0;
2341  bool bracketed = (y >= vmin && y <= vmax);
2342  if (brack)
2343  *brack = bracketed;
2344  if (! bracketed) {
2345  // If our bounds don't bracket the zero, just give up, and
2346  // return the appropriate "edge" of the interval
2347  return ((y < vmin) == increasing) ? xmin : xmax;
2348  }
2349  if (fabs(v0-v1) < eps) // already close enough
2350  return x;
2351  int rfiters = (3*maxiters)/4; // how many times to try regula falsi
2352  for (int iters = 0; iters < maxiters; ++iters) {
2353  T t; // interpolation factor
2354  if (iters < rfiters) {
2355  // Regula falsi
2356  t = (y-v0)/(v1-v0);
2357  if (t <= T(0) || t >= T(1))
2358  t = T(0.5); // RF convergence failure -- bisect instead
2359  } else {
2360  t = T(0.5); // bisection
2361  }
2362  x = lerp (xmin, xmax, t);
2363  v = func(x);
2364  if ((v < y) == increasing) {
2365  xmin = x; v0 = v;
2366  } else {
2367  xmax = x; v1 = v;
2368  }
2369  if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2370  return x; // converged
2371  }
2372  return x;
2373 }
2374 
2375 
2376 
2377 /// Linearly interpolate a list of evenly-spaced knots y[0..len-1] with
2378 /// y[0] corresponding to the value at x==0.0 and y[len-1] corresponding to
2379 /// x==1.0.
2380 inline OIIO_HOSTDEVICE float
2382 {
2383 #ifndef __CUDA_ARCH__
2384  DASSERT_MSG (y.size() >= 2, "interpolate_linear needs at least 2 knot values (%zd)", y.size());
2385 #endif
2386  x = clamp (x, float(0.0), float(1.0));
2387  int nsegs = int(y.size()) - 1;
2388  int segnum;
2389  x = floorfrac (x*nsegs, &segnum);
2390 #ifndef __CUDA_ARCH__
2391  int nextseg = std::min (segnum+1, nsegs);
2392 #else
2393  int nextseg = min (segnum+1, nsegs);
2394 #endif
2395  return lerp (y[segnum], y[nextseg], x);
2396 }
2397 
2398 // (end miscellaneous numerical methods)
2399 ////////////////////////////////////////////////////////////////////////////
2400 
2401 
2402 
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow(float x, float y)
Definition: fmath.h:2177
SYS_API float acosf(float x)
vint4 max(const vint4 &a, const vint4 &b)
Definition: simd.h:4845
#define OIIO_CONSTEXPR14
Definition: platform.h:101
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp(float x)
Faster float exp than is in libm, but still 100% accurate.
Definition: fmath.h:2100
#define OIIO_LIKELY(x)
Definition: platform.h:379
SYS_API double cos(double x)
Definition: SYS_FPUMath.h:69
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos(const T &x, const U &y)
Definition: fmath.h:2215
SYS_API float coshf(float x)
SYS_API double fmod(double x, double y)
Definition: SYS_FPUMath.h:85
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log(const T &x)
Definition: fmath.h:1957
EightBitConverter() noexcept
Definition: fmath.h:1411
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10(float x)
Definition: fmath.h:2112
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2(float y, float x)
Definition: fmath.h:1867
#define OIIO_FORCEINLINE
Definition: platform.h:395
SYS_API float log1pf(float x)
vfloat4 bitcast_to_float(const vint4 &x)
Definition: simd.h:7233
#define M_LN2
Definition: fmath.h:114
OIIO_HOSTDEVICE void evalBSplineWeights(T w[4], T fraction)
Definition: fmath.h:567
OIIO_HOSTDEVICE V round_down_to_multiple(V value, M multiple)
Definition: fmath.h:228
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
Definition: half.h:266
vint4 srl(const vint4 &val, const unsigned int bits)
Definition: simd.h:4537
void bit_unpack(int n, const unsigned char *in, int inbits, T *out)
Definition: fmath.h:1293
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan(float x)
Definition: fmath.h:1848
IMATH_HOSTDEVICE constexpr int floor(T x) IMATH_NOEXCEPT
Definition: ImathFun.h:112
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin(T x)
Safe (clamping) arcsine: clamp to the valid domain.
Definition: fmath.h:1501
GLboolean * data
Definition: glcorearb.h:131
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
Definition: UT_ArraySet.h:1639
const GLdouble * v
Definition: glcorearb.h:837
#define M_PI
Definition: fmath.h:90
const DataArrayProxy< I, E > & operator+=(int i)
Definition: fmath.h:1378
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub(float a, float b, float c)
Negative fused multiply and subtract: -(a*b) - c.
Definition: fmath.h:438
GLsizei const GLfloat * value
Definition: glcorearb.h:824
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin(float x)
Definition: fmath.h:1648
#define M_PI_2
Definition: fmath.h:93
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd(float a, float b, float c)
Fused negative multiply and add: -(a*b) + c.
Definition: fmath.h:430
bool_constant< is_integral< T >::value &&!std::is_same< T, bool >::value &&!std::is_same< T, char >::value &&!std::is_same< T, wchar_t >::value > is_integer
Definition: format.h:2010
vfloat4 sqrt(const vfloat4 &a)
Definition: simd.h:7481
GLdouble GLdouble GLdouble z
Definition: glcorearb.h:848
Definition: span.h:73
OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod(float a, float b)
Definition: fmath.h:1567
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
Definition: fmath.h:703
GLboolean GLboolean GLboolean GLboolean a
Definition: glcorearb.h:1222
GLdouble s
Definition: glad.h:3009
SYS_API float atan2f(float y, float x)
ImageBuf OIIO_API min(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
GLint y
Definition: glcorearb.h:103
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: (a*b - c)
Definition: fmath.h:422
OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in)
Definition: fmath.h:1194
OIIO_HOSTDEVICE V round_to_multiple(V value, M multiple)
Definition: fmath.h:200
**But if you need a result
Definition: thread.h:613
GLfloat GLfloat GLfloat v2
Definition: glcorearb.h:818
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int floor2(int x) noexcept
Definition: fmath.h:176
GLdouble GLdouble GLdouble q
Definition: glad.h:2445
Integer 8-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1180
OIIO_FORCEINLINE OIIO_HOSTDEVICE int ifloor(float x)
Return floor(x) cast to an int.
Definition: fmath.h:618
std::integral_constant< bool, std::numeric_limits< T >::is_signed||std::is_same< T, int128_t >::value > is_signed
Definition: format.h:821
GLfloat GLfloat GLfloat GLfloat v3
Definition: glcorearb.h:819
OIIO_HOSTDEVICE int pow2roundup(int x)
Definition: fmath.h:191
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf(float x)
Definition: fmath.h:2238
void convert_type< float, uint8_t >(const float *src, uint8_t *dst, size_t n, uint8_t, uint8_t)
Definition: fmath.h:1094
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos(float x)
Definition: fmath.h:1680
constexpr size_type size() const noexcept
Definition: span.h:321
OIIO_HOSTDEVICE float floorfrac(float x, int *xint)
Definition: fmath.h:630
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh(float x)
Definition: fmath.h:2156
void set(I *data)
Definition: fmath.h:1376
SYS_API float log10f(float x)
void set(const I *data)
Definition: fmath.h:1395
ImageBuf OIIO_API pow(const ImageBuf &A, cspan< float > B, ROI roi={}, int nthreads=0)
vfloat4 floor(const vfloat4 &a)
Definition: simd.h:7427
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2(T x)
Safe log2: clamp to valid domain.
Definition: fmath.h:1518
OIIO_HOSTDEVICE int pow2rounddown(int x)
Definition: fmath.h:192
GA_API const UT_StringHolder scale
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE T rotl(T x, int s) noexcept
Definition: fmath.h:273
GLdouble n
Definition: glcorearb.h:2008
void convert_type(const S *src, D *dst, size_t n, D _min, D _max)
Definition: fmath.h:945
GLfloat f
Definition: glcorearb.h:1926
SYS_API double asin(double x)
Definition: SYS_FPUMath.h:81
DataArrayProxy(I *data=NULL)
Definition: fmath.h:1372
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos(float x)
Definition: fmath.h:1820
Integer 4-vector, accelerated by SIMD instructions when available.
Definition: simd.h:890
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int ceil2(int x) noexcept
Definition: fmath.h:153
IMATH_NAMESPACE::V2f float
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan(float x)
Definition: fmath.h:1741
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf(float x)
Definition: fmath.h:2274
SYS_API float atanf(float x)
OIIO_HOSTDEVICE void trilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, const T *v4, const T *v5, const T *v6, const T *v7, Q s, Q t, Q r, Q scale, int n, T *result)
Definition: fmath.h:553
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast(const IN_TYPE &in)
Definition: fmath.h:754
#define OIIO_DASSERT
Definition: dassert.h:55
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
Definition: simd.h:7554
DataProxy(I &data)
Definition: fmath.h:1344
OIIO_FORCEINLINE OIIO_HOSTDEVICE T clamp(const T &a, const T &low, const T &high)
clamp a to bounds [low,high].
Definition: fmath.h:340
void bit_pack(cspan< T > data, void *out, int outbits)
Definition: fmath.h:1273
SYS_API void sincosf(float x, float *s, float *c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh(float x)
Definition: fmath.h:2133
OIIO_HOSTDEVICE void bilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, Q s, Q t, Q scale, int n, T *result)
Definition: fmath.h:496
OIIO_HOSTDEVICE T invert(Func &func, T y, T xmin=0.0, T xmax=1.0, int maxiters=32, T eps=1.0e-6, bool *brack=0)
Definition: fmath.h:2329
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
Definition: fmath.h:413
#define OIIO_HOSTDEVICE
Definition: platform.h:509
T operator()(unsigned char c) const noexcept
Definition: fmath.h:1412
void store(float *values) const
Definition: simd.h:6740
OIIO_HOSTDEVICE T round_to_multiple_of_pow2(T x, T m)
Definition: fmath.h:215
constexpr size_type size() const noexcept
Definition: span.h:185
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10(T x)
Safe log10: clamp to valid domain.
Definition: fmath.h:1536
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t rotl32(uint32_t x, int k)
Definition: fmath.h:298
SYS_FORCE_INLINE float log2(float x)
Definition: SYS_FPUMath.h:90
#define M_1_PI
Definition: fmath.h:102
SYS_API double acos(double x)
Definition: SYS_FPUMath.h:83
ConstDataProxy(const I &data)
Definition: fmath.h:1359
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool ispow2(T x) noexcept
Definition: fmath.h:140
OIIO_HOSTDEVICE float sign(float x)
Definition: fmath.h:732
vfloat4 round(const vfloat4 &a)
Definition: simd.h:7436
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log(T x)
Safe log: clamp to valid domain.
Definition: fmath.h:1527
GLboolean GLboolean GLboolean b
Definition: glcorearb.h:1222
#define M_PI_4
Definition: fmath.h:96
GLint GLenum GLint x
Definition: glcorearb.h:409
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2(const T &xval)
Definition: fmath.h:2014
OIIO_HOSTDEVICE float interpolate_linear(float x, span_strided< const float > y)
Definition: fmath.h:2381
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow(T x, T y)
Definition: fmath.h:1552
GLdouble t
Definition: glad.h:2397
OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos(float x, float *sine, float *cosine)
Definition: fmath.h:1707
E operator*() const
Definition: fmath.h:1393
GLfloat v0
Definition: glcorearb.h:816
#define OIIO_CLANG_PRAGMA(UnQuotedPragma)
Definition: platform.h:295
#define OIIO_NODISCARD
Definition: platform.h:477
ConstDataArrayProxy(const I *data=NULL)
Definition: fmath.h:1392
Integer 16-vector, accelerated by SIMD instructions when available.
Definition: simd.h:1478
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi(float x)
Definition: fmath.h:1811
GLint j
Definition: glad.h:2733
GLenum GLenum dst
Definition: glcorearb.h:1793
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb(float x)
Definition: fmath.h:1985
GLenum func
Definition: glcorearb.h:783
#define DASSERT_MSG
Definition: dassert.h:110
#define OIIO_UNLIKELY(x)
Definition: platform.h:380
OIIO_FORCEINLINE OIIO_HOSTDEVICE T lerp(const T &v0, const T &v1, const Q &x)
Definition: fmath.h:449
OIIO_FORCEINLINE OIIO_HOSTDEVICE T bilerp(const T &v0, const T &v1, const T &v2, const T &v3, const Q &s, const Q &t)
Definition: fmath.h:462
E operator*() const
Definition: fmath.h:1373
void store(int *values) const
Store the values into memory.
Definition: simd.h:4209
OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint(float x)
Definition: fmath.h:1630
SYS_API float sinhf(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt(T x)
Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
Definition: fmath.h:1494
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1(float x)
Definition: fmath.h:2121
SYS_API float copysignf(float x, float y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float(int x)
Definition: fmath.h:801
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_neg(const T &x)
Definition: fmath.h:696
OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int(float x)
Definition: fmath.h:798
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt(float x)
Definition: fmath.h:2221
OIIO_HOSTDEVICE void float_to_rational(float f, unsigned int &num, unsigned int &den)
Definition: fmath.h:1432
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_mod(T a, T b)
Definition: fmath.h:318
void convert_type< uint8_t, float >(const uint8_t *src, float *dst, size_t n, float, float)
Definition: fmath.h:1011
GLfloat GLfloat v1
Definition: glcorearb.h:817
GLuint GLfloat * val
Definition: glcorearb.h:1608
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
Definition: simd.h:4710
GA_API const UT_StringHolder N
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10(const T &x)
Definition: fmath.h:1972
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos(T x)
Safe (clamping) arccosine: clamp to the valid domain.
Definition: fmath.h:1509
SYS_API float asinf(float x)
GLubyte GLubyte GLubyte GLubyte w
Definition: glcorearb.h:857
Definition: core.h:1131
SYS_API float expm1f(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb(T x)
Safe logb: clamp to valid domain.
Definition: fmath.h:1545
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p(float x)
Definition: fmath.h:1998
OIIO_HOSTDEVICE D scaled_conversion(const S &src, F scale, F min, F max)
Definition: fmath.h:922
E operator[](int i) const
Definition: fmath.h:1394
GLboolean r
Definition: glcorearb.h:1222
void convert_type< uint16_t, float >(const uint16_t *src, float *dst, size_t n, float, float)
Definition: fmath.h:1029
SYS_FORCE_INLINE float exp2(float x)
Definition: SYS_FPUMath.h:98
OIIO_FORCEINLINE OIIO_HOSTDEVICE T trilerp(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
Definition: fmath.h:514
OIIO_HOSTDEVICE uint32_t clamped_mult32(uint32_t a, uint32_t b)
Definition: fmath.h:240
#define OIIO_NAMESPACE_END
Definition: oiioversion.h:94
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t rotl64(uint64_t x, int k)
Definition: fmath.h:308
const ConstDataArrayProxy< I, E > & operator+=(int i)
Definition: fmath.h:1397
vint4 min(const vint4 &a, const vint4 &b)
Definition: simd.h:4834
OIIO_FORCEINLINE T log(const T &v)
Definition: simd.h:7688
OIIO_HOSTDEVICE void swap_endian(T *f, int len=1)
Definition: fmath.h:812
void bitstring_add_n_bits(T *&out, int &filled, uint32_t val, int n)
Definition: fmath.h:1225
Classes for SIMD processing.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh(float x)
Definition: fmath.h:2166
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin(float x)
Definition: fmath.h:1835
vint4 rint(const vfloat4 &a)
Definition: simd.h:7456
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians(T deg)
Convert degrees to radians.
Definition: fmath.h:669
void convert_type< float, uint16_t >(const float *src, uint16_t *dst, size_t n, uint16_t, uint16_t)
Definition: fmath.h:1073
#define M_LN10
Definition: fmath.h:117
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp(const T &x)
Definition: fmath.h:2085
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc(float x)
Definition: fmath.h:2262
SYS_API float tanhf(float x)
OIIO_HOSTDEVICE void bicubic_interp(const T **val, T s, T t, int n, T *result)
Definition: fmath.h:599
OIIO_HOSTDEVICE uint64_t clamped_mult64(uint64_t a, uint64_t b)
Definition: fmath.h:252
SYS_API double sin(double x)
Definition: SYS_FPUMath.h:71
E operator[](int i) const
Definition: fmath.h:1374
float float_t
Definition: fmath.h:909
OIIO_HOSTDEVICE void evalBSplineWeightDerivs(T dw[4], T fraction)
Definition: fmath.h:581
E operator=(E newval)
Definition: fmath.h:1345
Definition: format.h:895
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt(T x)
Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
Definition: fmath.h:1488
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi(float x)
Definition: fmath.h:1773
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2(const T &xval)
Definition: fmath.h:1906
#define OIIO_NAMESPACE_BEGIN
Definition: oiioversion.h:93
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees(T rad)
Convert radians to degrees.
Definition: fmath.h:673
GLenum src
Definition: glcorearb.h:1793