31 #define OIIO_FMATH_H 1
38 #include <type_traits>
68 #ifndef OIIO_FMATH_SIMD_FRIENDLY
69 # define OIIO_FMATH_SIMD_FRIENDLY 0
90 # define M_PI 3.14159265358979323846264338327950288
93 # define M_PI_2 1.57079632679489661923132169163975144
96 # define M_PI_4 0.785398163397448309615660845819875721
99 # define M_TWO_PI (M_PI * 2.0)
102 # define M_1_PI 0.318309886183790671537767526745028724
105 # define M_2_PI 0.636619772367581343075535053490057448
108 # define M_SQRT2 1.41421356237309504880168872420969808
111 # define M_SQRT1_2 0.707106781186547524400844362104849039
114 # define M_LN2 0.69314718055994530941723212145817656
117 # define M_LN10 2.30258509299404568401799145468436421
120 # define M_E 2.71828182845904523536028747135266250
123 # define M_LOG2E 1.44269504088896340735992468100189214
145 return (
x & (
x - 1)) == 0 && (
x >= 0);
186 return x & ~(
x >> 1);
199 template <
typename V,
typename M>
203 value += V(multiple) - 1;
204 return value - (value % V(multiple));
218 return (x + m - 1) & (~(m - 1));
227 template <
typename V,
typename M>
231 value -= V(multiple - 1);
232 return value - (value % V(multiple));
243 uint64_t
r = (uint64_t)a * (uint64_t)
b;
244 return r < Err ? (uint32_t)r : Err;
255 if (b && ab / b != a)
276 "rotl only works for unsigned integer types");
277 return (
x <<
s) | (
x >> ((
sizeof(
T) * 8) -
s));
281 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
288 uint32_t
rotl(uint32_t
x,
int s) noexcept
290 return __funnelshift_lc(
x,
x,
s);
300 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 320
301 return __funnelshift_lc(x, x, k);
303 return (x << k) | (x >> (32 - k));
310 return (x << k) | (x >> (64 - k));
320 return b ? (a %
b) :
T(0);
340 clamp (
const T&
a,
const T& low,
const T& high)
358 return (a >= low) ? ((a <= high) ? a : high) : low;
363 #ifndef __CUDA_ARCH__
447 template <
class T,
class Q>
452 return v0*(Q(1)-
x) + v1*x;
460 template <
class T,
class Q>
466 return T ((Q(1)-t)*(v0*s1 + v1*s) + t*(v2*s1 + v3*s));
475 template <
class T,
class Q>
478 const T *
v2,
const T *
v3,
483 for (
int i = 0; i <
n; ++i)
484 result[i] =
T (t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s));
494 template <
class T,
class Q>
497 const T *
v2,
const T *
v3,
502 for (
int i = 0; i <
n; ++i)
503 result[i] +=
T (scale * (t1*(v0[i]*s1 + v1[i]*s) +
504 t*(v2[i]*s1 + v3[i]*s)));
512 template <
class T,
class Q>
514 trilerp (T
v0, T
v1, T
v2, T
v3, T v4, T v5, T v6, T v7, Q
s, Q
t, Q
r)
520 return T (r1*(t1*(v0*s1 + v1*s) + t*(v2*s1 + v3*s)) +
521 r*(t1*(v4*s1 + v5*s) + t*(v6*s1 + v7*s)));
530 template <
class T,
class Q>
533 const T *v4,
const T *v5,
const T *v6,
const T *v7,
539 for (
int i = 0; i <
n; ++i)
540 result[i] =
T (r1*(t1*(v0[i]*s1 + v1[i]*s) + t*(v2[i]*s1 + v3[i]*s)) +
541 r*(t1*(v4[i]*s1 + v5[i]*s) + t*(v6[i]*s1 + v7[i]*s)));
551 template <
class T,
class Q>
554 const T *v4,
const T *v5,
const T *v6,
const T *v7,
558 bilerp_mad (v0, v1, v2, v3, s, t, scale*r1, n, result);
559 bilerp_mad (v4, v5, v6, v7, s, t, scale*r, n, result);
566 template <
typename T>
569 T one_frac = 1 - fraction;
570 w[0] =
T(1.0 / 6.0) * one_frac * one_frac * one_frac;
571 w[1] =
T(2.0 / 3.0) -
T(0.5) * fraction * fraction * (2 - fraction);
572 w[2] =
T(2.0 / 3.0) -
T(0.5) * one_frac * one_frac * (2 - one_frac);
573 w[3] =
T(1.0 / 6.0) * fraction * fraction * fraction;
580 template <
typename T>
583 T one_frac = 1 - fraction;
584 dw[0] = -
T(0.5) * one_frac * one_frac;
585 dw[1] =
T(0.5) * fraction * (3 * fraction - 4);
586 dw[2] = -
T(0.5) * one_frac * (3 * one_frac - 4);
587 dw[3] =
T(0.5) * fraction * fraction;
601 for (
int c = 0;
c <
n; ++
c)
605 for (
int j = 0;
j < 4; ++
j) {
606 for (
int i = 0; i < 4; ++i) {
608 for (
int c = 0;
c <
n; ++
c)
609 result[
c] += w * val[
j*4+i][
c];
620 return (
int)floorf(x);
639 return x -
static_cast<float>(i);
644 #ifndef __CUDA_ARCH__
668 template <
typename T>
672 template <
typename T>
694 template <
typename T>
705 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
706 __builtin_sincosf(x, sine, cosine);
707 #elif defined(__CUDA_ARCH__)
717 sincos (
double x,
double* sine,
double* cosine)
719 #if defined(__GNUC__) && defined(__linux__) && !defined(__clang__)
720 __builtin_sincos(x, sine, cosine);
721 #elif defined(__CUDA_ARCH__)
734 return x < 0.0f ? -1.0f : (x==0.0f ? 0.0f : 1.0f);
753 template <
typename IN_TYPE,
typename OUT_TYPE>
757 static_assert(
sizeof(IN_TYPE) ==
sizeof(OUT_TYPE),
758 "bit_cast must be between objects of the same size");
760 memcpy ((
void *)&out, &in,
sizeof(IN_TYPE));
764 #if defined(__INTEL_COMPILER)
772 return static_cast<uint32_t
>(_castf32_u32(
val));
775 return static_cast<int32_t
>(_castf32_u32(
val));
778 return _castu32_f32(
val);
781 return _castu32_f32(
val);
784 return static_cast<uint64_t
>(_castf64_u64(
val));
787 return static_cast<int64_t
>(_castf64_u64(
val));
790 return _castu64_f64(
val);
793 return _castu64_f64(
val);
814 for (
char *
c = (
char *) f; len--;
c +=
sizeof(
T)) {
815 if (
sizeof(
T) == 2) {
817 }
else if (
sizeof(
T) == 4) {
820 }
else if (
sizeof(
T) == 8) {
830 #if (OIIO_GNUC_VERSION || OIIO_CLANG_VERSION || OIIO_APPLE_CLANG_VERSION || OIIO_INTEL_COMPILER_VERSION) && !defined(__CUDACC__)
833 template<>
inline void swap_endian(uint16_t*
f,
int len) {
834 for (
int i = 0; i < len; ++i)
835 f[i] = __builtin_bswap16(f[i]);
838 template<>
inline void swap_endian(uint32_t* f,
int len) {
839 for (
int i = 0; i < len; ++i)
840 f[i] = __builtin_bswap32(f[i]);
843 template<>
inline void swap_endian(uint64_t* f,
int len) {
844 for (
int i = 0; i < len; ++i)
845 f[i] = __builtin_bswap64(f[i]);
848 template<>
inline void swap_endian(int16_t* f,
int len) {
849 for (
int i = 0; i < len; ++i)
850 f[i] = __builtin_bswap16(f[i]);
853 template<>
inline void swap_endian(int32_t* f,
int len) {
854 for (
int i = 0; i < len; ++i)
855 f[i] = __builtin_bswap32(f[i]);
858 template<>
inline void swap_endian(int64_t* f,
int len) {
859 for (
int i = 0; i < len; ++i)
860 f[i] = __builtin_bswap64(f[i]);
863 template<>
inline void swap_endian(
float* f,
int len) {
867 template<>
inline void swap_endian(
double* f,
int len) {
871 #elif defined(_MSC_VER) && !defined(__CUDACC__)
874 template<>
inline void swap_endian(uint16_t* f,
int len) {
875 for (
int i = 0; i < len; ++i)
876 f[i] = _byteswap_ushort(f[i]);
879 template<>
inline void swap_endian(uint32_t* f,
int len) {
880 for (
int i = 0; i < len; ++i)
881 f[i] = _byteswap_ulong(f[i]);
884 template<>
inline void swap_endian(uint64_t* f,
int len) {
885 for (
int i = 0; i < len; ++i)
886 f[i] = _byteswap_uint64(f[i]);
889 template<>
inline void swap_endian(int16_t* f,
int len) {
890 for (
int i = 0; i < len; ++i)
891 f[i] = _byteswap_ushort(f[i]);
894 template<>
inline void swap_endian(int32_t* f,
int len) {
895 for (
int i = 0; i < len; ++i)
896 f[i] = _byteswap_ulong(f[i]);
899 template<>
inline void swap_endian(int64_t* f,
int len) {
900 for (
int i = 0; i < len; ++i)
901 f[i] = _byteswap_uint64(f[i]);
920 template<
typename S,
typename D,
typename F>
926 s += (s < 0 ? (F)-0.5 : (F)0.5);
927 return (D)
clamp(s, min, max);
929 return (D)
clamp((F)src * scale + (F)0.5, min, max);
944 template<
typename S,
typename D>
949 memcpy (dst, src, n*
sizeof(D));
961 for ( ; n >= 16; n -= 16) {
962 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
963 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
964 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
965 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
966 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
967 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
968 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
969 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
970 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
971 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
972 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
973 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
974 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
975 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
976 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
977 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
980 *dst++ = scaled_conversion<S,D,F> (*src++,
scale,
min,
max);
985 for ( ; n >= 16; n -= 16) {
986 *dst++ = (D)((*src++) *
scale);
987 *dst++ = (D)((*src++) *
scale);
988 *dst++ = (D)((*src++) *
scale);
989 *dst++ = (D)((*src++) *
scale);
990 *dst++ = (D)((*src++) *
scale);
991 *dst++ = (D)((*src++) *
scale);
992 *dst++ = (D)((*src++) *
scale);
993 *dst++ = (D)((*src++) *
scale);
994 *dst++ = (D)((*src++) *
scale);
995 *dst++ = (D)((*src++) *
scale);
996 *dst++ = (D)((*src++) *
scale);
997 *dst++ = (D)((*src++) *
scale);
998 *dst++ = (D)((*src++) *
scale);
999 *dst++ = (D)((*src++) *
scale);
1000 *dst++ = (D)((*src++) *
scale);
1001 *dst++ = (D)((*src++) *
scale);
1004 *dst++ = (D)((*src++) *
scale);
1009 #ifndef __CUDA_ARCH__
1012 float *
dst,
size_t n,
1017 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1023 *dst++ = (*src++) * scale;
1030 float *
dst,
size_t n,
1035 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1041 *dst++ = (*src++) * scale;
1045 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1047 inline void convert_type<half,float> (
const half *
src,
1048 float *
dst,
size_t n,
1051 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1053 for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1059 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1074 uint16_t , uint16_t )
1081 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1088 *dst++ = scaled_conversion<float,uint16_t,float> (*src++,
scale,
min,
max);
1102 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1109 *dst++ = scaled_conversion<float,uint8_t,float> (*src++,
scale,
min,
max);
1113 #if defined(_HALF_H_) || defined(IMATH_HALF_H_)
1116 convert_type<float,half> (
const float *
src,
half *
dst,
size_t n,
1119 #if OIIO_SIMD >= 8 && OIIO_F16C_ENABLED
1121 for ( ; n >= 8; n -= 8, src += 8, dst += 8) {
1127 for ( ; n >= 4; n -= 4, src += 4, dst += 4) {
1140 template<
typename S,
typename D>
1143 convert_type<S,D> (
src,
dst,
n,
1156 template<
typename S,
typename D>
1176 return (D)((F)src * scale);
1193 template<
unsigned int FROM_BITS,
unsigned int TO_BITS>
1195 unsigned int out = 0;
1196 int shift = TO_BITS - FROM_BITS;
1197 for (; shift > 0; shift -= FROM_BITS)
1199 out |= in >> -shift;
1209 unsigned int out = 0;
1210 int shift = TO_BITS - FROM_BITS;
1211 for (; shift > 0; shift -= FROM_BITS)
1213 out |= in >> -shift;
1223 template<
typename T>
1230 "bitstring_add_n_bits must be unsigned int 8/16/32");
1231 const int Tbits =
sizeof(
T) * 8;
1239 val &= ~(0xffffffff <<
n);
1245 int bits_left_in_out = Tbits - filled;
1248 if (n <= bits_left_in_out) {
1249 b = val << (bits_left_in_out -
n);
1252 b = val >> (n - bits_left_in_out);
1253 nb = bits_left_in_out;
1259 if (filled == Tbits) {
1271 template<
typename T>
1278 "bit_pack must be unsigned int 8/16/32");
1279 unsigned char* outbuffer = (
unsigned char*)out;
1281 for (
size_t i = 0, e = data.
size(); i < e; ++i)
1291 template<
typename T>
1298 "bit_unpack must be unsigned int 8/16/32");
1304 for (
int i = 0; i <
n; ++i) {
1307 while (valbits < inbits) {
1311 int out_left = inbits - valbits;
1312 int in_left = 8 -
b;
1313 if (in_left <= out_left) {
1318 val |= in[
B] & ~(0xffffffff << in_left);
1327 int extra = 8 -
b - out_left;
1328 val |= (in[
B] >> extra) & ~(0xffffffff << out_left);
1342 template<
typename I,
typename E>
1345 E
operator= (E newval) { m_data = convert_type<E,I>(newval);
return newval; }
1346 operator E ()
const {
return convert_type<I,E>(m_data); }
1357 template<
typename I,
typename E>
1360 operator E ()
const {
return convert_type<E,I>(*m_data); }
1370 template<
typename I,
typename E>
1374 E
operator[] (
int i)
const {
return convert_type<I,E>(m_data[i]); }
1377 I *
get ()
const {
return m_data; }
1379 m_data += i;
return *
this;
1390 template<
typename I,
typename E>
1394 E
operator[] (
int i)
const {
return convert_type<I,E>(m_data[i]); }
1396 const I *
get ()
const {
return m_data; }
1398 m_data += i;
return *
this;
1408 template <
class T=
float>
1415 void init () noexcept {
1416 float scale = 1.0f / 255.0f;
1419 for (
int i = 0; i < 256; ++i)
1437 }
else if ((
int)(1.0/f) == (1.0/f)) {
1443 while (fabsf(f-static_cast<float>(num)) > 0.00001f && den < 1000000) {
1463 num = (f >= 0) ? (
int)n : -(
int)n;
1487 template <
typename T>
1493 template <
typename T>
1500 template <
typename T>
1508 template <
typename T>
1510 if (x <=
T(-1))
return T(
M_PI);
1511 if (x >=
T(+1))
return T(0);
1517 template <
typename T>
1526 template <
typename T>
1535 template <
typename T>
1544 template <
typename T>
1551 template <
typename T>
1553 if (y ==
T(0))
return T(1);
1554 if (x ==
T(0))
return T(0);
1556 if ((x <
T(0)) && (y !=
floor(y)))
return T(0);
1561 return clamp (r, -big, big);
1579 int N =
static_cast<int>(a/
b);
1586 #define OIIO_FMATH_HAS_SAFE_FMOD 1
1632 #if OIIO_SIMD_SSE >= 4
1637 return static_cast<int>(x +
copysignf(0.5f, x));
1641 #ifndef __CUDA_ARCH__
1649 #ifndef __CUDA_ARCH__
1655 float qf =
float(q);
1656 x =
madd(qf, -0.78515625f*4, x);
1657 x =
madd(qf, -0.00024187564849853515625f*4, x);
1658 x =
madd(qf, -3.7747668102383613586e-08f*4, x);
1659 x =
madd(qf, -1.2816720341285448015e-12f*4, x);
1662 if ((q & 1) != 0) x = -x;
1665 float u = 2.6083159809786593541503e-06
f;
1666 u =
madd(u, s, -0.0001981069071916863322258f);
1667 u =
madd(u, s, +0.00833307858556509017944336f);
1668 u =
madd(u, s, -0.166666597127914428710938f);
1669 u =
madd(s, u * x, x);
1673 return clamp(u, -1.0f, 1.0f);
1681 #ifndef __CUDA_ARCH__
1684 float qf =
float(q);
1685 x =
madd(qf, -0.78515625f*4, x);
1686 x =
madd(qf, -0.00024187564849853515625f*4, x);
1687 x =
madd(qf, -3.7747668102383613586e-08f*4, x);
1688 x =
madd(qf, -1.2816720341285448015e-12f*4, x);
1693 float u = -2.71811842367242206819355e-07
f;
1694 u =
madd(u, s, +2.47990446951007470488548e-05f);
1695 u =
madd(u, s, -0.00138888787478208541870117f);
1696 u =
madd(u, s, +0.0416666641831398010253906f);
1697 u =
madd(u, s, -0.5f);
1698 u =
madd(u, s, +1.0f);
1699 if ((q & 1) != 0) u = -u;
1700 return clamp(u, -1.0f, 1.0f);
1708 #ifndef __CUDA_ARCH__
1711 float qf =
float(q);
1712 x =
madd(qf, -0.78515625f*4, x);
1713 x =
madd(qf, -0.00024187564849853515625f*4, x);
1714 x =
madd(qf, -3.7747668102383613586e-08f*4, x);
1715 x =
madd(qf, -1.2816720341285448015e-12f*4, x);
1719 if ((q & 1) != 0) x = -x;
1720 float su = 2.6083159809786593541503e-06
f;
1721 su =
madd(su, s, -0.0001981069071916863322258f);
1722 su =
madd(su, s, +0.00833307858556509017944336f);
1723 su =
madd(su, s, -0.166666597127914428710938f);
1724 su =
madd(s, su * x, x);
1725 float cu = -2.71811842367242206819355e-07
f;
1726 cu =
madd(cu, s, +2.47990446951007470488548e-05f);
1727 cu =
madd(cu, s, -0.00138888787478208541870117f);
1728 cu =
madd(cu, s, +0.0416666641831398010253906f);
1729 cu =
madd(cu, s, -0.5f);
1730 cu =
madd(cu, s, +1.0f);
1731 if ((q & 1) != 0) cu = -cu;
1732 *sine =
clamp(su, -1.0f, 1.0f);;
1733 *cosine =
clamp(cu, -1.0f, 1.0f);;
1735 __sincosf(x, sine, cosine);
1742 #ifndef __CUDA_ARCH__
1747 float qf =
float(q);
1748 x =
madd(qf, -0.78515625f*2, x);
1749 x =
madd(qf, -0.00024187564849853515625f*2, x);
1750 x =
madd(qf, -3.7747668102383613586e-08f*2, x);
1751 x =
madd(qf, -1.2816720341285448015e-12f*2, x);
1755 float u = 0.00927245803177356719970703f;
1756 u =
madd(u, s, 0.00331984995864331722259521f);
1757 u =
madd(u, s, 0.0242998078465461730957031f);
1758 u =
madd(u, s, 0.0534495301544666290283203f);
1759 u =
madd(u, s, 0.133383005857467651367188f);
1760 u =
madd(u, s, 0.333331853151321411132812f);
1761 u =
madd(s, u * x, x);
1775 #ifndef __CUDA_ARCH__
1777 const float z = x - ((x + 25165824.0f) - 25165824.0f);
1778 const float y = z - z * fabsf(z);
1779 const float Q = 3.10396624f;
1780 const float P = 3.584135056f;
1781 return y * (Q + P * fabsf(y));
1813 #ifndef __CUDA_ARCH__
1821 #ifndef __CUDA_ARCH__
1822 const float f = fabsf(x);
1823 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
1828 const float a = sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1836 #ifndef __CUDA_ARCH__
1839 const float f = fabsf(x);
1840 const float m = (f < 1.0f) ? 1.0f - (1.0f - f) : 1.0f;
1841 const float a =
float(
M_PI_2) - sqrtf(1.0f - m) * (1.5707963267f + m * (-0.213300989f + m * (0.077980478f + m * -0.02164095f)));
1849 #ifndef __CUDA_ARCH__
1850 const float a = fabsf(x);
1851 const float k = a > 1.0f ? 1 / a :
a;
1852 const float s = 1.0f - (1.0f - k);
1853 const float t = s *
s;
1859 float r = s *
madd(0.430165678f, t, 1.0f) /
madd(
madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1860 if (a > 1.0f) r = 1.570796326794896557998982f -
r;
1868 #ifndef __CUDA_ARCH__
1872 const float a = fabsf(x);
1873 const float b = fabsf(y);
1874 bool b_is_greater_than_a = b >
a;
1876 #if OIIO_FMATH_SIMD_FRIENDLY
1881 float sa = b_is_greater_than_a ? b :
a;
1882 float sb = b_is_greater_than_a ? a :
b;
1883 const float k = (b == 0) ? 0.0f : sb/sa;
1885 const float k = (b == 0) ? 0.0f : ((a == b) ? 1.0f : (b > a ? a / b : b /
a));
1888 const float s = 1.0f - (1.0f - k);
1889 const float t = s *
s;
1891 float r = s *
madd(0.430165678f, t, 1.0f) /
madd(
madd(0.0579354987f, t, 0.763007998f), t, 1.0f);
1893 if (b_is_greater_than_a)
1894 r = 1.570796326794896557998982f -
r;
1896 if (bit_cast<float, unsigned>(x) & 0x80000000u)
1905 template<
typename T>
1907 using namespace simd;
1908 typedef typename T::vint_t intN;
1912 intN exponent =
srl (bits, 23) - intN(127);
1916 T hi =
madd(f,
T(-0.00931049621349f),
T( 0.05206469089414f));
1917 T lo =
madd(f,
T( 0.47868480909345f),
T(-0.72116591947498f));
1918 hi =
madd(f, hi,
T(-0.13753123777116f));
1919 hi =
madd(f, hi,
T( 0.24187369696082f));
1920 hi =
madd(f, hi,
T(-0.34730547155299f));
1921 lo =
madd(f, lo,
T( 1.442689881667200f));
1922 return ((f4 * hi) + (f * lo)) +
T(exponent);
1928 #ifndef __CUDA_ARCH__
1933 int exponent =
int(bits >> 23) - 127;
1934 float f =
bit_cast<unsigned,
float>((bits & 0x007FFFFF) | 0x3f800000) - 1.0f;
1942 float hi =
madd(f, -0.00931049621349f, 0.05206469089414f);
1943 float lo =
madd(f, 0.47868480909345f, -0.72116591947498f);
1944 hi =
madd(f, hi, -0.13753123777116f);
1945 hi =
madd(f, hi, 0.24187369696082f);
1946 hi =
madd(f, hi, -0.34730547155299f);
1947 lo =
madd(f, lo, 1.442689881667200f);
1948 return ((f4 * hi) + (f * lo)) + exponent;
1950 return __log2f(xval);
1956 template<
typename T>
1962 #ifdef __CUDA_ARCH__
1971 template<
typename T>
1977 #ifdef __CUDA_ARCH__
1986 #ifndef __CUDA_ARCH__
1992 return float (
int(bits >> 23) - 127);
1999 #ifndef __CUDA_ARCH__
2000 if (fabsf(x) < 0.01f) {
2001 float y = 1.0f - (1.0f -
x);
2013 template<
typename T>
2015 using namespace simd;
2016 typedef typename T::vint_t intN;
2019 T x =
clamp (xval,
T(-126.0f),
T(126.0f));
2020 intN m (
x);
x -=
T(m);
2022 x = one - (one -
x);
2023 const T kA (1.33336498402e-3f);
2024 const T kB (9.810352697968e-3f);
2025 const T kC (5.551834031939e-2f);
2026 const T kD (0.2401793301105f);
2027 const T kE (0.693144857883f);
2033 r =
madd(
x, r, one);
2037 for (
int i = 0; i < r.elements; ++i)
2039 for (
int i = r.elements; i < r.paddedelements; ++i)
2048 #if OIIO_ANY_CLANG && !OIIO_INTEL_LLVM_COMPILER && OIIO_FMATH_SIMD_FRIENDLY
2052 #elif !defined(__CUDA_ARCH__)
2054 float x =
clamp (xval, -126.0f, 126.0f);
2056 int m =
int(x); x -= m;
2057 x = 1.0f - (1.0f -
x);
2063 float r = 1.33336498402e-3
f;
2064 r =
madd(x, r, 9.810352697968e-3f);
2065 r =
madd(x, r, 5.551834031939e-2f);
2066 r =
madd(x, r, 0.2401793301105f);
2067 r =
madd(x, r, 0.693144857883f);
2068 r =
madd(x, r, 1.0f);
2084 template <
typename T>
2090 #ifdef __CUDA_ARCH__
2102 #if defined(__x86_64__) && defined(__GNU_LIBRARY__) && defined(__GLIBC__ ) && defined(__GLIBC_MINOR__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 16
2105 return static_cast<float>(std::exp(static_cast<double>(x)));
2113 #ifndef __CUDA_ARCH__
2122 #ifndef __CUDA_ARCH__
2123 if (fabsf(x) < 0.03f) {
2124 float y = 1.0f - (1.0f -
x);
2134 #ifndef __CUDA_ARCH__
2139 return copysignf(0.5f * e - 0.5f / e, x);
2141 a = 1.0f - (1.0f -
a);
2145 float r = 2.03945513931e-4
f;
2146 r =
madd(r, a2, 8.32990277558e-3f);
2147 r =
madd(r, a2, 0.1666673421859f);
2148 r =
madd(r * a, a2, a);
2157 #ifndef __CUDA_ARCH__
2160 return 0.5f * e + 0.5f / e;
2167 #ifndef __CUDA_ARCH__
2170 float e =
fast_exp(2.0f * fabsf(x));
2178 if (y == 0)
return 1.0f;
2179 if (x == 0)
return 0.0f;
2184 #ifndef __CUDA_ARCH__
2195 if (ybits >= 0x4b800000) {
2197 }
else if (ybits >= 0x3f800000) {
2199 int k = (ybits >> 23) - 127;
2200 int j = ybits >> (23 - k);
2201 if ((j << (23 - k)) == ybits)
2202 sign =
bit_cast<
int,
float>(0x3f800000 | (j << 31));
2214 template<
typename T,
typename U>
2222 #ifndef __CUDA_ARCH__
2223 float x0 = fabsf(x);
2228 a = 0.333333333f * (2.0f * a + x0 / (a *
a));
2229 a = 0.333333333f * (2.0f * a + x0 / (a *
a));
2230 a = (x0 == 0) ? 0 : a;
2240 #ifndef __CUDA_ARCH__
2243 const float a1 = 0.0705230784f;
2244 const float a2 = 0.0422820123f;
2245 const float a3 = 0.0092705272f;
2246 const float a4 = 0.0001520143f;
2247 const float a5 = 0.0002765672f;
2248 const float a6 = 0.0000430638f;
2249 const float a = fabsf(x);
2250 const float b = 1.0f - (1.0f -
a);
2251 const float r =
madd(
madd(
madd(
madd(
madd(
madd(a6, b, a5), b, a4), b, a3), b, a2), b, a1), b, 1.0f);
2252 const float s = r *
r;
2253 const float t = s *
s;
2254 const float u = t *
t;
2255 const float v = u * u;
2264 #ifndef __CUDA_ARCH__
2278 float a = fabsf(x);
if (a > 0.99999994f) a = 0.99999994f;
2279 float w = -
fast_log((1.0f - a) * (1.0f + a)), p;
2282 p = 2.81022636e-08
f;
2283 p =
madd(p, w, 3.43273939e-07f);
2284 p =
madd(p, w, -3.5233877e-06f );
2285 p =
madd(p, w, -4.39150654e-06f);
2286 p =
madd(p, w, 0.00021858087f );
2287 p =
madd(p, w, -0.00125372503f );
2288 p =
madd(p, w, -0.00417768164f );
2289 p =
madd(p, w, 0.246640727f );
2290 p =
madd(p, w, 1.50140941f );
2292 w = sqrtf(w) - 3.0f;
2293 p = -0.000200214257f;
2294 p =
madd(p, w, 0.000100950558f);
2295 p =
madd(p, w, 0.00134934322f );
2296 p =
madd(p, w, -0.00367342844f );
2297 p =
madd(p, w, 0.00573950773f );
2298 p =
madd(p, w, -0.0076224613f );
2299 p =
madd(p, w, 0.00943887047f );
2300 p =
madd(p, w, 1.00167406f );
2301 p =
madd(p, w, 2.83297682f );
2330 int maxiters=32, T eps=1.0e-6,
bool *brack=0)
2338 bool increasing = (v0 <
v1);
2339 T vmin = increasing ? v0 :
v1;
2340 T vmax = increasing ?
v1 :
v0;
2341 bool bracketed = (y >= vmin && y <= vmax);
2347 return ((y < vmin) == increasing) ? xmin : xmax;
2349 if (fabs(v0-
v1) < eps)
2351 int rfiters = (3*maxiters)/4;
2352 for (
int iters = 0; iters < maxiters; ++iters) {
2354 if (iters < rfiters) {
2357 if (t <=
T(0) || t >=
T(1))
2362 x =
lerp (xmin, xmax, t);
2364 if ((v < y) == increasing) {
2369 if (fabs(xmax-xmin) < eps || fabs(v-y) < eps)
2383 #ifndef __CUDA_ARCH__
2384 DASSERT_MSG (y.
size() >= 2,
"interpolate_linear needs at least 2 knot values (%zd)", y.
size());
2386 x =
clamp (x,
float(0.0),
float(1.0));
2387 int nsegs =
int(y.
size()) - 1;
2390 #ifndef __CUDA_ARCH__
2391 int nextseg =
std::min (segnum+1, nsegs);
2393 int nextseg =
min (segnum+1, nsegs);
2395 return lerp (y[segnum], y[nextseg], x);
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_safe_pow(float x, float y)
SYS_API float acosf(float x)
vint4 max(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_correct_exp(float x)
Faster float exp than is in libm, but still 100% accurate.
SYS_API double cos(double x)
typedef int(APIENTRYP RE_PFNGLXSWAPINTERVALSGIPROC)(int)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_pow_pos(const T &x, const U &y)
SYS_API float coshf(float x)
SYS_API double fmod(double x, double y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log(const T &x)
EightBitConverter() noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_exp10(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan2(float y, float x)
SYS_API float log1pf(float x)
vfloat4 bitcast_to_float(const vint4 &x)
OIIO_HOSTDEVICE void evalBSplineWeights(T w[4], T fraction)
OIIO_HOSTDEVICE V round_down_to_multiple(V value, M multiple)
imath_half_bits_t half
if we're in a C-only context, alias the half bits type to half
vint4 srl(const vint4 &val, const unsigned int bits)
void bit_unpack(int n, const unsigned char *in, int inbits, T *out)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_atan(float x)
IMATH_HOSTDEVICE constexpr int floor(T x) IMATH_NOEXCEPT
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_asin(T x)
Safe (clamping) arcsine: clamp to the valid domain.
void swap(UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &a, UT::ArraySet< Key, MULTI, MAX_LOAD_FACTOR_256, Clearer, Hash, KeyEqual > &b)
const DataArrayProxy< I, E > & operator+=(int i)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmsub(float a, float b, float c)
Negative fused multiply and subtract: -(a*b) - c.
GLsizei const GLfloat * value
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sin(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float nmadd(float a, float b, float c)
Fused negative multiply and add: -(a*b) + c.
vfloat4 sqrt(const vfloat4 &a)
GLdouble GLdouble GLdouble z
OIIO_FORCEINLINE OIIO_HOSTDEVICE float safe_fmod(float a, float b)
OIIO_HOSTDEVICE void sincos(float x, float *sine, float *cosine)
GLboolean GLboolean GLboolean GLboolean a
SYS_API float atan2f(float y, float x)
ImageBuf OIIO_API min(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float msub(float a, float b, float c)
Fused multiply and subtract: (a*b - c)
OIIO_HOSTDEVICE unsigned int bit_range_convert(unsigned int in)
OIIO_HOSTDEVICE V round_to_multiple(V value, M multiple)
**But if you need a result
GLfloat GLfloat GLfloat v2
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int floor2(int x) noexcept
GLdouble GLdouble GLdouble q
Integer 8-vector, accelerated by SIMD instructions when available.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int ifloor(float x)
Return floor(x) cast to an int.
GLfloat GLfloat GLfloat GLfloat v3
OIIO_HOSTDEVICE int pow2roundup(int x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erf(float x)
void convert_type< float, uint8_t >(const float *src, uint8_t *dst, size_t n, uint8_t, uint8_t)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cos(float x)
constexpr size_type size() const noexcept
OIIO_HOSTDEVICE float floorfrac(float x, int *xint)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cosh(float x)
SYS_API float log10f(float x)
ImageBuf OIIO_API pow(const ImageBuf &A, cspan< float > B, ROI roi={}, int nthreads=0)
vfloat4 floor(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log2(T x)
Safe log2: clamp to valid domain.
OIIO_HOSTDEVICE int pow2rounddown(int x)
GA_API const UT_StringHolder scale
OIIO_NODISCARD OIIO_FORCEINLINE OIIO_HOSTDEVICE T rotl(T x, int s) noexcept
void convert_type(const S *src, D *dst, size_t n, D _min, D _max)
SYS_API double asin(double x)
DataArrayProxy(I *data=NULL)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_acos(float x)
Integer 4-vector, accelerated by SIMD instructions when available.
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 int ceil2(int x) noexcept
IMATH_NAMESPACE::V2f float
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tan(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_ierf(float x)
SYS_API float atanf(float x)
OIIO_HOSTDEVICE void trilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, const T *v4, const T *v5, const T *v6, const T *v7, Q s, Q t, Q r, Q scale, int n, T *result)
OIIO_FORCEINLINE OIIO_HOSTDEVICE OUT_TYPE bit_cast(const IN_TYPE &in)
vfloat4 madd(const vfloat4 &a, const vfloat4 &b, const vfloat4 &c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T clamp(const T &a, const T &low, const T &high)
clamp a to bounds [low,high].
void bit_pack(cspan< T > data, void *out, int outbits)
SYS_API void sincosf(float x, float *s, float *c)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinh(float x)
OIIO_HOSTDEVICE void bilerp_mad(const T *v0, const T *v1, const T *v2, const T *v3, Q s, Q t, Q scale, int n, T *result)
OIIO_HOSTDEVICE T invert(Func &func, T y, T xmin=0.0, T xmax=1.0, int maxiters=32, T eps=1.0e-6, bool *brack=0)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float madd(float a, float b, float c)
Fused multiply and add: (a*b + c)
T operator()(unsigned char c) const noexcept
void store(float *values) const
OIIO_HOSTDEVICE T round_to_multiple_of_pow2(T x, T m)
constexpr size_type size() const noexcept
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log10(T x)
Safe log10: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint32_t rotl32(uint32_t x, int k)
SYS_FORCE_INLINE float log2(float x)
SYS_API double acos(double x)
ConstDataProxy(const I &data)
OIIO_HOSTDEVICE OIIO_CONSTEXPR14 bool ispow2(T x) noexcept
OIIO_HOSTDEVICE float sign(float x)
vfloat4 round(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_log(T x)
Safe log: clamp to valid domain.
GLboolean GLboolean GLboolean b
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp2(const T &xval)
OIIO_HOSTDEVICE float interpolate_linear(float x, span_strided< const float > y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_pow(T x, T y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE void fast_sincos(float x, float *sine, float *cosine)
ConstDataArrayProxy(const I *data=NULL)
Integer 16-vector, accelerated by SIMD instructions when available.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cospi(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_logb(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T lerp(const T &v0, const T &v1, const Q &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T bilerp(const T &v0, const T &v1, const T &v2, const T &v3, const Q &s, const Q &t)
void store(int *values) const
Store the values into memory.
OIIO_FORCEINLINE OIIO_HOSTDEVICE int fast_rint(float x)
SYS_API float sinhf(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_inversesqrt(T x)
Safe (clamping) inverse sqrt: safe_inversesqrt(x<=0) returns 0.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_expm1(float x)
SYS_API float copysignf(float x, float y)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float bitcast_to_float(int x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_neg(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE int bitcast_to_int(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_cbrt(float x)
OIIO_HOSTDEVICE void float_to_rational(float f, unsigned int &num, unsigned int &den)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_mod(T a, T b)
void convert_type< uint8_t, float >(const uint8_t *src, float *dst, size_t n, float, float)
ImageBuf OIIO_API max(Image_or_Const A, Image_or_Const B, ROI roi={}, int nthreads=0)
vint4 bitcast_to_int(const vbool4 &x)
Bitcast back and forth to intN (not a convert – move the bits!)
GA_API const UT_StringHolder N
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log10(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_acos(T x)
Safe (clamping) arccosine: clamp to the valid domain.
SYS_API float asinf(float x)
GLubyte GLubyte GLubyte GLubyte w
SYS_API float expm1f(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_logb(T x)
Safe logb: clamp to valid domain.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_log1p(float x)
OIIO_HOSTDEVICE D scaled_conversion(const S &src, F scale, F min, F max)
E operator[](int i) const
void convert_type< uint16_t, float >(const uint16_t *src, float *dst, size_t n, float, float)
SYS_FORCE_INLINE float exp2(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T trilerp(T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, Q s, Q t, Q r)
OIIO_HOSTDEVICE uint32_t clamped_mult32(uint32_t a, uint32_t b)
#define OIIO_NAMESPACE_END
OIIO_FORCEINLINE OIIO_HOSTDEVICE uint64_t rotl64(uint64_t x, int k)
const ConstDataArrayProxy< I, E > & operator+=(int i)
vint4 min(const vint4 &a, const vint4 &b)
OIIO_FORCEINLINE T log(const T &v)
OIIO_HOSTDEVICE void swap_endian(T *f, int len=1)
void bitstring_add_n_bits(T *&out, int &filled, uint32_t val, int n)
Classes for SIMD processing.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_tanh(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_asin(float x)
vint4 rint(const vfloat4 &a)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T radians(T deg)
Convert degrees to radians.
void convert_type< float, uint16_t >(const float *src, uint16_t *dst, size_t n, uint16_t, uint16_t)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_exp(const T &x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_erfc(float x)
SYS_API float tanhf(float x)
OIIO_HOSTDEVICE void bicubic_interp(const T **val, T s, T t, int n, T *result)
OIIO_HOSTDEVICE uint64_t clamped_mult64(uint64_t a, uint64_t b)
SYS_API double sin(double x)
E operator[](int i) const
OIIO_HOSTDEVICE void evalBSplineWeightDerivs(T dw[4], T fraction)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T safe_sqrt(T x)
Safe (clamping) sqrt: safe_sqrt(x<0) returns 0, not NaN.
OIIO_FORCEINLINE OIIO_HOSTDEVICE float fast_sinpi(float x)
OIIO_FORCEINLINE OIIO_HOSTDEVICE T fast_log2(const T &xval)
#define OIIO_NAMESPACE_BEGIN
OIIO_FORCEINLINE OIIO_HOSTDEVICE T degrees(T rad)
Convert radians to degrees.