10 #ifndef __UT_UnicodeImpl__
11 #define __UT_UnicodeImpl__
25 getCharacterInfo(
utf32 cp)
33 return block[cp & 255];
41 return ((
uchar(c) & 0x80) == 0);
45 isContinuation(
utf8 c)
47 return ((
uchar(c) & 0xC0) == 0x80);
51 isValidLeading(
utf8 c)
54 ((
uchar(c) & 0xE0) == 0xC0) ||
55 ((
uchar(c) & 0xF0) == 0xE0) ||
56 ((
uchar(c) & 0xF8) == 0xF0);
60 getContinuationCount(
utf8 c)
62 if ((
uchar(c) & 0xE0) == 0xC0)
64 else if ((
uchar(c) & 0xF0) == 0xE0)
66 else if ((
uchar(c) & 0xF8) == 0xF0)
76 return isValidLeading(octet) || isContinuation(octet);
97 int cont_bytes = getContinuationCount(c);
106 static const utf32 least_values[4] = { 0x0, 0x80, 0x800, 0x10000 };
107 utf32 least_value = least_values[cont_bytes];
111 int shift = cont_bytes * 6;
120 result |= (c &
mask) << shift;
122 if (cont_bytes-- == 0)
128 if (!isContinuation(c))
155 if (buf && buflen >= 1)
159 else if (cp < 0x00000800)
161 if (buf && buflen >= 2)
163 buf[0] = 0xC0 |
utf8(cp >> 6);
164 buf[1] = 0x80 |
utf8(cp & 0x3F);
168 else if (cp < 0x00010000)
174 if (buf && buflen >= 3)
176 buf[0] = 0xE0 |
utf8(cp >> 12);
177 buf[1] = 0x80 |
utf8((cp >> 6) & 0x3F);
178 buf[2] = 0x80 |
utf8(cp & 0x3F);
182 else if (cp < 0x110000)
184 if (buf && buflen >= 4)
186 buf[0] = 0xF0 |
utf8(cp >> 18);
187 buf[1] = 0x80 |
utf8((cp >> 12) & 0x3F);
188 buf[2] = 0x80 |
utf8((cp >> 6) & 0x3F);
189 buf[3] = 0x80 |
utf8(cp & 0x3F);
207 return (c & 0xFF) << 8 | (c >> 8);
221 utf16 c0 = norm16(str[0], big_endian);
229 utf16 c1 = norm16(str[1], big_endian);
230 if ((c0 >= 0xD800 && c0 < 0xDC00) && (c1 >= 0xDC00 && c1 < 0xE000))
232 static const utf32 offset = ((0xD800 << 10) + 0xDC00) - 0x10000;
254 if (buf && buflen >=
sizeof(
utf16))
260 if (buf && buflen >=
sizeof(
utf16[2]))
263 buf[0] =
utf16(0xD800 | ((cp >> 10) & 0x03FF));
264 buf[1] =
utf16(0xDC00 | (cp & 0x03FF));
275 return const_cast<utf8 *
>(
next(const_cast<const utf8 *>(current)));
298 if (isContinuation(c))
305 while (isContinuation(*current++) && nb_cont--) { }
310 if (!isValidLeading(*current))
315 else if ((nb_cont = getContinuationCount(c)) == 0)
323 for (
int i = 0; i < nb_cont; i++)
325 if (!isContinuation(*current++))
329 if (*current && !isValidLeading(*current))
338 return const_cast<utf8 *
>(
prev(start, const_cast<const utf8 *>(current)));
344 if (!current || !start)
348 if (start >= current)
354 if ( isValidLeading(current[0]) &&
isASCII(current[-1]))
362 if (isValidLeading(*current))
366 else if (!isContinuation(*current))
374 while(current >= start && isContinuation(*current))
385 if (nb_cont > getContinuationCount(*current))
391 template<
bool backward>
395 if (!at || (backward && start == at))
398 const auto is_dot = [](
utf32 cp) {
return cp ==
'.'; };
401 const auto is_alnum = [](
utf32 cp)
404 cp ==
'_' || cp ==
'@';
406 const auto is_punct = [](
utf32 cp)
408 return cp ==
'{' || cp ==
'[' || cp ==
'(' || cp ==
';' ||
409 cp ==
'}' || cp ==
']' || cp ==
')' || cp ==
',' || cp ==
'.' ||
410 cp ==
'\n' || cp ==
'\r';
414 const auto read = [](
const utf8 *at)
419 const auto is_numerical = [&](
const utf8 *
start,
const utf8 *at)
423 return is_digit(p) && is_digit(
n);
428 if (!
first && backward)
439 enum State { SPACE, ALNUM, PUNCT, OTHER };
441 : is_punct(
first) ? PUNCT : OTHER;
444 if (state == PUNCT && is_dot(
first) && is_numerical(start, at))
447 const auto is_continuous = [&](
auto &&cp)
451 case SPACE:
return is_space(cp);
452 case ALNUM:
return is_alnum(cp);
453 case PUNCT:
return false;
454 default :
return !is_space(cp) && !is_alnum(cp) && !is_punct(cp);
470 if (!(is_dot(cp) && is_numerical(start, at)) &&
487 if (!(is_dot(cp) && is_numerical(start, p)) &&
501 return utFindWordBoundary<false>(
start, current);
507 return utFindWordBoundary<true>(
start, current);
513 return fixpos(start, const_cast<const utf8 *&>(current));
519 if (isContinuation(*current))
520 return prev(start, current) != NULL;
551 return strlen((
const char *)start);
553 return exint(end - start);
566 return (
utf8 *)::strdup((
const char *)start);
573 ::memcpy(buf, start, length);
583 if (cp == 0 || !start)
592 return (
const utf8 *)strchr((
const char *)start,
char(cp));
595 while(start < end && *start != cp)
597 return start == end ? NULL :
start;
615 if (!end || next < end)
635 return (
const utf8 *)::strstr((
const char *)start, (
const char *)str);
639 size_t len = ::strlen((
const char *)str);
643 if (!len || (end - start) < len)
647 while(start < (end - len))
715 return (cp >= 0x04E00 && cp <= 0x09FFF) ||
716 (cp >= 0x03400 && cp <= 0x04DBF) ||
717 (cp >= 0x20000 && cp <= 0x2A6D6);
751 init(NULL, NULL, NULL);
757 init(start, end, transform);
778 void UT_Unicode::iterator::init(
const utf8 *start,
const utf8 *
end,
785 if (start && isValidLeading(*start) && (!end || start < end))
793 myStart = myEnd = myNext = myNext2 = NULL;
806 else if (to < myStart || (myEnd && to > myEnd))
813 myNext =
convert(myCurrent, myCP);
818 if (myTransform && myCP)
819 myCP = myTransform->transformCodepoint(myCP);
820 if (!myEnd || (myNext < myEnd))
822 myNext2 =
convert(myNext, myNextCP);
823 if (myTransform && myNextCP)
824 myNextCP = myTransform->transformCodepoint(myNextCP);
835 if (!myStart || !myCP || (myEnd && myCurrent >= myEnd))
843 if(!myEnd || (myNext < myEnd))
845 myNext2 =
convert(myNext, myNextCP);
846 if (myTransform && myNextCP)
847 myNextCP = myTransform->transformCodepoint(myNextCP);
861 if (!myStart || myCurrent == myStart)
871 if (myTransform && cp)
872 cp = myTransform->transformCodepoint(cp);
884 #endif // __UT_UnicodeImpl__
static bool isUpper(utf32 cp)
SYS_FORCE_INLINE const_iterator begin() const
GLenum GLuint GLenum GLsizei const GLchar * buf
static bool isValidCodePoint(utf32 cp)
static bool isSpace(utf32 cp, bool break_only=true)
static bool isLower(utf32 cp)
static bool isAlpha(utf32 cp)
unsigned int myComplement
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator end() const
Returns a constant iterator pointing to the end of the string.
GLuint GLsizei GLsizei * length
static bool isFromSupplementaryPlane(utf32 cp)
static bool isPunct(utf32 cp)
**But if you need a result
void reset(const utf8 *to=0)
void read(T &in, bool &v)
const char * c_str() const
static const utf8 * prev(const utf8 *start, const utf8 *current)
SYS_FORCE_INLINE const_iterator end() const
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
static const utf8 * convert(const utf8 *str, utf32 &cp)
static bool isDigit(utf32 cp)
static bool isWordDelimiter(utf32 cp)
static bool isSurrogatePair(utf32 cp)
static const utf8 * nextWord(const utf8 *start, const utf8 *current)
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
static const utf8 * next(const utf8 *current)
GA_API const UT_StringHolder transform
static bool isAlnum(utf32 cp)
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
static const utf8 * prevWord(const utf8 *start, const utf8 *current)
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator begin() const
Returns a constant iterator pointing to the beginning of the string.
static bool isASCII(utf32 cp)
static bool isCJK(utf32 cp)
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
static utf32 replacementCodePoint()
UT_API UT_UnicodeCharacter * theUnicodeTable[256]
static utf32 toLower(utf32 cp)
PXR_NAMESPACE_OPEN_SCOPE typedef unsigned char uchar
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)