HDK
|
Helper functions for Unicode and the UTF-8 variable length encoding. More...
#include <UT_Unicode.h>
Classes | |
class | iterator |
class | transform |
Static Public Member Functions | |
static const utf8 * | convert (const utf8 *str, utf32 &cp) |
static int | convert (utf32 cp, utf8 *str, exint buflen) |
static const utf8 * | next (const utf8 *current) |
static utf8 * | next (utf8 *current) |
static const utf8 * | prev (const utf8 *start, const utf8 *current) |
static utf8 * | prev (const utf8 *start, utf8 *current) |
static const utf8 * | nextWord (const utf8 *start, const utf8 *current) |
static const utf8 * | prevWord (const utf8 *start, const utf8 *current) |
static bool | fixpos (const utf8 *start, const utf8 *&current) |
static bool | fixpos (const utf8 *start, utf8 *&current) |
static exint | count (const utf8 *start, const utf8 *end=0) |
Returns the number of code points this variable encoding represents. More... | |
static exint | length (const utf8 *start, const utf8 *end=0) |
static utf8 * | duplicate (const utf8 *start, const utf8 *end=0) |
static const utf8 * | find (utf32 cp, const utf8 *start, const utf8 *end=0) |
static const utf8 * | find (const utf8 *str, const utf8 *start, const utf8 *end=0) |
static const utf16 * | convert (const utf16 *str, utf32 &cp, bool big_endian=false) |
static int | convert (utf32 cp, utf16 *str, exint buflen) |
static utf32 | replacementCodePoint () |
static bool | isSurrogatePair (utf32 cp) |
static bool | isFromSupplementaryPlane (utf32 cp) |
static bool | isValidCodePoint (utf32 cp) |
static bool | isControlChar (utf32 cp) |
static bool | isASCII (utf32 cp) |
static bool | isLatin1 (utf32 cp) |
static bool | isSpace (utf32 cp, bool break_only=true) |
static bool | isDigit (utf32 cp) |
static bool | isAlpha (utf32 cp) |
static bool | isAlnum (utf32 cp) |
static bool | isPunct (utf32 cp) |
static bool | isUpper (utf32 cp) |
static bool | isLower (utf32 cp) |
static bool | isCJK (utf32 cp) |
static utf32 | toLower (utf32 cp) |
static utf32 | toUpper (utf32 cp) |
static bool | isWordDelimiter (utf32 cp) |
static bool | isUTF8 (utf8 octet) |
Helper functions for Unicode and the UTF-8 variable length encoding.
Definition at line 28 of file UT_Unicode.h.
Parses a code point from a UTF-8 encoding and returns it as a single code point value. Returns a pointer to the next encoding if the current one was successfully decoded. If the decoding fails, it returns NULL
and cp is set to zero.
Definition at line 81 of file UT_UnicodeImpl.h.
Converts a code point to its UTF-8 encoding. If no buffer is given, returns the number of characters needed to store the resulting encoded sequence. Does not write out a terminating zero but moves the pointer to where the next character after the sequence should be written.
0x10FFFF is the greatest code point value allowed by Unicode and
Definition at line 151 of file UT_UnicodeImpl.h.
|
inlinestatic |
Parses a code point from a UTF-16 encoding and returns it as a single code point value. Returns a pointer to the next encoding if the current one was successfully decoded. If the decoding fails, it returns NULL
and cp is set to zero. Set big_endian
to true if the incoming UTF-16 string is encoded as big endian (UTF-16BE).
Definition at line 214 of file UT_UnicodeImpl.h.
Converts a code point to its UTF-16LE encoding and writes it into the buffer given. If no buffer is given, or if the buffer size is too small, returns the number of bytes needed to store the resulting encoded sequence. buflen
should be given in bytes, not as a number of utf16 entries. Does not write out a terminating zero but moves the pointer to where the next character after the sequence should be written.
Definition at line 246 of file UT_UnicodeImpl.h.
Returns the number of code points this variable encoding represents.
Definition at line 526 of file UT_UnicodeImpl.h.
Duplicates the string using malloc. Use free() to free the resulting string. If a NULL pointer is passed, a NULL pointer is returned.
Definition at line 557 of file UT_UnicodeImpl.h.
Find a code point in a variable length string and return a pointer to it. An optional end point can be supplied, which delineates a search range. Otherwise the string is searched up to the terminating NUL.
Definition at line 581 of file UT_UnicodeImpl.h.
|
inlinestatic |
Find a UTF8 encoded string in another UTF8 encoded string and return a pointer to the start of the match. Returns NULL if the string was not found.
Definition at line 625 of file UT_UnicodeImpl.h.
Given a pointer inside of a string representing variable length encoding, moves the pointer so that it points to the beginning of the encoding, if not there already. Returns false
if it was unable to fix the position and true
if successful or the position was already valid.
Definition at line 517 of file UT_UnicodeImpl.h.
Definition at line 511 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 687 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 681 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 177 of file UT_Unicode.h.
|
inlinestatic |
Returns true if the character is from any of the Unicode CJK Unified Ideographs blocks.
Definition at line 713 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 171 of file UT_Unicode.h.
|
inlinestatic |
Definition at line 675 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 156 of file UT_Unicode.h.
|
inlinestatic |
Definition at line 182 of file UT_Unicode.h.
|
inlinestatic |
Definition at line 707 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 693 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 667 of file UT_UnicodeImpl.h.
|
inlinestatic |
Returns true
if the code point given is a surrogate pair. This is a valid UTF-16 character, since it is used to encode code points greater than 0xFFFF. It is not a valid UTF-32 code point, however.
Definition at line 151 of file UT_Unicode.h.
|
inlinestatic |
Definition at line 699 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 74 of file UT_UnicodeImpl.h.
|
inlinestatic |
Definition at line 162 of file UT_Unicode.h.
|
inlinestatic |
Definition at line 743 of file UT_UnicodeImpl.h.
Returns the number of octets for this variable encoding. One octet is the same as a byte for UTF-8 encodings.
Definition at line 542 of file UT_UnicodeImpl.h.
Given a current location in a buffer, moves to the next character. If the location is inside a UTF-8 multi-character encoding (i.e. not at the beginning of one), it moves to the start of the next encoded character after it. If the current location is already at the terminating NUL character the function does nothing and just returns the current pointer. If it is unable to move successfully to the next encoded character (e.g. it's already at the end of the string, or the encoding is garbage and no recovery is possible) the function returns NULL.
Definition at line 279 of file UT_UnicodeImpl.h.
Definition at line 273 of file UT_UnicodeImpl.h.
Given a location in a buffer, moves after the end of the word. This is done by grouping characters that are considered continuous. There are 4 types of groups:
_@
{}[]();,.
and \\n\\r
Definition at line 499 of file UT_UnicodeImpl.h.
Given a location in a buffer, moves to the previous character, unless already at the beginning of the string, as defined by 'start'. If the location is inside a UTF-8 multi-character encoding, it moves to the beginning of that encoding. If going back lands on an invalid character, it encounters bad encoding (e.g. too many continuation bytes), or it's already at the start, the function returns NULL.
Definition at line 342 of file UT_UnicodeImpl.h.
Definition at line 336 of file UT_UnicodeImpl.h.
Given a location in a buffer, moves to the beginning of the word. This is done by grouping characters that are considered continuous. There are 4 types of groups:
_@
{}[]();,.
and \\n\\r
Definition at line 505 of file UT_UnicodeImpl.h.
|
inlinestatic |
Returns the replacement character, which is returned by the convert functions, when they encounter an invalid, but recoverable, encoding.
Definition at line 143 of file UT_Unicode.h.
Definition at line 722 of file UT_UnicodeImpl.h.
Definition at line 733 of file UT_UnicodeImpl.h.