HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
unicodeUtils.h
Go to the documentation of this file.
1 //
2 // Copyright 2023 Pixar
3 //
4 // Licensed under the Apache License, Version 2.0 (the "Apache License")
5 // with the following modification; you may not use this file except in
6 // compliance with the Apache License and the following modification to it:
7 // Section 6. Trademarks. is deleted and replaced with:
8 //
9 // 6. Trademarks. This License does not grant permission to use the trade
10 // names, trademarks, service marks, or product names of the Licensor
11 // and its affiliates, except as required to comply with Section 4(c) of
12 // the License and to reproduce the content of the NOTICE file.
13 //
14 // You may obtain a copy of the Apache License at
15 //
16 // http://www.apache.org/licenses/LICENSE-2.0
17 //
18 // Unless required by applicable law or agreed to in writing, software
19 // distributed under the Apache License with the above modification is
20 // distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
21 // KIND, either express or implied. See the Apache License for the specific
22 // language governing permissions and limitations under the Apache License.
23 //
24 #ifndef PXR_BASE_TF_UNICODE_UTILS_H
25 #define PXR_BASE_TF_UNICODE_UTILS_H
26 
27 /// \file tf/unicodeUtils.h
28 /// \ingroup group_tf_String
29 /// Definitions of basic UTF-8 utilities in tf.
30 
31 #include "pxr/pxr.h"
32 #include "pxr/base/tf/api.h"
33 #include "pxr/base/tf/diagnostic.h"
34 
35 #include <ostream>
36 #include <string>
37 #include <string_view>
38 
40 
41 /// \class TfUtf8CodePoint
42 /// \ingroup group_tf_String
43 ///
44 /// Wrapper for a 32-bit code point value that can be encoded as UTF-8.
45 ///
46 /// \code{.cpp}
47 /// // Stream operator overload encodes each code point as UTF-8.
48 /// std::stringstream s;
49 /// s << TfUtf8CodePoint(8747) << " " << TfUtf8CodePoint(120);
50 /// \endcode
51 /// A single `TfUtf8CodePoint` may be converted to a string using
52 /// `TfStringify` as well.
54 public:
55  /// Code points that cannot be decoded or are outside of the valid range
56  /// will be replaced with this value.
57  static constexpr uint32_t ReplacementValue = 0xFFFD;
58 
59  /// Values higher than this will be replaced with the replacement
60  /// code point.
61  static constexpr uint32_t MaximumValue = 0x10FFFF;
62 
63  /// Values in this range (inclusive) cannot be constructed and will be
64  /// replaced by the replacement code point.
65  static constexpr std::pair<uint32_t, uint32_t>
66  SurrogateRange = {0xD800, 0xDFFF};
67 
68  /// Construct a code point initialized to the replacement value
69  constexpr TfUtf8CodePoint() = default;
70 
71  /// Construct a UTF-8 valued code point, constrained by the maximum value
72  /// and surrogate range.
73  constexpr explicit TfUtf8CodePoint(uint32_t value) :
74  _value(((value <= MaximumValue) &&
75  ((value < SurrogateRange.first) ||
76  (value > SurrogateRange.second))) ?
77  value : ReplacementValue) {}
78 
79  constexpr uint32_t AsUInt32() const { return _value; }
80 
81  friend constexpr bool operator==(const TfUtf8CodePoint left,
82  const TfUtf8CodePoint right) {
83  return left._value == right._value;
84  }
85  friend constexpr bool operator!=(const TfUtf8CodePoint left,
86  const TfUtf8CodePoint right) {
87  return left._value != right._value;
88  }
89 
90 private:
91  uint32_t _value{ReplacementValue};
92 };
93 
/// Stream insertion for a single code point. Per the `TfUtf8CodePoint`
/// class documentation, the stream operator encodes the code point as UTF-8.
/// Only the declaration lives in this header.
94 TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint);
95 
96 /// The replacement code point can be used to signal that a code point could
97 /// not be decoded and needed to be replaced.
98 constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{
100 
101 /// Constructs a TfUtf8CodePoint from an ASCII charcter (0-127).
103 {
104  return static_cast<unsigned char>(value) < 128 ?
105  TfUtf8CodePoint(static_cast<unsigned char>(value)) :
106  TfUtf8InvalidCodePoint;
107 }
108 
109 /// Defines an iterator over a UTF-8 encoded string that extracts unicode
110 /// code point values.
111 ///
112 /// UTF-8 is a variable length encoding, meaning that one Unicode
113 /// code point can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This
114 /// iterator takes care of consuming the valid UTF-8 bytes for a
115 /// code point while incrementing.
117 public:
118  using iterator_category = std::forward_iterator_tag;
120  using difference_type = std::ptrdiff_t;
121  using pointer = void;
123 
124  /// Model iteration ending when the underlying iterator's end condition
125  /// has been met.
126  class PastTheEndSentinel final {};
127 
128  /// Constructs an iterator that can read UTF-8 character sequences from
129  /// the given starting string_view iterator \a it. \a end is used as a
130  /// guard against reading byte sequences past the end of the source string.
131  ///
132  /// When working with views of substrings, \a end must not point to a
133  /// continuation byte in a valid UTF-8 byte sequence to avoid decoding
134  /// errors.
136  const std::string_view::const_iterator& it,
137  const std::string_view::const_iterator& end) : _it(it), _end(end) {
138  TF_DEV_AXIOM(_it <= _end);
139  }
140 
141  /// Retrieves the current UTF-8 character in the sequence as its Unicode
142  /// code point value. Returns `TfUtf8InvalidCodePoint` when the
143  /// byte sequence pointed to by the iterator cannot be decoded.
144  ///
145  /// A code point might be invalid because it's incorrectly encoded, exceeds
146  /// the maximum allowed value, or is in the disallowed surrogate range.
148  {
149  return TfUtf8CodePoint{_GetCodePoint()};
150  }
151 
152  /// Retrieves the wrapped string iterator.
153  std::string_view::const_iterator GetBase() const
154  {
155  return this->_it;
156  }
157 
158  /// Determines if two iterators are equal.
159  /// This intentionally does not consider the end iterator to allow for
160  /// comparison of iterators between different substring views of the
161  /// same underlying string.
162  bool operator== (const TfUtf8CodePointIterator& rhs) const
163  {
164  return (this->_it == rhs._it);
165  }
166 
167  /// Determines if two iterators are unequal.
168  /// This intentionally does not consider the end iterator to allow for
169  /// comparison of iterators between different substring views of the
170  /// same underlying string.
171  bool operator!= (const TfUtf8CodePointIterator& rhs) const
172  {
173  return (this->_it != rhs._it);
174  }
175 
176  /// Advances the iterator logically one UTF-8 character sequence in
177  /// the string. The underlying string iterator will be advanced
178  /// according to the variable length encoding of the next UTF-8
179  /// character, but will never consume non-continuation bytes after
180  /// the current one.
182  {
183  // The increment operator should never be called if it's past
184  // the end. The user is expected to have already checked this
185  // condition.
186  TF_DEV_AXIOM(!_IsPastTheEnd());
187  _EncodingLength increment = _GetEncodingLength();
188  // Note that in cases where the encoding is invalid, we move to the
189  // next byte. This is necessary because otherwise the iterator would
190  // never advance and the end condition of == iterator::end() would
191  // never be satisfied. This means that we increment, even if the
192  // encoding length is 0.
193  ++_it;
194  // Only continuation bytes will be consumed after the the first byte.
195  // This avoids consumption of ASCII characters or other starting bytes.
196  auto isContinuation = [](const char c) {
197  const auto uc = static_cast<unsigned char>(c);
198  return (uc >= static_cast<unsigned char>('\x80')) &&
199  (uc < static_cast<unsigned char>('\xc0'));
200  };
201  while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
202  ++_it;
203  --increment;
204  }
205  return *this;
206  }
207 
208  /// Advances the iterator logically one UTF-8 character sequence in
209  /// the string. The underlying string iterator will be advanced
210  /// according to the variable length encoding of the next UTF-8
211  /// character, but will never consume non-continuation bytes after
212  /// the current one.
214  {
215  auto temp = *this;
216  ++(*this);
217  return temp;
218  }
219 
220  /// Checks if the `lhs` iterator is at or past the end for the
221  /// underlying `string_view`
222  friend bool operator==(const TfUtf8CodePointIterator& lhs,
224  {
225  return lhs._IsPastTheEnd();
226  }
227 
229  const TfUtf8CodePointIterator& rhs)
230  {
231  return rhs == lhs;
232  }
233 
234  friend bool operator!=(const TfUtf8CodePointIterator& lhs,
235  PastTheEndSentinel rhs)
236  {
237  return !(lhs == rhs);
238  }
240  const TfUtf8CodePointIterator& rhs)
241  {
242  return !(lhs == rhs);
243  }
244 
245 private:
246  using _EncodingLength = unsigned char;
247 
248  // Retrieves the variable encoding length of the UTF-8 character
249  // currently pointed to by the iterator. This can be 1, 2, 3, or 4
250  // depending on the encoding of the UTF-8 character. If the encoding
251  // cannot be determined, this method will return 0.
252  _EncodingLength _GetEncodingLength() const
253  {
254  // already at the end, no valid character sequence
255  if (_IsPastTheEnd())
256  {
257  return 0;
258  }
259  // determine what encoding length the character is
260  // 1-byte characters have a leading 0 sequence
261  // 2-byte characters have a leading 110 sequence
262  // 3-byte characters have a leading 1110 sequence
263  // 4-byte characters have a leading 11110 sequence
264  unsigned char x = static_cast<unsigned char>(*_it);
265  if (x < 0x80)
266  {
267  return 1;
268  }
269  else if ((x >= 0xc0) && (x < 0xe0))
270  {
271  return 2;
272  }
273  else if ((x >= 0xe0) && (x < 0xf0))
274  {
275  return 3;
276  }
277  else if ((x >= 0xf0) && (x < 0xf8))
278  {
279  return 4;
280  }
281  else
282  {
283  // can't determine encoding, this is an error
284  return 0;
285  }
286  }
287 
288  // Decodes the UTF-8 byte sequence at the current position and returns
289  // the resulting Unicode code point as a 32-bit value. operator* wraps
290  // that value in a TfUtf8CodePoint, whose constructor substitutes the
291  // replacement value for anything above the maximum or in the surrogate
292  // range. NOTE(review): the value returned for an undecodable sequence is set by the out-of-line definition; confirm there.
293  TF_API uint32_t _GetCodePoint() const;
294 
295  // Returns true if the iterator at or past the end and can no longer be
296  // dereferenced.
297  bool _IsPastTheEnd() const
298  {
299  return _it >= _end;
300  }
301 
302  std::string_view::const_iterator _it;
303  std::string_view::const_iterator _end;
304 };
305 
306 /// \class TfUtf8CodePointView
307 /// \ingroup group_tf_String
308 ///
309 /// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over
310 /// as code points instead of bytes.
311 ///
312 /// Because of the variable length encoding, the `TfUtf8CodePointView` iterator is
313 /// a ForwardIterator and is read only.
314 ///
315 /// \code{.cpp}
316 /// std::string value{"∫dx"};
317 /// for (const auto codePoint : TfUtf8CodePointView{value}) {
318 /// if (codePoint == TfUtf8InvalidCodePoint) {
319 /// TF_WARN("String cannot be decoded.");
320 /// break;
321 /// }
322 /// }
323 /// \endcode
324 ///
325 /// The `TfUtf8CodePointView`'s sentinel `end()` is compatible with range
326 /// based for loops and the forthcoming STL ranges library; it avoids
327 /// triplicating the storage for the end iterator. `EndAsIterator()`
328 /// can be used for algorithms that require the begin and end iterators to be
329 /// of the same type but necessarily stores redundant copies of the endpoint.
330 ///
331 /// \code{.cpp}
332 /// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(),
333 /// [](const auto c) { return c == TfUtf8InvalidCodePoint; }))
334 /// {
335 /// TF_WARN("String cannot be decoded");
336 /// }
337 /// \endcode
338 class TfUtf8CodePointView final {
339 public:
341 
342  TfUtf8CodePointView() = default;
343  explicit TfUtf8CodePointView(const std::string_view& view) : _view(view) {}
344 
345  inline const_iterator begin() const
346  {
347  return const_iterator{std::cbegin(_view), std::cend(_view)};
348  }
349 
350  /// The sentinel will compare as equal to any iterator at the end
351  /// of the underlying `string_view`
353  {
355  }
356 
357  inline const_iterator cbegin() const
358  {
359  return begin();
360  }
361 
362  /// The sentinel will compare as equal to any iterator at the end
363  /// of the underlying `string_view`
365  {
366  return end();
367  }
368 
369  /// Returns true if the underlying view is empty
370  bool empty() const
371  {
372  return _view.empty();
373  }
374 
375  /// Returns an iterator of the same type as `begin` that identifies the end
376  /// of the string.
377  ///
378  /// As the end iterator is stored three times, this is slightly heavier
379  /// than using the `PastTheEndSentinel` and should be avoided in performance
380  /// critical code paths. It is provided for convenience when an algorithm
381  /// restricts the iterators to have the same type.
382  ///
383  /// As C++20 ranges exposes more sentinel friendly algorithms, this can
384  /// likely be deprecated in the future.
386  {
387  return const_iterator(std::cend(_view), std::cend(_view));
388  }
389 
390 private:
391  std::string_view _view;
392 };
393 
394 /// Determines whether the given Unicode \a codePoint is in the XID_Start
395 /// character class.
396 ///
397 /// The XID_Start class of characters are derived from the Unicode
398 /// General_Category of uppercase letters, lowercase letters, titlecase
399 /// letters, modifier letters, other letters, letter numbers, plus
400 /// Other_ID_Start, minus Pattern_Syntax and Pattern_White_Space code points.
401 /// That is, the character must have a category of Lu | Ll | Lt | Lm | Lo | Nl
402 ///
403 TF_API
404 bool TfIsUtf8CodePointXidStart(uint32_t codePoint);
405 
406 /// Determines whether the given Unicode \a codePoint is in the XID_Start
407 /// character class.
408 /// \overload
409 ///
410 inline bool TfIsUtf8CodePointXidStart(const TfUtf8CodePoint codePoint)
411 {
412  return TfIsUtf8CodePointXidStart(codePoint.AsUInt32());
413 }
414 
415 /// Determines whether the given Unicode \a codePoint is in the XID_Continue
416 /// character class.
417 ///
418 /// The XID_Continue class of characters include those in XID_Start plus
419 /// characters having the Unicode General Category of nonspacing marks,
420 /// spacing combining marks, decimal number, and connector punctuation.
421 /// That is, the character must have a category of
422 /// XID_Start | Nd | Mn | Mc | Pc
423 ///
424 TF_API
425 bool TfIsUtf8CodePointXidContinue(uint32_t codePoint);
426 
427 /// Determines whether the given Unicode \a codePoint is in the XID_Continue
428 /// character class.
429 /// \overload
430 ///
431 inline bool TfIsUtf8CodePointXidContinue(const TfUtf8CodePoint codePoint)
432 {
433  return TfIsUtf8CodePointXidContinue(codePoint.AsUInt32());
434 }
435 
437 
438 #endif // PXR_BASE_TF_UNICODE_UTILS_H
value_type operator*() const
Definition: unicodeUtils.h:147
GLint first
Definition: glcorearb.h:405
static constexpr uint32_t MaximumValue
Definition: unicodeUtils.h:61
#define TF_API
Definition: api.h:40
TfUtf8CodePointView(const std::string_view &view)
Definition: unicodeUtils.h:343
static constexpr uint32_t ReplacementValue
Definition: unicodeUtils.h:57
void
Definition: png.h:1083
GLint left
Definition: glcorearb.h:2005
const_iterator cbegin() const
Definition: unicodeUtils.h:357
TfUtf8CodePointView()=default
GLsizei const GLfloat * value
Definition: glcorearb.h:824
std::ptrdiff_t difference_type
Definition: unicodeUtils.h:120
GLdouble right
Definition: glad.h:2817
bool empty() const
Returns true if the underlying view is empty.
Definition: unicodeUtils.h:370
TfUtf8CodePointIterator const_iterator
Definition: unicodeUtils.h:340
TfUtf8CodePointIterator::PastTheEndSentinel cend() const
Definition: unicodeUtils.h:364
TfUtf8CodePointIterator::PastTheEndSentinel end() const
Definition: unicodeUtils.h:352
friend constexpr bool operator==(const TfUtf8CodePoint left, const TfUtf8CodePoint right)
Definition: unicodeUtils.h:81
basic_string_view< char > string_view
Definition: core.h:522
friend bool operator==(PastTheEndSentinel lhs, const TfUtf8CodePointIterator &rhs)
Definition: unicodeUtils.h:228
#define TF_DEV_AXIOM(cond)
constexpr TfUtf8CodePoint(uint32_t value)
Definition: unicodeUtils.h:73
friend bool operator==(const TfUtf8CodePointIterator &lhs, PastTheEndSentinel)
Definition: unicodeUtils.h:222
std::forward_iterator_tag iterator_category
Definition: unicodeUtils.h:118
const_iterator EndAsIterator() const
Definition: unicodeUtils.h:385
TF_API std::ostream & operator<<(std::ostream &, const TfUtf8CodePoint)
GLuint GLuint end
Definition: glcorearb.h:475
TF_API bool TfIsUtf8CodePointXidContinue(uint32_t codePoint)
std::string_view::const_iterator GetBase() const
Retrieves the wrapped string iterator.
Definition: unicodeUtils.h:153
constexpr TfUtf8CodePoint()=default
Construct a code point initialized to the replacement value.
TfUtf8CodePointIterator & operator++()
Definition: unicodeUtils.h:181
GLint GLenum GLint x
Definition: glcorearb.h:409
const_iterator begin() const
Definition: unicodeUtils.h:345
PXR_NAMESPACE_CLOSE_SCOPE PXR_NAMESPACE_OPEN_SCOPE
Definition: path.h:1432
friend bool operator!=(PastTheEndSentinel lhs, const TfUtf8CodePointIterator &rhs)
Definition: unicodeUtils.h:239
#define PXR_NAMESPACE_CLOSE_SCOPE
Definition: pxr.h:91
TfUtf8CodePointIterator(const std::string_view::const_iterator &it, const std::string_view::const_iterator &end)
Definition: unicodeUtils.h:135
static constexpr std::pair< uint32_t, uint32_t > SurrogateRange
Definition: unicodeUtils.h:66
Definition: core.h:982
Definition: core.h:1131
constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
Constructs a TfUtf8CodePoint from an ASCII charcter (0-127).
Definition: unicodeUtils.h:102
bool operator==(const TfUtf8CodePointIterator &rhs) const
Definition: unicodeUtils.h:162
friend bool operator!=(const TfUtf8CodePointIterator &lhs, PastTheEndSentinel rhs)
Definition: unicodeUtils.h:234
TF_API bool TfIsUtf8CodePointXidStart(uint32_t codePoint)
friend constexpr bool operator!=(const TfUtf8CodePoint left, const TfUtf8CodePoint right)
Definition: unicodeUtils.h:85
bool operator!=(const TfUtf8CodePointIterator &rhs) const
Definition: unicodeUtils.h:171
constexpr uint32_t AsUInt32() const
Definition: unicodeUtils.h:79