You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

146 lines
5.1 KiB

  1. // Copyright 2005-2024 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the 'License');
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an 'AS IS' BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // See www.openfst.org for extensive documentation on this weighted
  16. // finite-state transducer library.
  17. //
  18. // This library implements an unrestricted Thompson/Pike UTF-8 parser and
  19. // serializer. UTF-8 is a restricted subset of this byte stream encoding. For
  20. // a description of the encoding details, see:
  21. //
  22. // http://en.wikipedia.org/wiki/UTF-8
  23. #ifndef FST_ICU_H_
  24. #define FST_ICU_H_
  25. #include <cstdint>
  26. #include <sstream>
  27. #include <string>
  28. #include <vector>
  29. #include <fst/log.h>
  30. #include <string_view>
  31. namespace fst {
  // Appends every byte of `str` to `*labels`, one label per byte. Each byte is
  // read as an unsigned char (0-255) before being converted to Label, so a
  // Label type with at least 8 bits of precision holds every value; narrower
  // Label types truncate. Existing contents of `*labels` are kept. Always
  // returns `true`, so the signature matches the other string-to-label parsers.
  template <class Label>
  bool ByteStringToLabels(std::string_view str, std::vector<Label> *labels) {
    for (auto it = str.begin(); it != str.end(); ++it) {
      // Go through unsigned char first so bytes >= 0x80 are never negative.
      const unsigned char byte = *it;
      labels->push_back(byte);
    }
    return true;
  }
  40. // This function writes UTF-8 strings into a vector of Labels, truncating if
  41. // necessary. It is possible to use this sensibly with as little as 16 bits of
  42. // Label precision (i.e., when all characters are within the Basic Multilingual
  43. // Plane). With 21 bits, one can label all UTF-8 labelpoints, including those
  44. // from the various Astral Planes. Naturally, it is safe to use this with larger
  45. // Labels (e.g., 64 bits).
  46. template <class Label>
  47. bool UTF8StringToLabels(std::string_view str, std::vector<Label> *labels) {
  48. for (auto it = str.begin(); it != str.end();) {
  49. int c = *it & 0xff;
  50. ++it;
  51. if ((c & 0x80) == 0) {
  52. labels->push_back(c);
  53. } else {
  54. if ((c & 0xc0) == 0x80) {
  55. LOG(ERROR) << "UTF8StringToLabels: Continuation byte as lead byte";
  56. return false;
  57. }
  58. int count =
  59. (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) + (c >= 0xfc);
  60. int32_t label = c & ((1 << (6 - count)) - 1);
  61. while (count != 0) {
  62. if (it == str.end()) {
  63. LOG(ERROR) << "UTF8StringToLabels: Truncated UTF-8 byte sequence";
  64. return false;
  65. }
  66. char cb = *it;
  67. ++it;
  68. if ((cb & 0xc0) != 0x80) {
  69. LOG(ERROR) << "UTF8StringToLabels: Missing/invalid continuation byte";
  70. return false;
  71. }
  72. label = (label << 6) | (cb & 0x3f);
  73. --count;
  74. }
  75. if (label < 0) {
  76. // Should be unreachable.
  77. LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
  78. return false;
  79. }
  80. labels->push_back(label);
  81. }
  82. }
  83. return true;
  84. }
  // Writes `labels` to `*str` as a byte string, replacing its previous
  // contents. Each label is narrowed to a single `char`; labels that are zero
  // after narrowing are skipped, which on typical platforms also drops labels
  // whose low byte is zero. Returns `true` unless the internal stream entered a
  // failed state.
  template <class Label>
  bool LabelsToByteString(const std::vector<Label> &labels, std::string *str) {
    std::ostringstream buffer;
    for (auto it = labels.begin(); it != labels.end(); ++it) {
      // Narrow first; the zero test below applies to the narrowed byte.
      const char byte = *it;
      if (byte == 0) continue;
      buffer << byte;
    }
    *str = buffer.str();
    return !buffer.fail();
  }
  94. template <class Label>
  95. bool LabelsToUTF8String(const std::vector<Label> &labels, std::string *str) {
  96. std::ostringstream ostrm;
  97. for (const int32_t label : labels) {
  98. if (label < 0) {
  99. LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << label;
  100. return false;
  101. } else if (label == 0) {
  102. continue;
  103. } else if (label < 0x80) {
  104. ostrm << static_cast<char>(label);
  105. } else if (label < 0x800) {
  106. ostrm << static_cast<char>((label >> 6) | 0xc0);
  107. ostrm << static_cast<char>((label & 0x3f) | 0x80);
  108. } else if (label < 0x10000) {
  109. ostrm << static_cast<char>((label >> 12) | 0xe0);
  110. ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
  111. ostrm << static_cast<char>((label & 0x3f) | 0x80);
  112. } else if (label < 0x200000) {
  113. ostrm << static_cast<char>((label >> 18) | 0xf0);
  114. ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
  115. ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
  116. ostrm << static_cast<char>((label & 0x3f) | 0x80);
  117. } else if (label < 0x4000000) {
  118. ostrm << static_cast<char>((label >> 24) | 0xf8);
  119. ostrm << static_cast<char>(((label >> 18) & 0x3f) | 0x80);
  120. ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
  121. ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
  122. ostrm << static_cast<char>((label & 0x3f) | 0x80);
  123. } else {
  124. ostrm << static_cast<char>((label >> 30) | 0xfc);
  125. ostrm << static_cast<char>(((label >> 24) & 0x3f) | 0x80);
  126. ostrm << static_cast<char>(((label >> 18) & 0x3f) | 0x80);
  127. ostrm << static_cast<char>(((label >> 12) & 0x3f) | 0x80);
  128. ostrm << static_cast<char>(((label >> 6) & 0x3f) | 0x80);
  129. ostrm << static_cast<char>((label & 0x3f) | 0x80);
  130. }
  131. }
  132. *str = ostrm.str();
  133. return !!ostrm;
  134. }
  135. } // namespace fst
  136. #endif // FST_ICU_H_