You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

90 lines
2.9 KiB

  1. // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "wetext_string.h"
  15. #include "wetext_log.h"
  16. namespace wetext {
  17. using namespace fst;
// Set of characters treated as whitespace by Ltrim() and Rtrim(): space,
// newline, carriage return, horizontal tab, form feed and vertical tab.
const char* WHITESPACE = " \n\r\t\f\v";
  19. int UTF8CharLength(char ch) {
  20. int num_bytes = 1;
  21. CHECK_LE((ch & 0xF8), 0xF0);
  22. if ((ch & 0x80) == 0x00) {
  23. // The first 128 characters (US-ASCII) in UTF-8 format only need one byte.
  24. num_bytes = 1;
  25. } else if ((ch & 0xE0) == 0xC0) {
  26. // The next 1,920 characters need two bytes to encode,
  27. // which covers the remainder of almost all Latin-script alphabets.
  28. num_bytes = 2;
  29. } else if ((ch & 0xF0) == 0xE0) {
  30. // Three bytes are needed for characters in the rest of
  31. // the Basic Multilingual Plane, which contains virtually all characters
  32. // in common use, including most Chinese, Japanese and Korean characters.
  33. num_bytes = 3;
  34. } else if ((ch & 0xF8) == 0xF0) {
  35. // Four bytes are needed for characters in the other planes of Unicode,
  36. // which include less common CJK characters, various historic scripts,
  37. // mathematical symbols, and emoji (pictographic symbols).
  38. num_bytes = 4;
  39. }
  40. return num_bytes;
  41. }
  42. int UTF8StringLength(const std::string& str) {
  43. int len = 0;
  44. int num_bytes = 1;
  45. for (size_t i = 0; i < str.length(); i += num_bytes) {
  46. num_bytes = UTF8CharLength(str[i]);
  47. ++len;
  48. }
  49. return len;
  50. }
  51. void SplitUTF8StringToChars(const std::string& str,
  52. std::vector<std::string>* chars) {
  53. chars->clear();
  54. int num_bytes = 1;
  55. for (size_t i = 0; i < str.length(); i += num_bytes) {
  56. num_bytes = UTF8CharLength(str[i]);
  57. chars->push_back(str.substr(i, num_bytes));
  58. }
  59. }
  60. std::string Ltrim(const std::string& str) {
  61. size_t start = str.find_first_not_of(WHITESPACE);
  62. return (start == std::string::npos) ? "" : str.substr(start);
  63. }
  64. std::string Rtrim(const std::string& str) {
  65. size_t end = str.find_last_not_of(WHITESPACE);
  66. return end == std::string::npos ? "" : str.substr(0, end + 1);
  67. }
  68. std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); }
  69. void Split(const std::string& str, const std::string& delim,
  70. std::vector<std::string>* output) {
  71. std::string s = str;
  72. size_t pos = 0;
  73. while ((pos = s.find(delim)) != std::string::npos) {
  74. output->emplace_back(s.substr(0, pos));
  75. s.erase(0, pos + delim.length());
  76. }
  77. output->emplace_back(s);
  78. }
  79. } // namespace wetext