You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

198 lines
5.8 KiB

  1. // Copyright (c) 2021 Mobvoi Inc (Binbin Zhang)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "wn_string.h"
  15. #include <sstream>
  16. #include <string>
  17. #include <vector>
  18. #include "wn_utils.h"
  19. namespace wenet {
  20. using namespace fst;
  21. void SplitString(const std::string& str, std::vector<std::string>* strs) {
  22. SplitStringToVector(Trim(str), " \t", true, strs);
  23. }
  24. void SplitStringToVector(const std::string& full, const char* delim,
  25. bool omit_empty_strings,
  26. std::vector<std::string>* out) {
  27. size_t start = 0, found = 0, end = full.size();
  28. out->clear();
  29. while (found != std::string::npos) {
  30. found = full.find_first_of(delim, start);
  31. // start != end condition is for when the delimiter is at the end
  32. if (!omit_empty_strings || (found != start && start != end))
  33. out->push_back(full.substr(start, found - start));
  34. start = found + 1;
  35. }
  36. }
  37. void SplitUTF8StringToChars(const std::string& str,
  38. std::vector<std::string>* chars) {
  39. chars->clear();
  40. int bytes = 1;
  41. for (size_t i = 0; i < str.length(); i += bytes) {
  42. assert((str[i] & 0xF8) <= 0xF0);
  43. if ((str[i] & 0x80) == 0x00) {
  44. // The first 128 characters (US-ASCII) in UTF-8 format only need one byte.
  45. bytes = 1;
  46. } else if ((str[i] & 0xE0) == 0xC0) {
  47. // The next 1,920 characters need two bytes to encode,
  48. // which covers the remainder of almost all Latin-script alphabets.
  49. bytes = 2;
  50. } else if ((str[i] & 0xF0) == 0xE0) {
  51. // Three bytes are needed for characters in the rest of
  52. // the Basic Multilingual Plane, which contains virtually all characters
  53. // in common use, including most Chinese, Japanese and Korean characters.
  54. bytes = 3;
  55. } else if ((str[i] & 0xF8) == 0xF0) {
  56. // Four bytes are needed for characters in the other planes of Unicode,
  57. // which include less common CJK characters, various historic scripts,
  58. // mathematical symbols, and emoji (pictographic symbols).
  59. bytes = 4;
  60. }
  61. chars->push_back(str.substr(i, bytes));
  62. }
  63. }
  64. int UTF8StringLength(const std::string& str) {
  65. int len = 0;
  66. int bytes = 1;
  67. for (size_t i = 0; i < str.length(); i += bytes) {
  68. if ((str[i] & 0x80) == 0x00) {
  69. bytes = 1;
  70. } else if ((str[i] & 0xE0) == 0xC0) {
  71. bytes = 2;
  72. } else if ((str[i] & 0xF0) == 0xE0) {
  73. bytes = 3;
  74. } else if ((str[i] & 0xF8) == 0xF0) {
  75. bytes = 4;
  76. }
  77. ++len;
  78. }
  79. return len;
  80. }
  81. bool CheckEnglishChar(const std::string& ch) {
  82. // all english characters should be encoded in one byte
  83. if (ch.size() != 1) return false;
  84. // english words may contain apostrophe, i.e., "He's"
  85. return isalpha(ch[0]) || ch[0] == '\'';
  86. }
  87. bool CheckEnglishWord(const std::string& word) {
  88. std::vector<std::string> chars;
  89. SplitUTF8StringToChars(word, &chars);
  90. for (size_t k = 0; k < chars.size(); k++) {
  91. if (!CheckEnglishChar(chars[k])) {
  92. return false;
  93. }
  94. }
  95. return true;
  96. }
  97. std::string JoinString(const std::string& c,
  98. const std::vector<std::string>& strs) {
  99. std::string result;
  100. if (strs.size() > 0) {
  101. for (int i = 0; i < strs.size() - 1; i++) {
  102. result += (strs[i] + c);
  103. }
  104. result += strs.back();
  105. }
  106. return result;
  107. }
  108. bool IsAlpha(const std::string& str) {
  109. for (size_t i = 0; i < str.size(); i++) {
  110. if (!isalpha(str[i])) {
  111. return false;
  112. }
  113. }
  114. return true;
  115. }
  116. std::string ProcessBlank(const std::string& str, bool lowercase) {
  117. std::string result;
  118. if (!str.empty()) {
  119. std::vector<std::string> chars;
  120. SplitUTF8StringToChars(Trim(str), &chars);
  121. for (std::string& ch : chars) {
  122. if (ch != kSpaceSymbol) {
  123. result.append(ch);
  124. } else {
  125. // Ignore consecutive space or located in head
  126. if (!result.empty() && result.back() != ' ') {
  127. result.push_back(' ');
  128. }
  129. }
  130. }
  131. // Ignore tailing space
  132. if (!result.empty() && result.back() == ' ') {
  133. result.pop_back();
  134. }
  135. // NOTE: convert string to wstring
  136. // see issue 745: https://github.com/wenet-e2e/wenet/issues/745
  137. try {
  138. std::locale loc("");
  139. std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
  140. std::wstring wsresult = converter.from_bytes(result);
  141. for (auto& c : wsresult) {
  142. c = lowercase ? tolower(c, loc) : c;
  143. }
  144. result = converter.to_bytes(wsresult);
  145. } catch (std::exception& e) {
  146. LOG(ERROR) << "convert wstring error " << e.what();
  147. }
  148. }
  149. return result;
  150. }
  151. std::string Ltrim(const std::string& str) {
  152. size_t start = str.find_first_not_of(WHITESPACE);
  153. return (start == std::string::npos) ? "" : str.substr(start);
  154. }
  155. std::string Rtrim(const std::string& str) {
  156. size_t end = str.find_last_not_of(WHITESPACE);
  157. return (end == std::string::npos) ? "" : str.substr(0, end + 1);
  158. }
  159. std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); }
  160. std::string JoinPath(const std::string& left, const std::string& right) {
  161. std::string path(left);
  162. if (path.size() && path.back() != '/') {
  163. path.push_back('/');
  164. }
  165. path.append(right);
  166. return path;
  167. }
  168. #ifdef _MSC_VER
  169. std::wstring ToWString(const std::string& str) {
  170. unsigned len = str.size() * 2;
  171. setlocale(LC_CTYPE, "");
  172. wchar_t* p = new wchar_t[len];
  173. mbstowcs(p, str.c_str(), len);
  174. std::wstring wstr(p);
  175. delete[] p;
  176. return wstr;
  177. }
  178. #endif
  179. } // namespace wenet