You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

198 lines
5.8 KiB

  1. // Copyright (c) 2021 Mobvoi Inc (Binbin Zhang)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "wn_string.h"
  15. #include <sstream>
  16. #include <string>
  17. #include <vector>
  18. #include "wn_utils.h"
  19. namespace wenet {
  20. void SplitString(const std::string& str, std::vector<std::string>* strs) {
  21. SplitStringToVector(Trim(str), " \t", true, strs);
  22. }
  // Splits `full` at every character contained in `delim`.
  //
  // `out` is cleared first and then receives the fields in order. When
  // `omit_empty_strings` is true, zero-length fields (adjacent delimiters, a
  // leading or trailing delimiter, or an empty input) are skipped; otherwise
  // they are kept as empty strings.
  void SplitStringToVector(const std::string& full, const char* delim,
                           bool omit_empty_strings,
                           std::vector<std::string>* out) {
    out->clear();
    const size_t length = full.size();
    size_t begin = 0;
    for (;;) {
      const size_t hit = full.find_first_of(delim, begin);
      // A field is empty when the delimiter sits right at `begin`, or when
      // `begin` already points past the last character (trailing delimiter).
      const bool field_is_empty = (hit == begin) || (begin == length);
      if (!(omit_empty_strings && field_is_empty)) {
        // When `hit` is npos the count is oversized and substr() clamps it
        // to the remainder of the string.
        out->push_back(full.substr(begin, hit - begin));
      }
      if (hit == std::string::npos) {
        break;
      }
      begin = hit + 1;
    }
  }
  // Splits `str` into its UTF-8 encoded characters, one string per character.
  //
  // `chars` is cleared first. The width of each character is derived from its
  // lead byte. A byte that is not a valid lead byte (e.g. a stray continuation
  // byte) is emitted as a one-byte element of its own, so that it cannot
  // swallow the valid characters that follow it. A sequence truncated by the
  // end of the string is emitted with whatever bytes remain.
  void SplitUTF8StringToChars(const std::string& str,
                              std::vector<std::string>* chars) {
    chars->clear();
    size_t bytes = 1;
    for (size_t i = 0; i < str.length(); i += bytes) {
      const unsigned char lead = static_cast<unsigned char>(str[i]);
      // Lead bytes 0xF8..0xFF never occur in UTF-8.
      assert((lead & 0xF8) <= 0xF0);
      // Reset on every iteration: the width must never be inherited from the
      // previous character when the current byte matches no pattern below.
      bytes = 1;
      if ((lead & 0x80) == 0x00) {
        // The first 128 characters (US-ASCII) in UTF-8 format only need one
        // byte.
        bytes = 1;
      } else if ((lead & 0xE0) == 0xC0) {
        // The next 1,920 characters need two bytes to encode,
        // which covers the remainder of almost all Latin-script alphabets.
        bytes = 2;
      } else if ((lead & 0xF0) == 0xE0) {
        // Three bytes are needed for characters in the rest of
        // the Basic Multilingual Plane, which contains virtually all
        // characters in common use, including most Chinese, Japanese and
        // Korean characters.
        bytes = 3;
      } else if ((lead & 0xF8) == 0xF0) {
        // Four bytes are needed for characters in the other planes of Unicode,
        // which include less common CJK characters, various historic scripts,
        // mathematical symbols, and emoji (pictographic symbols).
        bytes = 4;
      }
      chars->push_back(str.substr(i, bytes));
    }
  }
  // Returns the number of UTF-8 encoded characters in `str`.
  //
  // The width of each character is derived from its lead byte. A byte that is
  // not a valid lead byte (e.g. a stray continuation byte) counts as one
  // character of width one instead of inheriting the width of the previous
  // character.
  int UTF8StringLength(const std::string& str) {
    int len = 0;
    size_t i = 0;
    while (i < str.length()) {
      const unsigned char lead = static_cast<unsigned char>(str[i]);
      // Width is recomputed from scratch for every character.
      size_t bytes = 1;
      if ((lead & 0xE0) == 0xC0) {
        bytes = 2;
      } else if ((lead & 0xF0) == 0xE0) {
        bytes = 3;
      } else if ((lead & 0xF8) == 0xF0) {
        bytes = 4;
      }
      i += bytes;
      ++len;
    }
    return len;
  }
  // Returns true when `ch` is a single byte that is either alphabetic
  // (according to isalpha) or an apostrophe.
  bool CheckEnglishChar(const std::string& ch) {
    // all english characters should be encoded in one byte
    if (ch.size() != 1) return false;
    // isalpha() has undefined behaviour for negative arguments other than
    // EOF, so bytes >= 0x80 must be passed as unsigned char.
    const unsigned char byte = static_cast<unsigned char>(ch[0]);
    // english words may contain apostrophe, i.e., "He's"
    return isalpha(byte) != 0 || byte == '\'';
  }
  86. bool CheckEnglishWord(const std::string& word) {
  87. std::vector<std::string> chars;
  88. SplitUTF8StringToChars(word, &chars);
  89. for (size_t k = 0; k < chars.size(); k++) {
  90. if (!CheckEnglishChar(chars[k])) {
  91. return false;
  92. }
  93. }
  94. return true;
  95. }
  // Concatenates the elements of `strs`, inserting the separator `c` between
  // consecutive elements. Returns an empty string when `strs` is empty.
  std::string JoinString(const std::string& c,
                         const std::vector<std::string>& strs) {
    std::string result;
    // size_t index: avoids the signed/unsigned comparison against size().
    for (size_t i = 0; i < strs.size(); ++i) {
      if (i > 0) {
        result += c;
      }
      // Append in place instead of building a temporary `strs[i] + c`.
      result += strs[i];
    }
    return result;
  }
  // Returns true when every byte of `str` is alphabetic according to
  // isalpha(). An empty string yields true.
  bool IsAlpha(const std::string& str) {
    for (const char raw : str) {
      // isalpha() has undefined behaviour for negative arguments other than
      // EOF, so bytes >= 0x80 must be passed as unsigned char.
      if (!isalpha(static_cast<unsigned char>(raw))) {
        return false;
      }
    }
    return true;
  }
  115. std::string ProcessBlank(const std::string& str, bool lowercase) {
  116. std::string result;
  117. if (!str.empty()) {
  118. std::vector<std::string> chars;
  119. SplitUTF8StringToChars(Trim(str), &chars);
  120. for (std::string& ch : chars) {
  121. if (ch != kSpaceSymbol) {
  122. result.append(ch);
  123. } else {
  124. // Ignore consecutive space or located in head
  125. if (!result.empty() && result.back() != ' ') {
  126. result.push_back(' ');
  127. }
  128. }
  129. }
  130. // Ignore tailing space
  131. if (!result.empty() && result.back() == ' ') {
  132. result.pop_back();
  133. }
  134. // NOTE: convert string to wstring
  135. // see issue 745: https://github.com/wenet-e2e/wenet/issues/745
  136. try {
  137. std::locale loc("");
  138. std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
  139. std::wstring wsresult = converter.from_bytes(result);
  140. for (auto& c : wsresult) {
  141. c = lowercase ? tolower(c, loc) : c;
  142. }
  143. result = converter.to_bytes(wsresult);
  144. } catch (std::exception& e) {
  145. LOG(ERROR) << "convert wstring error " << e.what();
  146. }
  147. }
  148. return result;
  149. }
  150. std::string Ltrim(const std::string& str) {
  151. size_t start = str.find_first_not_of(WHITESPACE);
  152. return (start == std::string::npos) ? "" : str.substr(start);
  153. }
  154. std::string Rtrim(const std::string& str) {
  155. size_t end = str.find_last_not_of(WHITESPACE);
  156. return (end == std::string::npos) ? "" : str.substr(0, end + 1);
  157. }
  158. std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); }
  // Joins two path fragments with a '/' separator.
  // The separator is added only when `left` is non-empty and does not already
  // end with '/'; `right` is appended unchanged.
  std::string JoinPath(const std::string& left, const std::string& right) {
    std::string joined = left;
    const bool needs_separator = !joined.empty() && joined.back() != '/';
    if (needs_separator) {
      joined += '/';
    }
    joined += right;
    return joined;
  }
  167. #ifdef _MSC_VER
  // Converts the multibyte string `str` to a wide string using the
  // user-preferred LC_CTYPE locale (which this function installs via
  // setlocale, as a process-wide side effect).
  //
  // Returns an empty wide string when `str` is empty or when it contains a
  // byte sequence that is invalid in that locale.
  std::wstring ToWString(const std::string& str) {
    setlocale(LC_CTYPE, "");
    // A multibyte string never yields more wide characters than it has
    // bytes; the extra slot holds the terminating null. The vector releases
    // the buffer on every return path.
    std::vector<wchar_t> buffer(str.size() + 1, L'\0');
    const size_t converted =
        mbstowcs(buffer.data(), str.c_str(), buffer.size());
    if (converted == static_cast<size_t>(-1)) {
      // Invalid multibyte sequence: the buffer contents are unspecified.
      return std::wstring();
    }
    return std::wstring(buffer.data(), converted);
  }
  177. #endif
  178. } // namespace wenet