You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

81 lines
2.6 KiB

  1. // Copyright (c) 2021 Mobvoi Inc (Binbin Zhang)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #ifndef UTILS_STRING_H_
  15. #define UTILS_STRING_H_
  16. #include <codecvt>
  17. #include <locale>
  18. #include <memory>
  19. #include <string>
  20. #include <vector>
  21. #include "fst/symbol-table.h"
  22. namespace wenet {
  // Set of whitespace characters: space, newline, carriage return, tab,
  // form feed and vertical tab.
  // NOTE(review): presumably the character set stripped by Ltrim/Rtrim/Trim
  // declared below -- confirm against their definitions.
  23. const char WHITESPACE[] = " \n\r\t\f\v";
  24. // Split the string on space or tab characters.
  //   str:  the string to split.
  //   strs: output vector that receives the resulting pieces.
  // NOTE(review): whether `strs` is cleared first is not visible in this
  // header -- check the definition.
  25. void SplitString(const std::string& str, std::vector<std::string>* strs);
  // Split `full` into pieces and store them in `out`.
  //   full:               the string to split.
  //   delim:              C string of delimiters; presumably every character
  //                       in it acts as a separator -- confirm against the
  //                       definition.
  //   omit_empty_strings: presumably drops empty pieces from the result when
  //                       true -- confirm against the definition.
  //   out:                output vector that receives the pieces.
  26. void SplitStringToVector(const std::string& full, const char* delim,
  27. bool omit_empty_strings,
  28. std::vector<std::string>* out);
  29. // NOTE(Xingchen Song): this function was added to make it possible to
  30. // support multilingual recipes in the future, in which characters of
  31. // different languages are all encoded in UTF-8.
  32. // UTF-8 REF: https://en.wikipedia.org/wiki/UTF-8#Encoding
  33. // Split the UTF-8 string into individual characters.
  //   str:   UTF-8 encoded input string.
  //   chars: output vector; receives the characters of `str`, each stored as
  //          its own std::string.
  34. void SplitUTF8StringToChars(const std::string& str,
  35. std::vector<std::string>* chars);
  // NOTE(review): presumably returns the number of UTF-8 characters (not
  // bytes) in `str` -- confirm against the definition.
  36. int UTF8StringLength(const std::string& str);
  37. // Check whether the UTF-8 char is an alphabetic letter or an apostrophe (').
  //   ch: the character to test, passed as a string.
  38. bool CheckEnglishChar(const std::string& ch);
  39. // Check whether the UTF-8 word contains only alphabetic letters or
  // apostrophes (').
  //   word: the word to test.
  40. bool CheckEnglishWord(const std::string& word);
  // NOTE(review): presumably concatenates the elements of `strs`, inserting
  // `c` between consecutive elements, and returns the result -- confirm
  // against the definition.
  41. std::string JoinString(const std::string& c,
  42. const std::vector<std::string>& strs);
  // NOTE(review): presumably reports whether `str` consists of alphabetic
  // characters; the exact rule (ASCII only? empty string?) is not visible in
  // this header -- confirm against the definition.
  43. bool IsAlpha(const std::string& str);
  44. // Split the UTF-8 string into words using the symbol table.
  45. // Returns whether the string contains no OOV (out-of-vocabulary) items.
  //   str:          UTF-8 encoded input string.
  //   symbol_table: symbol table used for the split.
  //   words:        output vector that receives the resulting words.
  46. bool SplitUTF8StringToWords(
  47. const std::string& str,
  48. const std::shared_ptr<fst::SymbolTable>& symbol_table,
  49. std::vector<std::string>* words);
  50. // Replace ▁ with a space, then remove leading, trailing and consecutive spaces.
  //   str:       the string to process.
  //   lowercase: NOTE(review): presumably also lowercases the result when
  //              true -- confirm against the definition.
  51. std::string ProcessBlank(const std::string& str, bool lowercase);
  // NOTE(review): presumably returns `str` with leading whitespace removed
  // -- confirm against the definition.
  52. std::string Ltrim(const std::string& str);
  // NOTE(review): presumably returns `str` with trailing whitespace removed
  // -- confirm against the definition.
  53. std::string Rtrim(const std::string& str);
  // NOTE(review): presumably returns `str` with both leading and trailing
  // whitespace removed -- confirm against the definition.
  54. std::string Trim(const std::string& str);
  // NOTE(review): presumably joins the path components `left` and `right`
  // with a path separator and returns the result -- confirm against the
  // definition (separator choice is not visible in this header).
  55. std::string JoinPath(const std::string& left, const std::string& right);
  56. #ifdef _MSC_VER
  // Declared only when _MSC_VER is defined (MSVC builds).
  // NOTE(review): presumably converts a narrow (likely UTF-8) string to a
  // wide string -- confirm the source encoding against the definition.
  57. std::wstring ToWString(const std::string& str);
  58. #endif
  59. } // namespace wenet
  60. #endif // UTILS_STRING_H_