You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
2.5 KiB

  1. // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_
  15. #define PROCESSOR_WETEXT_TOKEN_PARSER_H_
  16. #include <set>
  17. #include <string>
  18. #include <unordered_map>
  19. #include <vector>
  20. namespace wetext {
  // Shared constants. They are only declared in this header (extern); their
  // values are defined elsewhere.
  // NOTE(review): EOS presumably is an end-of-input sentinel -- confirm
  // against the definition.
  21. extern const char EOS[];
  // Sets of strings; judging by the names, each element is presumably a
  // single character stored as a string -- confirm against the definition.
  22. extern const std::set<std::string> UTF8_WHITESPACE;
  23. extern const std::set<std::string> ASCII_LETTERS;
  // Tables mapping a string key to an ordered list of strings. They have the
  // same type as the `orders` argument of Token::String, which looks entries
  // up by token name. Presumably TN_ORDERS is used for kTN and ITN_ORDERS
  // for kITN -- TODO confirm in the implementation file.
  24. extern const std::unordered_map<std::string, std::vector<std::string>>
  25. TN_ORDERS;
  26. extern const std::unordered_map<std::string, std::vector<std::string>>
  27. ITN_ORDERS;
  // A named token holding key/value members together with the order in which
  // the keys were appended.
  struct Token {
    std::string name;
    // Keys in the order they were passed to Append().
    std::vector<std::string> order;
    // Most recent value stored for each key.
    std::unordered_map<std::string, std::string> members;

    explicit Token(const std::string& name) : name(name) {}

    // Appends `key` to `order` and stores `value` under it. Appending a key
    // that already exists overwrites its value and lists the key again.
    void Append(const std::string& key, const std::string& value) {
      order.emplace_back(key);
      members[key] = value;
    }

    // Serializes the token as `name { key: "value" ... }`.
    //
    // If `orders` has an entry for this token's name, that entry decides the
    // key order; otherwise the append order is used. Keys that have no stored
    // member are skipped.
    //
    // The token itself is not modified: the chosen order is only read through
    // a reference, so the recorded append order survives the call and the
    // method can be used on const tokens.
    std::string String(
        const std::unordered_map<std::string, std::vector<std::string>>& orders)
        const {
      const auto override_it = orders.find(name);
      const std::vector<std::string>& keys =
          override_it != orders.end() ? override_it->second : order;

      std::string output = name + " {";
      for (const auto& key : keys) {
        const auto member_it = members.find(key);
        if (member_it == members.end()) {
          continue;
        }
        output += " " + key + ": \"" + member_it->second + "\"";
      }
      return output + " }";
    }
  };
  // Processing mode accepted by TokenParser's constructor.
  enum ParseType {
    // Text Normalization
    kTN = 0,
    // Inverse Text Normalization
    kITN = 1
  };
  56. class TokenParser {
  57. public:
  58. explicit TokenParser(ParseType type);
  59. std::string Reorder(const std::string& input);
  60. private:
  61. void Load(const std::string& input);
  62. bool Read();
  63. bool ParseWs();
  64. bool ParseChar(const std::string& exp);
  65. bool ParseChars(const std::string& exp);
  66. std::string ParseKey();
  67. std::string ParseValue();
  68. void Parse(const std::string& input);
  69. int index_;
  70. std::string ch_;
  71. std::vector<std::string> text_;
  72. std::vector<Token> tokens_;
  73. std::unordered_map<std::string, std::vector<std::string>> orders_;
  74. };
  75. } // namespace wetext
  76. #endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_