You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

154 lines
4.0 KiB

  1. // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "wetext_token_parser.h"
  15. #include "../utils/wetext_log.h"
  16. #include "../utils/wetext_string.h"
  17. namespace wetext {
  18. using namespace fst;
  // Sentinel stored in ch_ by Read() once the input has been exhausted.
  const char EOS[] = "<EOS>";

  // Single-character strings that ParseKey() refuses to start a key on.
  // Vertical tab and form feed are separate entries: the cursor holds exactly
  // one character at a time, so a combined "\x0b\x0c" entry could never match.
  const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
                                                 "\r", "\x0b", "\x0c"};

  // Characters ParseKey() accepts as part of a key: ASCII letters plus '_'.
  const std::set<std::string> ASCII_LETTERS = {
      "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
      "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
      "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
      "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};

  // Table installed into orders_ when the parser is built with ParseType::kTN.
  // Maps a token name to an ordered list of member keys; presumably the order
  // in which Token::String() emits them (confirm against Token).
  const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
      {"date", {"year", "month", "day"}},
      {"fraction", {"denominator", "numerator"}},
      {"measure", {"denominator", "numerator", "value"}},
      {"money", {"value", "currency"}},
      {"time", {"noon", "hour", "minute", "second"}}};

  // Table installed into orders_ for every other ParseType value; same shape
  // as TN_ORDERS.
  const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
      {"date", {"year", "month", "day"}},
      {"fraction", {"sign", "numerator", "denominator"}},
      {"measure", {"numerator", "denominator", "value"}},
      {"money", {"currency", "value", "decimal"}},
      {"time", {"hour", "minute", "second", "noon"}}};
  39. TokenParser::TokenParser(ParseType type) {
  40. if (type == ParseType::kTN) {
  41. orders_ = TN_ORDERS;
  42. } else {
  43. orders_ = ITN_ORDERS;
  44. }
  45. }
  46. void TokenParser::Load(const std::string& input) {
  47. wetext::SplitUTF8StringToChars(input, &text_);
  48. CHECK_GT(text_.size(), 0);
  49. index_ = 0;
  50. ch_ = text_[0];
  51. }
  52. bool TokenParser::Read() {
  53. if (index_ < text_.size() - 1) {
  54. index_ += 1;
  55. ch_ = text_[index_];
  56. return true;
  57. }
  58. ch_ = EOS;
  59. return false;
  60. }
  61. bool TokenParser::ParseWs() {
  62. bool not_eos = ch_ != EOS;
  63. while (not_eos && ch_ == " ") {
  64. not_eos = Read();
  65. }
  66. return not_eos;
  67. }
  68. bool TokenParser::ParseChar(const std::string& exp) {
  69. if (ch_ == exp) {
  70. Read();
  71. return true;
  72. }
  73. return false;
  74. }
  75. bool TokenParser::ParseChars(const std::string& exp) {
  76. bool ok = false;
  77. std::vector<std::string> chars;
  78. wetext::SplitUTF8StringToChars(exp, &chars);
  79. for (const auto& x : chars) {
  80. ok |= ParseChar(x);
  81. }
  82. return ok;
  83. }
  84. std::string TokenParser::ParseKey() {
  85. CHECK_NE(ch_, EOS);
  86. CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);
  87. std::string key = "";
  88. while (ASCII_LETTERS.count(ch_) > 0) {
  89. key += ch_;
  90. Read();
  91. }
  92. return key;
  93. }
  94. std::string TokenParser::ParseValue() {
  95. CHECK_NE(ch_, EOS);
  96. bool escape = false;
  97. std::string value = "";
  98. while (ch_ != "\"") {
  99. value += ch_;
  100. escape = ch_ == "\\" && !escape;
  101. Read();
  102. if (escape) {
  103. value += ch_;
  104. Read();
  105. }
  106. }
  107. return value;
  108. }
  109. void TokenParser::Parse(const std::string& input) {
  110. Load(input);
  111. while (ParseWs()) {
  112. std::string name = ParseKey();
  113. ParseChars(" { ");
  114. Token token(name);
  115. while (ParseWs()) {
  116. if (ch_ == "}") {
  117. ParseChar("}");
  118. break;
  119. }
  120. std::string key = ParseKey();
  121. ParseChars(": \"");
  122. std::string value = ParseValue();
  123. ParseChar("\"");
  124. token.Append(key, value);
  125. }
  126. tokens_.emplace_back(token);
  127. }
  128. }
  129. std::string TokenParser::Reorder(const std::string& input) {
  130. Parse(input);
  131. std::string output = "";
  132. for (auto& token : tokens_) {
  133. output += token.String(orders_) + " ";
  134. }
  135. return Trim(output);
  136. }
  137. } // namespace wetext