You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
3.9 KiB

  1. // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. #include "wetext_token_parser.h"
  15. #include "../utils/wetext_log.h"
  16. #include "../utils/wetext_string.h"
  17. namespace wetext {
  18. const char EOS[] = "<EOS>";
  19. const std::set<std::string> UTF8_WHITESPACE = {" ", "\t", "\n", "\r",
  20. "\x0b\x0c"};
  21. const std::set<std::string> ASCII_LETTERS = {
  22. "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
  23. "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
  24. "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
  25. "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};
  26. const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
  27. {"date", {"year", "month", "day"}},
  28. {"fraction", {"denominator", "numerator"}},
  29. {"measure", {"denominator", "numerator", "value"}},
  30. {"money", {"value", "currency"}},
  31. {"time", {"noon", "hour", "minute", "second"}}};
  32. const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
  33. {"date", {"year", "month", "day"}},
  34. {"fraction", {"sign", "numerator", "denominator"}},
  35. {"measure", {"numerator", "denominator", "value"}},
  36. {"money", {"currency", "value", "decimal"}},
  37. {"time", {"hour", "minute", "second", "noon"}}};
  38. TokenParser::TokenParser(ParseType type) {
  39. if (type == ParseType::kTN) {
  40. orders_ = TN_ORDERS;
  41. } else {
  42. orders_ = ITN_ORDERS;
  43. }
  44. }
  45. void TokenParser::Load(const std::string& input) {
  46. wetext::SplitUTF8StringToChars(input, &text_);
  47. CHECK_GT(text_.size(), 0);
  48. index_ = 0;
  49. ch_ = text_[0];
  50. }
  51. bool TokenParser::Read() {
  52. if (index_ < text_.size() - 1) {
  53. index_ += 1;
  54. ch_ = text_[index_];
  55. return true;
  56. }
  57. ch_ = EOS;
  58. return false;
  59. }
  60. bool TokenParser::ParseWs() {
  61. bool not_eos = ch_ != EOS;
  62. while (not_eos && ch_ == " ") {
  63. not_eos = Read();
  64. }
  65. return not_eos;
  66. }
  67. bool TokenParser::ParseChar(const std::string& exp) {
  68. if (ch_ == exp) {
  69. Read();
  70. return true;
  71. }
  72. return false;
  73. }
  74. bool TokenParser::ParseChars(const std::string& exp) {
  75. bool ok = false;
  76. std::vector<std::string> chars;
  77. wetext::SplitUTF8StringToChars(exp, &chars);
  78. for (const auto& x : chars) {
  79. ok |= ParseChar(x);
  80. }
  81. return ok;
  82. }
  83. std::string TokenParser::ParseKey() {
  84. CHECK_NE(ch_, EOS);
  85. CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);
  86. std::string key = "";
  87. while (ASCII_LETTERS.count(ch_) > 0) {
  88. key += ch_;
  89. Read();
  90. }
  91. return key;
  92. }
  93. std::string TokenParser::ParseValue() {
  94. CHECK_NE(ch_, EOS);
  95. bool escape = false;
  96. std::string value = "";
  97. while (ch_ != "\"") {
  98. value += ch_;
  99. escape = ch_ == "\\" && !escape;
  100. Read();
  101. if (escape) {
  102. value += ch_;
  103. Read();
  104. }
  105. }
  106. return value;
  107. }
  108. void TokenParser::Parse(const std::string& input) {
  109. Load(input);
  110. while (ParseWs()) {
  111. std::string name = ParseKey();
  112. ParseChars(" { ");
  113. Token token(name);
  114. while (ParseWs()) {
  115. if (ch_ == "}") {
  116. ParseChar("}");
  117. break;
  118. }
  119. std::string key = ParseKey();
  120. ParseChars(": \"");
  121. std::string value = ParseValue();
  122. ParseChar("\"");
  123. token.Append(key, value);
  124. }
  125. tokens_.emplace_back(token);
  126. }
  127. }
  128. std::string TokenParser::Reorder(const std::string& input) {
  129. Parse(input);
  130. std::string output = "";
  131. for (auto& token : tokens_) {
  132. output += token.String(orders_) + " ";
  133. }
  134. return Trim(output);
  135. }
  136. } // namespace wetext