// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "wetext_token_parser.h" #include "../utils/wetext_log.h" #include "../utils/wetext_string.h" namespace wetext { using namespace fst; const char EOS[] = ""; const std::set UTF8_WHITESPACE = {" ", "\t", "\n", "\r", "\x0b\x0c"}; const std::set ASCII_LETTERS = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"}; const std::unordered_map> TN_ORDERS = { {"date", {"year", "month", "day"}}, {"fraction", {"denominator", "numerator"}}, {"measure", {"denominator", "numerator", "value"}}, {"money", {"value", "currency"}}, {"time", {"noon", "hour", "minute", "second"}}}; const std::unordered_map> ITN_ORDERS = { {"date", {"year", "month", "day"}}, {"fraction", {"sign", "numerator", "denominator"}}, {"measure", {"numerator", "denominator", "value"}}, {"money", {"currency", "value", "decimal"}}, {"time", {"hour", "minute", "second", "noon"}}}; TokenParser::TokenParser(ParseType type) { if (type == ParseType::kTN) { orders_ = TN_ORDERS; } else { orders_ = ITN_ORDERS; } } void TokenParser::Load(const std::string& input) { wetext::SplitUTF8StringToChars(input, &text_); CHECK_GT(text_.size(), 0); index_ = 0; ch_ = text_[0]; } bool TokenParser::Read() { if (index_ < text_.size() - 1) { index_ += 1; ch_ = text_[index_]; return true; } ch_ = EOS; return false; } bool TokenParser::ParseWs() { bool not_eos = ch_ != EOS; while (not_eos && ch_ == " ") { not_eos = Read(); } return not_eos; } bool TokenParser::ParseChar(const std::string& exp) { if (ch_ == exp) { Read(); return true; } return false; } bool TokenParser::ParseChars(const std::string& exp) { bool ok = false; std::vector chars; wetext::SplitUTF8StringToChars(exp, &chars); for (const auto& x : chars) { ok |= ParseChar(x); } return ok; } std::string TokenParser::ParseKey() { CHECK_NE(ch_, EOS); CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0); std::string key = ""; while (ASCII_LETTERS.count(ch_) > 0) { key += ch_; Read(); } return key; } std::string TokenParser::ParseValue() { CHECK_NE(ch_, EOS); bool escape = false; std::string value = ""; while (ch_ != "\"") { value += ch_; escape = ch_ == "\\" && !escape; Read(); if (escape) { value += ch_; Read(); } } return value; } void TokenParser::Parse(const std::string& input) { Load(input); while (ParseWs()) { std::string name = ParseKey(); ParseChars(" { "); Token token(name); while (ParseWs()) { if (ch_ == "}") { ParseChar("}"); break; } std::string key = ParseKey(); ParseChars(": \""); std::string value = ParseValue(); ParseChar("\""); token.Append(key, value); } tokens_.emplace_back(token); } } std::string TokenParser::Reorder(const std::string& input) { Parse(input); std::string output = ""; for (auto& token : tokens_) { output += token.String(orders_) + " "; } return Trim(output); } } // namespace wetext