|
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "wetext_token_parser.h"
#include "../utils/wetext_log.h"
#include "../utils/wetext_string.h"
namespace wetext {

using namespace fst;

// Sentinel assigned to the parser's current character once the input has been
// fully consumed (see TokenParser::Read).
const char EOS[] = "<EOS>";

// ASCII whitespace, one entry per character. The parser compares these against
// a single character at a time, so every entry must be exactly one character:
// vertical tab and form feed are listed separately rather than as one
// two-byte string, which could never match.
const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
                                               "\r", "\x0b", "\x0c"};

// Characters accepted as part of a key by TokenParser::ParseKey: the ASCII
// letters plus underscore.
const std::set<std::string> ASCII_LETTERS = {
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};

// Token name -> list of field names, used when the parser is constructed with
// ParseType::kTN. The table is handed to Token::String(); presumably it fixes
// the order in which a token's fields are emitted (confirm against Token).
const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"denominator", "numerator"}},
    {"measure", {"denominator", "numerator", "value"}},
    {"money", {"value", "currency"}},
    {"time", {"noon", "hour", "minute", "second"}}};

// Same role as TN_ORDERS, used for every ParseType other than kTN.
const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"sign", "numerator", "denominator"}},
    {"measure", {"numerator", "denominator", "value"}},
    {"money", {"currency", "value", "decimal"}},
    {"time", {"hour", "minute", "second", "noon"}}};

// Builds a parser and copies in the field-order table matching `type`:
// TN_ORDERS for ParseType::kTN, ITN_ORDERS for anything else.
TokenParser::TokenParser(ParseType type) {
  const bool is_tn = (type == ParseType::kTN);
  orders_ = is_tn ? TN_ORDERS : ITN_ORDERS;
}

// Splits `input` into UTF-8 characters and positions the cursor on the first
// one. Aborts via CHECK if the split yields no characters.
void TokenParser::Load(const std::string& input) {
  wetext::SplitUTF8StringToChars(input, &text_);
  CHECK_GT(text_.size(), 0);

  // Start at the beginning of the freshly split text.
  ch_ = text_.front();
  index_ = 0;
}

// Advances the cursor by one character.
//
// Returns true and updates the current character when another character is
// available. Otherwise sets the current character to the EOS sentinel and
// returns false; the cursor index is left unchanged in that case.
bool TokenParser::Read() {
  // Written as `index_ + 1 < size()` rather than `index_ < size() - 1`:
  // size() is unsigned, so `size() - 1` wraps to a huge value when text_ is
  // empty and would let the cursor run past the end of the buffer.
  if (index_ + 1 < text_.size()) {
    ++index_;
    ch_ = text_[index_];
    return true;
  }
  ch_ = EOS;
  return false;
}

// Skips a run of space characters (only " ", no other whitespace).
//
// Returns false if the cursor is already at EOS or reaches EOS while
// skipping; true when it stops on a non-space character.
bool TokenParser::ParseWs() {
  if (ch_ == EOS) {
    return false;
  }
  while (ch_ == " ") {
    if (!Read()) {
      return false;
    }
  }
  return true;
}

// Consumes the current character if it equals `exp`.
// Returns whether a character was consumed.
bool TokenParser::ParseChar(const std::string& exp) {
  if (ch_ != exp) {
    return false;
  }
  Read();
  return true;
}

// Tries to consume each UTF-8 character of `exp` in order. Each character is
// optional: a mismatch is skipped rather than stopping the scan.
// Returns true if at least one character was consumed.
bool TokenParser::ParseChars(const std::string& exp) {
  std::vector<std::string> expected;
  wetext::SplitUTF8StringToChars(exp, &expected);

  bool matched_any = false;
  for (const auto& piece : expected) {
    if (ParseChar(piece)) {
      matched_any = true;
    }
  }
  return matched_any;
}

// Reads the longest run of ASCII_LETTERS characters starting at the cursor
// and returns it (possibly empty).
//
// Aborts via CHECK if the cursor is at EOS or on a character listed in
// UTF8_WHITESPACE.
std::string TokenParser::ParseKey() {
  CHECK_NE(ch_, EOS);
  CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);

  std::string key;
  // At end of input Read() leaves the EOS sentinel in ch_, which is not in
  // ASCII_LETTERS, so the loop terminates there as well.
  for (; ASCII_LETTERS.find(ch_) != ASCII_LETTERS.end(); Read()) {
    key += ch_;
  }
  return key;
}

// Reads characters up to, but not including, the next unescaped double quote
// and returns them verbatim (backslashes are kept in the result).
//
// A backslash escapes the character that follows it, so `\"` does not end the
// value. Reading also stops at end of input, so an unterminated value returns
// what was collected instead of looping forever.
//
// Aborts via CHECK if the cursor is already at EOS on entry.
std::string TokenParser::ParseValue() {
  CHECK_NE(ch_, EOS);

  std::string value;
  while (ch_ != "\"" && ch_ != EOS) {
    // Decide per character whether it opens an escape. Keeping this state
    // local to one iteration lets consecutive escapes (e.g. `\"\"`) each be
    // honoured, instead of the second being mistaken for plain text.
    const bool is_escape = (ch_ == "\\");
    value += ch_;
    Read();
    // Copy the escaped character through untouched, unless the backslash was
    // the last character of the input.
    if (is_escape && ch_ != EOS) {
      value += ch_;
      Read();
    }
  }
  return value;
}

// Parses `input` into tokens_, replacing whatever a previous call stored.
//
// The expected shape is a space-separated sequence of
//   name { key: "value" key: "value" ... }
// Each piece of punctuation is matched optionally (see ParseChars), so
// malformed input is not rejected here.
void TokenParser::Parse(const std::string& input) {
  // Drop tokens from any earlier call so results do not accumulate when the
  // same parser object is reused.
  tokens_.clear();
  Load(input);

  while (ParseWs()) {
    std::string name = ParseKey();
    ParseChars(" { ");

    Token token(name);
    while (ParseWs()) {
      // A closing brace ends the current token; ParseChar consumes it.
      if (ParseChar("}")) {
        break;
      }
      std::string key = ParseKey();
      ParseChars(": \"");
      std::string value = ParseValue();
      ParseChar("\"");
      token.Append(key, value);
    }
    tokens_.emplace_back(token);
  }
}
std::string TokenParser::Reorder(const std::string& input) { Parse(input); std::string output = ""; for (auto& token : tokens_) { output += token.String(orders_) + " "; } return Trim(output); }
} // namespace wetext
|