// xiaoke/libtorch-runtime


								// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)

								//

								// Licensed under the Apache License, Version 2.0 (the "License");

								// you may not use this file except in compliance with the License.

								// You may obtain a copy of the License at

								//

								//     http://www.apache.org/licenses/LICENSE-2.0

								//

								// Unless required by applicable law or agreed to in writing, software

								// distributed under the License is distributed on an "AS IS" BASIS,

								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								// See the License for the specific language governing permissions and

								// limitations under the License.


								#include "wetext_token_parser.h"


								#include "../utils/wetext_log.h"

								#include "../utils/wetext_string.h"


								namespace wetext {

								using namespace fst;

								// Sentinel stored in ch_ once the input is exhausted (see Read()). It is five
								// bytes long, so it cannot equal a single UTF-8 character (at most four bytes).
								const char EOS[] = "<EOS>";
								// Whitespace characters that must never start a key (checked in ParseKey()).
								// Every entry is exactly one character because entries are compared against
								// ch_, which holds one character at a time. Vertical tab and form feed are
								// therefore listed separately instead of as one two-character string, which
								// could never match.
								const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
								                                               "\r", "\x0b", "\x0c"};

								// Characters accepted by ParseKey() as part of a key: the ASCII letters a-z
								// and A-Z plus the underscore. Each entry is a one-character string so it can
								// be looked up directly with ch_.
								const std::set<std::string> ASCII_LETTERS = {

								    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",

								    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",

								    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",

								    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};

								// Field-order table selected by the TokenParser constructor for
								// ParseType::kTN. Maps a token name (e.g. "date") to a list of field keys.
								// The table is handed to Token::String() in Reorder(); presumably the list
								// gives the order in which that token's fields are emitted - confirm in
								// Token::String().
								const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {

								    {"date", {"year", "month", "day"}},

								    {"fraction", {"denominator", "numerator"}},

								    {"measure", {"denominator", "numerator", "value"}},

								    {"money", {"value", "currency"}},

								    {"time", {"noon", "hour", "minute", "second"}}};

								// Counterpart of TN_ORDERS that the TokenParser constructor selects for every
								// ParseType other than kTN. Same layout: token name -> list of field keys.
								const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {

								    {"date", {"year", "month", "day"}},

								    {"fraction", {"sign", "numerator", "denominator"}},

								    {"measure", {"numerator", "denominator", "value"}},

								    {"money", {"currency", "value", "decimal"}},

								    {"time", {"hour", "minute", "second", "noon"}}};


								// Creates a parser whose field-order table matches the requested direction:
								// TN_ORDERS for ParseType::kTN, ITN_ORDERS for any other value.
								TokenParser::TokenParser(ParseType type) {
								  orders_ = (type == ParseType::kTN) ? TN_ORDERS : ITN_ORDERS;
								}


								void TokenParser::Load(const std::string& input) {

								  wetext::SplitUTF8StringToChars(input, &text_);

								  CHECK_GT(text_.size(), 0);

								  index_ = 0;

								  ch_ = text_[0];

								}


								// Advances the cursor by one character.
								//
								// Returns true and updates ch_ when another character is available.
								// Otherwise ch_ is set to the EOS sentinel, the cursor is left unchanged and
								// false is returned; further calls keep returning false.
								bool TokenParser::Read() {
								  // Written as `index_ + 1 < size` instead of `index_ < size - 1`: size() is
								  // unsigned, so the latter wraps around when text_ is empty and would then
								  // index past the end of the buffer.
								  if (index_ + 1 < text_.size()) {
								    ++index_;
								    ch_ = text_[index_];
								    return true;
								  }
								  ch_ = EOS;
								  return false;
								}


								// Skips a run of space characters (only " ", no other whitespace).
								//
								// Returns false if the parser is at, or reaches, the end of the input and
								// true if it stops on a non-space character.
								bool TokenParser::ParseWs() {
								  if (ch_ == EOS) {
								    return false;
								  }
								  while (ch_ == " ") {
								    if (!Read()) {
								      return false;
								    }
								  }
								  return true;
								}


								// Consumes the current character if it equals `exp`.
								//
								// Returns true when the character matched (the cursor is then advanced with
								// Read()), false when it did not (the cursor is left untouched).
								bool TokenParser::ParseChar(const std::string& exp) {
								  if (ch_ != exp) {
								    return false;
								  }
								  Read();
								  return true;
								}


								// Offers each UTF-8 character of `exp`, in order, to ParseChar().
								//
								// A character that does not match the current input character is simply
								// skipped, so every character of `exp` is effectively optional.
								// Returns true if at least one of them matched.
								bool TokenParser::ParseChars(const std::string& exp) {
								  std::vector<std::string> expected;
								  wetext::SplitUTF8StringToChars(exp, &expected);

								  bool matched_any = false;
								  for (const std::string& c : expected) {
								    // ParseChar() has to run for every character, so no short-circuiting.
								    if (ParseChar(c)) {
								      matched_any = true;
								    }
								  }
								  return matched_any;
								}


								// Reads a key: the longest run of characters from ASCII_LETTERS starting at
								// the current position. The result is empty when the current character is
								// not in that set.
								//
								// CHECK-fails if the parser is at EOS or on a character listed in
								// UTF8_WHITESPACE.
								std::string TokenParser::ParseKey() {
								  CHECK_NE(ch_, EOS);
								  CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);

								  std::string collected;
								  while (ASCII_LETTERS.find(ch_) != ASCII_LETTERS.end()) {
								    collected.append(ch_);
								    Read();
								  }
								  return collected;
								}


								// Reads a quoted value up to, but not including, the closing double quote.
								//
								// A backslash escapes the character that follows it: both are copied to the
								// result verbatim and the escaped character is never treated as the closing
								// quote. The closing quote itself is left for the caller to consume.
								//
								// If the input ends before a closing quote is found, the characters read so
								// far are returned instead of looping forever on the EOS sentinel.
								//
								// CHECK-fails if the parser is already at EOS on entry.
								std::string TokenParser::ParseValue() {
								  CHECK_NE(ch_, EOS);

								  std::string value;
								  while (ch_ != "\"" && ch_ != EOS) {
								    // Decided per character: a flag carried across iterations would make a
								    // backslash that directly follows an escaped pair lose its meaning, so
								    // `\n\"` would end the value at the escaped quote.
								    const bool escape = ch_ == "\\";
								    value += ch_;
								    Read();
								    // Copy the escaped character too, unless the input ended right after the
								    // backslash (ch_ would then be the sentinel, not real input).
								    if (escape && ch_ != EOS) {
								      value += ch_;
								      Read();
								    }
								  }
								  return value;
								}


								// Parses `input` into tokens and appends them to tokens_.
								//
								// Each token is read as `name { key: "value" key: "value" ... }`. The
								// separators are matched with ParseChars()/ParseChar(), whose return values
								// are ignored, so missing separator characters are tolerated.
								void TokenParser::Parse(const std::string& input) {
								  Load(input);
								  while (ParseWs()) {
								    std::string token_name = ParseKey();
								    ParseChars(" { ");

								    Token token(token_name);
								    // Collect key/value pairs until the closing brace or the end of input.
								    while (ParseWs()) {
								      if (ParseChar("}")) {
								        break;
								      }
								      std::string field = ParseKey();
								      ParseChars(": \"");
								      std::string contents = ParseValue();
								      ParseChar("\"");
								      token.Append(field, contents);
								    }
								    tokens_.emplace_back(token);
								  }
								}


								// Parses `input`, renders every token in tokens_ with Token::String() using
								// the orders_ table, joins the results with single spaces and returns the
								// joined string after passing it through Trim().
								std::string TokenParser::Reorder(const std::string& input) {
								  Parse(input);

								  std::string joined;
								  for (auto& token : tokens_) {
								    joined += token.String(orders_);
								    joined += " ";
								  }
								  return Trim(joined);
								}


								}  // namespace wetext