// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_
|
|
#define PROCESSOR_WETEXT_TOKEN_PARSER_H_
|
|
|
|
#include <set>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
namespace wetext {
|
|
|
|
// Shared constants. These are declarations only (`extern`); the definitions
// and the actual values are not visible in this header.

// NOTE(review): presumably an end-of-input sentinel -- confirm at the
// definition.
extern const char EOS[];

// Sets of strings. NOTE(review): judging by the names, each element is
// presumably a single (possibly multi-byte) character -- confirm at the
// definition.
extern const std::set<std::string> UTF8_WHITESPACE;
extern const std::set<std::string> ASCII_LETTERS;

// Maps from a string key to an ordered list of strings. They have the same
// type as the `orders` argument of Token::String() and as
// TokenParser::orders_. NOTE(review): presumably TN_ORDERS is used for text
// normalization and ITN_ORDERS for inverse text normalization -- confirm.
extern const std::unordered_map<std::string, std::vector<std::string>>
    TN_ORDERS;
extern const std::unordered_map<std::string, std::vector<std::string>>
    ITN_ORDERS;
|
|
|
|
// A named bag of key/value fields that serializes to
//   name { key: "value" key: "value" }
struct Token {
  // Token name; emitted before the opening brace.
  std::string name;
  // Field keys in first-insertion order; used by String() when the caller
  // supplies no ordering for `name`.
  std::vector<std::string> order;
  // Field key -> most recently appended value.
  std::unordered_map<std::string, std::string> members;

  explicit Token(const std::string& name) : name(name) {}

  // Sets field `key` to `value`.
  //
  // A key that is already present keeps its original position and only has
  // its value replaced, so it is never serialized twice.
  void Append(const std::string& key, const std::string& value) {
    const auto existing = members.find(key);
    if (existing == members.end()) {
      order.emplace_back(key);
      members.emplace(key, value);
    } else {
      existing->second = value;
    }
  }

  // Serializes the token as `name { key: "value" ... }`.
  //
  // If `orders` has an entry for `name`, fields are emitted in that order;
  // otherwise they are emitted in insertion order. Keys that have no value in
  // `members` are skipped. Values are written verbatim (no escaping).
  //
  // The token itself is not modified, so the result depends only on the
  // token's fields and on the `orders` passed to this particular call.
  std::string String(
      const std::unordered_map<std::string, std::vector<std::string>>& orders)
      const {
    // Bind by reference: no copy of the chosen key list.
    const auto preferred = orders.find(name);
    const std::vector<std::string>& keys =
        preferred != orders.end() ? preferred->second : order;

    std::string output = name + " {";
    for (const auto& key : keys) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " " + key + ": \"" + member->second + "\"";
    }
    return output + " }";
  }
};
|
|
|
|
// Selects the processing direction; taken by TokenParser's constructor.
// Kept as an unscoped enum so existing uses of `kTN` / `kITN` keep compiling.
enum ParseType {
  kTN = 0x00,  // Text Normalization
  kITN = 0x01  // Inverse Text Normalization
};
|
|
|
|
class TokenParser {
|
|
public:
|
|
explicit TokenParser(ParseType type);
|
|
std::string Reorder(const std::string& input);
|
|
|
|
private:
|
|
void Load(const std::string& input);
|
|
bool Read();
|
|
bool ParseWs();
|
|
bool ParseChar(const std::string& exp);
|
|
bool ParseChars(const std::string& exp);
|
|
std::string ParseKey();
|
|
std::string ParseValue();
|
|
void Parse(const std::string& input);
|
|
|
|
int index_;
|
|
std::string ch_;
|
|
std::vector<std::string> text_;
|
|
std::vector<Token> tokens_;
|
|
std::unordered_map<std::string, std::vector<std::string>> orders_;
|
|
};
|
|
|
|
} // namespace wetext
|
|
|
|
#endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_
|