You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

91 lines
2.5 KiB

// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_
#define PROCESSOR_WETEXT_TOKEN_PARSER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
namespace wetext {
// Shared constants used by the token parser. They are only declared here; the
// definitions (and therefore the actual contents) live in another translation
// unit.

// NOTE(review): presumably the sentinel marking the end of the input while
// scanning -- confirm against the definition.
extern const char EOS[];
// Set of strings, each holding one character treated as whitespace.
// NOTE(review): elements are presumably single UTF-8 encoded characters, hence
// std::string rather than char -- confirm.
extern const std::set<std::string> UTF8_WHITESPACE;
// Set of strings, each holding one ASCII letter.
// NOTE(review): exact membership (case, extra symbols) is not visible here.
extern const std::set<std::string> ASCII_LETTERS;
// Maps a token name to the list of member keys in the order they should be
// emitted. NOTE(review): presumably the table used for kTN -- confirm.
extern const std::unordered_map<std::string, std::vector<std::string>>
TN_ORDERS;
// Same shape as TN_ORDERS. NOTE(review): presumably the table used for kITN --
// confirm.
extern const std::unordered_map<std::string, std::vector<std::string>>
ITN_ORDERS;
// A named token holding key/value members together with the order in which
// the keys were appended.
struct Token {
  // Token name; printed first by String() and used to look up a custom key
  // order.
  std::string name;
  // Keys in the order they were passed to Append(). A key appended more than
  // once appears more than once.
  std::vector<std::string> order;
  // Latest value stored for each key.
  std::unordered_map<std::string, std::string> members;

  explicit Token(const std::string& name) : name(name) {}

  // Records `key` at the end of the append order and stores `value` for it,
  // replacing any value previously stored under the same key.
  void Append(const std::string& key, const std::string& value) {
    order.emplace_back(key);
    members[key] = value;
  }

  // Serializes the token as `name { key: "value" ... }`.
  //
  // If `orders` has an entry for this token's name, members are emitted in
  // that entry's key order; otherwise they are emitted in append order. Keys
  // that have no stored member are skipped.
  //
  // The token itself is left untouched: the key list is only selected by
  // reference, so the append order survives the call and later calls with a
  // different `orders` table still see it.
  std::string String(
      const std::unordered_map<std::string, std::vector<std::string>>& orders)
      const {
    const auto custom = orders.find(name);
    const std::vector<std::string>& keys =
        custom != orders.end() ? custom->second : order;

    std::string output = name + " {";
    for (const auto& key : keys) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " " + key + ": \"" + member->second + "\"";
    }
    return output + " }";
  }
};
// Selects which processing direction a TokenParser is built for.
enum ParseType {
  kTN = 0,   // Text Normalization
  kITN = 1,  // Inverse Text Normalization
};
// Parser whose public entry point, Reorder(), takes a string and returns a
// string. Only the declaration is visible in this header; every member
// function is defined elsewhere, so the notes below that go beyond the
// signatures are assumptions to be confirmed against the implementation.
class TokenParser {
public:
// Builds a parser for the given mode (kTN or kITN).
// NOTE(review): presumably fills orders_ from TN_ORDERS or ITN_ORDERS
// depending on `type` -- confirm.
explicit TokenParser(ParseType type);
// Returns a string derived from `input`.
// NOTE(review): presumably parses `input` into tokens_ and re-serializes each
// token with Token::String(orders_) -- confirm.
std::string Reorder(const std::string& input);
private:
// NOTE(review): the helpers below look like a hand-written scanner over
// text_ driven by index_/ch_; their exact contracts (what the bool results
// mean, what happens at end of input) are not visible here.
void Load(const std::string& input);
bool Read();
bool ParseWs();
bool ParseChar(const std::string& exp);
bool ParseChars(const std::string& exp);
std::string ParseKey();
std::string ParseValue();
void Parse(const std::string& input);
// NOTE(review): presumably the current position within text_. It has no
// in-class initializer, so it must be set before first use -- confirm that
// the constructor or Load() does so.
int index_;
// NOTE(review): presumably the current character, stored as a string so it
// can hold a multi-byte character -- confirm.
std::string ch_;
// NOTE(review): presumably the input split into individual characters.
std::vector<std::string> text_;
// Tokens held by the parser. NOTE(review): presumably filled by Parse().
std::vector<Token> tokens_;
// Token name -> member key order; same shape as TN_ORDERS / ITN_ORDERS.
std::unordered_map<std::string, std::vector<std::string>> orders_;
};
} // namespace wetext
#endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_