@ -1,14 +0,0 @@ |
|||||
# Locations of the WeTextProcessing checkout and of its out-of-tree build.
set(wetext_BUILD_DIR "${LIB_BASE_DIR}/wetextprocessing-build")
set(wetext_SOURCE_DIR "${LIB_BASE_DIR}/wetextprocessing-src")

# Configure and build the runtime once, at configure time, the first time the
# build directory does not exist.
if(NOT EXISTS "${wetext_BUILD_DIR}")
  # Portable replacement for `mkdir -p`.
  file(MAKE_DIRECTORY "${wetext_BUILD_DIR}")
  message(STATUS "${BoldGreen}Install wetextprocessing library${ColourReset}")

  # execute_process() does not run a shell, so "&&" cannot chain commands, and
  # several COMMANDs in one call form a pipeline. Configure and build are
  # therefore two separate calls, each with its exit status checked.
  execute_process(
    COMMAND "${CMAKE_COMMAND}"
            -S "${wetext_SOURCE_DIR}/runtime"
            -B "${wetext_BUILD_DIR}"
            -DCMAKE_BUILD_TYPE=Release
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    WORKING_DIRECTORY "${wetext_BUILD_DIR}"
    RESULT_VARIABLE wetext_step_result)

  if(wetext_step_result EQUAL 0)
    execute_process(
      COMMAND "${CMAKE_COMMAND}" --build "${wetext_BUILD_DIR}" --parallel 4
      WORKING_DIRECTORY "${wetext_BUILD_DIR}"
      RESULT_VARIABLE wetext_step_result)
  endif()

  if(NOT wetext_step_result EQUAL 0)
    # Drop the half-populated directory so the next configure run retries
    # instead of treating the failed build as already installed.
    file(REMOVE_RECURSE "${wetext_BUILD_DIR}")
    message(WARNING
      "Building wetextprocessing failed (result: ${wetext_step_result})")
  endif()
endif()

# Expose the runtime headers to the targets defined after this include.
include_directories(${wetext_SOURCE_DIR}/runtime)
#add_subdirectory(${wetext_SOURCE_DIR}/runtime/utils)
#add_subdirectory(${wetext_SOURCE_DIR}/runtime/processor)
|
@ -1,10 +1,17 @@ |
|||||
add_library(post_processor STATIC |
|
||||
post_processor.cc |
|
||||
|
message(STATUS "post_processor dir:${CMAKE_CURRENT_SOURCE_DIR}") |
||||
|
add_library(wetext_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/utils/wetext_string.cc) |
||||
|
target_link_libraries(wetext_utils PUBLIC glog) |
||||
|
|
||||
|
add_library(wetext_processor STATIC |
||||
|
${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_processor.cc |
||||
|
${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_token_parser.cc |
||||
) |
) |
||||
if(ITN) |
|
||||
include(wetextprocessing) |
|
||||
target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils) |
|
||||
else() |
|
||||
target_link_libraries(post_processor PUBLIC utils) |
|
||||
endif() |
|
||||
|
target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils) |
||||
|
|
||||
|
add_executable(processor_main processor_main.cc) |
||||
|
target_link_libraries(processor_main PUBLIC wetext_processor) |
||||
|
|
||||
|
add_library(post_processor STATIC |
||||
|
post_processor.cc |
||||
|
) |
||||
|
target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils) |
@ -0,0 +1,2 @@ |
|||||
|
# Command-line driver built from processor_main.cc and linked against the
# wetext_processor library.
add_executable(processor_main
  processor_main.cc
)
target_link_libraries(processor_main
  PUBLIC
    wetext_processor
)
@ -0,0 +1,54 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
|
//
|
||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
// you may not use this file except in compliance with the License.
|
||||
|
// You may obtain a copy of the License at
|
||||
|
//
|
||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
//
|
||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
// See the License for the specific language governing permissions and
|
||||
|
// limitations under the License.
|
||||
|
|
||||
|
#include <fstream>
|
||||
|
#include <iostream>
|
||||
|
#include <string>
|
||||
|
|
||||
|
#include "../processor/wetext_processor.h"
|
||||
|
#include "../utils/wetext_flags.h"
|
||||
|
|
||||
|
DEFINE_string(text, "", "input string"); |
||||
|
DEFINE_string(file, "", "input file"); |
||||
|
DEFINE_string(tagger, "", "tagger fst path"); |
||||
|
DEFINE_string(verbalizer, "", "verbalizer fst path"); |
||||
|
|
||||
|
int main(int argc, char* argv[]) { |
||||
|
gflags::ParseCommandLineFlags(&argc, &argv, false); |
||||
|
google::InitGoogleLogging(argv[0]); |
||||
|
|
||||
|
if (FLAGS_tagger.empty() || FLAGS_verbalizer.empty()) { |
||||
|
LOG(FATAL) << "Please provide the tagger and verbalizer fst files."; |
||||
|
} |
||||
|
wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer); |
||||
|
|
||||
|
if (!FLAGS_text.empty()) { |
||||
|
std::string tagged_text = processor.Tag(FLAGS_text); |
||||
|
std::cout << tagged_text << std::endl; |
||||
|
std::string normalized_text = processor.Verbalize(tagged_text); |
||||
|
std::cout << normalized_text << std::endl; |
||||
|
} |
||||
|
|
||||
|
if (!FLAGS_file.empty()) { |
||||
|
std::ifstream file(FLAGS_file); |
||||
|
std::string line; |
||||
|
while (getline(file, line)) { |
||||
|
std::string tagged_text = processor.Tag(line); |
||||
|
std::cout << tagged_text << std::endl; |
||||
|
std::string normalized_text = processor.Verbalize(tagged_text); |
||||
|
std::cout << normalized_text << std::endl; |
||||
|
} |
||||
|
} |
||||
|
return 0; |
||||
|
} |
@ -0,0 +1,13 @@ |
|||||
|
# Static library holding the FST-backed processor and its token parser.
add_library(wetext_processor STATIC
  wetext_processor.cc
  wetext_token_parser.cc
)

# Android and MSVC builds link without libdl; every other toolchain adds it.
if(ANDROID OR MSVC)
  target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
else()
  target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
endif()
@ -0,0 +1,79 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
|
//
|
||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
// you may not use this file except in compliance with the License.
|
||||
|
// You may obtain a copy of the License at
|
||||
|
//
|
||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
//
|
||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
// See the License for the specific language governing permissions and
|
||||
|
// limitations under the License.
|
||||
|
|
||||
|
#include "wetext_processor.h"
|
||||
|
|
||||
|
using fst::StringTokenType; |
||||
|
|
||||
|
namespace wetext { |
||||
|
// Loads the tagger and verbalizer FSTs and creates the byte-level string
// compiler and printer used by Compose() and ShortestPath().
//
// The processing direction is taken from the tagger path: it must contain
// "_tn_" (text normalization) or "_itn_" (inverse text normalization).
// Aborts through LOG(FATAL) when either FST cannot be read or when the path
// contains neither marker.
Processor::Processor(const std::string& tagger_path,
                     const std::string& verbalizer_path) {
  // StdVectorFst::Read yields a null pointer on failure; stop here rather
  // than dereferencing it later in Compose().
  tagger_.reset(StdVectorFst::Read(tagger_path));
  if (tagger_ == nullptr) {
    LOG(FATAL) << "Failed to read tagger fst: " << tagger_path;
  }
  verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
  if (verbalizer_ == nullptr) {
    LOG(FATAL) << "Failed to read verbalizer fst: " << verbalizer_path;
  }
  compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
  printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);

  if (tagger_path.find("_tn_") != std::string::npos) {
    parse_type_ = ParseType::kTN;
  } else if (tagger_path.find("_itn_") != std::string::npos) {
    parse_type_ = ParseType::kITN;
  } else {
    LOG(FATAL) << "Invalid fst prefix, prefix should contain"
               << " either \"_tn_\" or \"_itn_\".";
  }
}
||||
|
|
||||
|
std::string Processor::ShortestPath(const StdVectorFst& lattice) { |
||||
|
StdVectorFst shortest_path; |
||||
|
fst::ShortestPath(lattice, &shortest_path, 1, true); |
||||
|
|
||||
|
std::string output; |
||||
|
printer_->operator()(shortest_path, &output); |
||||
|
return output; |
||||
|
} |
||||
|
|
||||
|
std::string Processor::Compose(const std::string& input, |
||||
|
const StdVectorFst* fst) { |
||||
|
StdVectorFst input_fst; |
||||
|
compiler_->operator()(input, &input_fst); |
||||
|
|
||||
|
StdVectorFst lattice; |
||||
|
fst::Compose(input_fst, *fst, &lattice); |
||||
|
return ShortestPath(lattice); |
||||
|
} |
||||
|
|
||||
|
std::string Processor::Tag(const std::string& input) { |
||||
|
if (input.empty()) { |
||||
|
return ""; |
||||
|
} |
||||
|
return Compose(input, tagger_.get()); |
||||
|
} |
||||
|
|
||||
|
std::string Processor::Verbalize(const std::string& input) { |
||||
|
if (input.empty()) { |
||||
|
return ""; |
||||
|
} |
||||
|
TokenParser parser(parse_type_); |
||||
|
std::string output = parser.Reorder(input); |
||||
|
|
||||
|
output = Compose(output, verbalizer_.get()); |
||||
|
output.erase(std::remove(output.begin(), output.end(), '\0'), output.end()); |
||||
|
return output; |
||||
|
} |
||||
|
|
||||
|
std::string Processor::Normalize(const std::string& input) { |
||||
|
return Verbalize(Tag(input)); |
||||
|
} |
||||
|
|
||||
|
} // namespace wetext
|
@ -0,0 +1,51 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
||||
|
// |
||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
// you may not use this file except in compliance with the License. |
||||
|
// You may obtain a copy of the License at |
||||
|
// |
||||
|
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
// |
||||
|
// Unless required by applicable law or agreed to in writing, software |
||||
|
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
// See the License for the specific language governing permissions and |
||||
|
// limitations under the License. |
||||
|
|
||||
|
#ifndef PROCESSOR_WETEXT_PROCESSOR_H_ |
||||
|
#define PROCESSOR_WETEXT_PROCESSOR_H_ |
||||
|
|
||||
|
#include <memory> |
||||
|
#include <string> |
||||
|
|
||||
|
#include "fst/fstlib.h" |
||||
|
|
||||
|
#include "wetext_token_parser.h" |
||||
|
|
||||
|
using fst::StdArc; |
||||
|
using fst::StdVectorFst; |
||||
|
using fst::StringCompiler; |
||||
|
using fst::StringPrinter; |
||||
|
|
||||
|
namespace wetext { |
||||
|
class Processor { |
||||
|
public: |
||||
|
Processor(const std::string& tagger_path, const std::string& verbalizer_path); |
||||
|
std::string Tag(const std::string& input); |
||||
|
std::string Verbalize(const std::string& input); |
||||
|
std::string Normalize(const std::string& input); |
||||
|
|
||||
|
private: |
||||
|
std::string ShortestPath(const StdVectorFst& lattice); |
||||
|
std::string Compose(const std::string& input, const StdVectorFst* fst); |
||||
|
|
||||
|
ParseType parse_type_; |
||||
|
std::shared_ptr<StdVectorFst> tagger_ = nullptr; |
||||
|
std::shared_ptr<StdVectorFst> verbalizer_ = nullptr; |
||||
|
std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr; |
||||
|
std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr; |
||||
|
}; |
||||
|
|
||||
|
} // namespace wetext |
||||
|
|
||||
|
#endif // PROCESSOR_WETEXT_PROCESSOR_H_ |
@ -0,0 +1,153 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
|
//
|
||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
// you may not use this file except in compliance with the License.
|
||||
|
// You may obtain a copy of the License at
|
||||
|
//
|
||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
//
|
||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
// See the License for the specific language governing permissions and
|
||||
|
// limitations under the License.
|
||||
|
|
||||
|
#include "wetext_token_parser.h"
|
||||
|
|
||||
|
#include "../utils/wetext_log.h"
|
||||
|
#include "../utils/wetext_string.h"
|
||||
|
|
||||
|
namespace wetext { |
||||
|
// Sentinel stored in the parser's current character once input is exhausted.
const char EOS[] = "<EOS>";

// Whitespace characters, one set entry per character. Lookups are made with
// a single character at a time, so vertical tab and form feed must be
// separate entries rather than one two-character string.
const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
                                               "\r", "\x0b", "\x0c"};

// Characters accepted inside a key or token name: ASCII letters and '_'.
const std::set<std::string> ASCII_LETTERS = {
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};

// Member output order per token name for text normalization.
const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"denominator", "numerator"}},
    {"measure", {"denominator", "numerator", "value"}},
    {"money", {"value", "currency"}},
    {"time", {"noon", "hour", "minute", "second"}}};

// Member output order per token name for inverse text normalization.
const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"sign", "numerator", "denominator"}},
    {"measure", {"numerator", "denominator", "value"}},
    {"money", {"currency", "value", "decimal"}},
    {"time", {"hour", "minute", "second", "noon"}}};
||||
|
|
||||
|
// Selects the member ordering table that matches the requested direction:
// TN_ORDERS for kTN, ITN_ORDERS for anything else.
TokenParser::TokenParser(ParseType type)
    : orders_(type == ParseType::kTN ? TN_ORDERS : ITN_ORDERS) {}
||||
|
|
||||
|
// Splits `input` into UTF-8 characters and positions the parser on the first
// one. The input must contain at least one character.
void TokenParser::Load(const std::string& input) {
  wetext::SplitUTF8StringToChars(input, &text_);
  CHECK_GT(text_.size(), 0);
  ch_ = text_.front();
  index_ = 0;
}
||||
|
|
||||
|
bool TokenParser::Read() { |
||||
|
if (index_ < text_.size() - 1) { |
||||
|
index_ += 1; |
||||
|
ch_ = text_[index_]; |
||||
|
return true; |
||||
|
} |
||||
|
ch_ = EOS; |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
bool TokenParser::ParseWs() { |
||||
|
bool not_eos = ch_ != EOS; |
||||
|
while (not_eos && ch_ == " ") { |
||||
|
not_eos = Read(); |
||||
|
} |
||||
|
return not_eos; |
||||
|
} |
||||
|
|
||||
|
bool TokenParser::ParseChar(const std::string& exp) { |
||||
|
if (ch_ == exp) { |
||||
|
Read(); |
||||
|
return true; |
||||
|
} |
||||
|
return false; |
||||
|
} |
||||
|
|
||||
|
bool TokenParser::ParseChars(const std::string& exp) { |
||||
|
bool ok = false; |
||||
|
std::vector<std::string> chars; |
||||
|
wetext::SplitUTF8StringToChars(exp, &chars); |
||||
|
for (const auto& x : chars) { |
||||
|
ok |= ParseChar(x); |
||||
|
} |
||||
|
return ok; |
||||
|
} |
||||
|
|
||||
|
std::string TokenParser::ParseKey() { |
||||
|
CHECK_NE(ch_, EOS); |
||||
|
CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0); |
||||
|
|
||||
|
std::string key = ""; |
||||
|
while (ASCII_LETTERS.count(ch_) > 0) { |
||||
|
key += ch_; |
||||
|
Read(); |
||||
|
} |
||||
|
return key; |
||||
|
} |
||||
|
|
||||
|
std::string TokenParser::ParseValue() { |
||||
|
CHECK_NE(ch_, EOS); |
||||
|
bool escape = false; |
||||
|
|
||||
|
std::string value = ""; |
||||
|
while (ch_ != "\"") { |
||||
|
value += ch_; |
||||
|
escape = ch_ == "\\" && !escape; |
||||
|
Read(); |
||||
|
if (escape) { |
||||
|
value += ch_; |
||||
|
Read(); |
||||
|
} |
||||
|
} |
||||
|
return value; |
||||
|
} |
||||
|
|
||||
|
void TokenParser::Parse(const std::string& input) { |
||||
|
Load(input); |
||||
|
while (ParseWs()) { |
||||
|
std::string name = ParseKey(); |
||||
|
ParseChars(" { "); |
||||
|
|
||||
|
Token token(name); |
||||
|
while (ParseWs()) { |
||||
|
if (ch_ == "}") { |
||||
|
ParseChar("}"); |
||||
|
break; |
||||
|
} |
||||
|
std::string key = ParseKey(); |
||||
|
ParseChars(": \""); |
||||
|
std::string value = ParseValue(); |
||||
|
ParseChar("\""); |
||||
|
token.Append(key, value); |
||||
|
} |
||||
|
tokens_.emplace_back(token); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
std::string TokenParser::Reorder(const std::string& input) { |
||||
|
Parse(input); |
||||
|
std::string output = ""; |
||||
|
for (auto& token : tokens_) { |
||||
|
output += token.String(orders_) + " "; |
||||
|
} |
||||
|
return Trim(output); |
||||
|
} |
||||
|
|
||||
|
} // namespace wetext
|
@ -0,0 +1,91 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
||||
|
// |
||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
// you may not use this file except in compliance with the License. |
||||
|
// You may obtain a copy of the License at |
||||
|
// |
||||
|
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
// |
||||
|
// Unless required by applicable law or agreed to in writing, software |
||||
|
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
// See the License for the specific language governing permissions and |
||||
|
// limitations under the License. |
||||
|
|
||||
|
#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_ |
||||
|
#define PROCESSOR_WETEXT_TOKEN_PARSER_H_ |
||||
|
|
||||
|
#include <set> |
||||
|
#include <string> |
||||
|
#include <unordered_map> |
||||
|
#include <vector> |
||||
|
|
||||
|
namespace wetext { |
||||
|
|
||||
|
extern const char EOS[]; |
||||
|
extern const std::set<std::string> UTF8_WHITESPACE; |
||||
|
extern const std::set<std::string> ASCII_LETTERS; |
||||
|
extern const std::unordered_map<std::string, std::vector<std::string>> |
||||
|
TN_ORDERS; |
||||
|
extern const std::unordered_map<std::string, std::vector<std::string>> |
||||
|
ITN_ORDERS; |
||||
|
|
||||
|
// One tagged token: a type name plus its key/value members.
struct Token {
  std::string name;
  // Member keys in the order they were appended.
  std::vector<std::string> order;
  std::unordered_map<std::string, std::string> members;

  explicit Token(const std::string& name) : name(name) {}

  // Records `key` at the end of the insertion order and stores its value. A
  // repeated key overwrites the stored value but is listed again in `order`.
  void Append(const std::string& key, const std::string& value) {
    order.emplace_back(key);
    members[key] = value;
  }

  // Renders the token as `name { key: "value" ... }`.
  //
  // When `orders` has an entry for this token's name, members are emitted in
  // that order and keys the token does not hold are skipped. Otherwise the
  // insertion order is used. The token is left unmodified.
  std::string String(
      const std::unordered_map<std::string, std::vector<std::string>>& orders)
      const {
    const auto preset = orders.find(name);
    const std::vector<std::string>& keys =
        preset != orders.end() ? preset->second : order;

    std::string output = name + " {";
    for (const auto& key : keys) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " " + key + ": \"" + member->second + "\"";
    }
    return output + " }";
  }
};
||||
|
|
||||
|
// Processing direction used to pick the member ordering table.
enum ParseType {
  kTN = 0,   // Text Normalization
  kITN = 1,  // Inverse Text Normalization
};
||||
|
|
||||
|
class TokenParser { |
||||
|
public: |
||||
|
explicit TokenParser(ParseType type); |
||||
|
std::string Reorder(const std::string& input); |
||||
|
|
||||
|
private: |
||||
|
void Load(const std::string& input); |
||||
|
bool Read(); |
||||
|
bool ParseWs(); |
||||
|
bool ParseChar(const std::string& exp); |
||||
|
bool ParseChars(const std::string& exp); |
||||
|
std::string ParseKey(); |
||||
|
std::string ParseValue(); |
||||
|
void Parse(const std::string& input); |
||||
|
|
||||
|
int index_; |
||||
|
std::string ch_; |
||||
|
std::vector<std::string> text_; |
||||
|
std::vector<Token> tokens_; |
||||
|
std::unordered_map<std::string, std::vector<std::string>> orders_; |
||||
|
}; |
||||
|
|
||||
|
} // namespace wetext |
||||
|
|
||||
|
#endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_ |
@ -0,0 +1 @@ |
|||||
|
|
@ -0,0 +1,23 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
||||
|
// |
||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
// you may not use this file except in compliance with the License. |
||||
|
// You may obtain a copy of the License at |
||||
|
// |
||||
|
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
// |
||||
|
// Unless required by applicable law or agreed to in writing, software |
||||
|
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
// See the License for the specific language governing permissions and |
||||
|
// limitations under the License. |
||||
|
|
||||
|
#ifndef UTILS_WETEXT_FLAGS_H_ |
||||
|
#define UTILS_WETEXT_FLAGS_H_ |
||||
|
|
||||
|
// Because openfst is a dynamic library compiled with gflags/glog, we must use
// the gflags/glog from openfst to avoid having them linked both statically
// and dynamically into the executable.
||||
|
#include "../../../utils/flags.h" |
||||
|
|
||||
|
#endif // UTILS_WETEXT_FLAGS_H_ |
@ -0,0 +1,23 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
||||
|
// |
||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
// you may not use this file except in compliance with the License. |
||||
|
// You may obtain a copy of the License at |
||||
|
// |
||||
|
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
// |
||||
|
// Unless required by applicable law or agreed to in writing, software |
||||
|
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
// See the License for the specific language governing permissions and |
||||
|
// limitations under the License. |
||||
|
|
||||
|
#ifndef UTILS_WETEXT_LOG_H_ |
||||
|
#define UTILS_WETEXT_LOG_H_ |
||||
|
|
||||
|
// Because openfst is a dynamic library compiled with gflags/glog, we must use
// the gflags/glog from openfst to avoid having them linked both statically
// and dynamically into the executable.
||||
|
#include "../../../utils/log.h" |
||||
|
|
||||
|
#endif // UTILS_WETEXT_LOG_H_ |
@ -0,0 +1,89 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
||||
|
//
|
||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
|
// you may not use this file except in compliance with the License.
|
||||
|
// You may obtain a copy of the License at
|
||||
|
//
|
||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
//
|
||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
// See the License for the specific language governing permissions and
|
||||
|
// limitations under the License.
|
||||
|
|
||||
|
#include "wetext_string.h"
|
||||
|
|
||||
|
#include "wetext_log.h"
|
||||
|
|
||||
|
namespace wetext { |
||||
|
const char* WHITESPACE = " \n\r\t\f\v"; |
||||
|
|
||||
|
// Returns the number of bytes in the UTF-8 sequence introduced by lead byte
// `ch`, based on its high bits. A byte that matches none of the four lead
// patterns is reported as length 1.
int UTF8CharLength(char ch) {
  CHECK_LE((ch & 0xF8), 0xF0);

  // 0xxxxxxx: US-ASCII, a single byte.
  if ((ch & 0x80) == 0x00) {
    return 1;
  }
  // 110xxxxx: two-byte sequence.
  if ((ch & 0xE0) == 0xC0) {
    return 2;
  }
  // 1110xxxx: three-byte sequence (rest of the Basic Multilingual Plane,
  // including most Chinese, Japanese and Korean characters).
  if ((ch & 0xF0) == 0xE0) {
    return 3;
  }
  // 11110xxx: four-byte sequence (other Unicode planes, e.g. emoji).
  if ((ch & 0xF8) == 0xF0) {
    return 4;
  }
  return 1;
}
||||
|
|
||||
|
int UTF8StringLength(const std::string& str) { |
||||
|
int len = 0; |
||||
|
int num_bytes = 1; |
||||
|
for (size_t i = 0; i < str.length(); i += num_bytes) { |
||||
|
num_bytes = UTF8CharLength(str[i]); |
||||
|
++len; |
||||
|
} |
||||
|
return len; |
||||
|
} |
||||
|
|
||||
|
void SplitUTF8StringToChars(const std::string& str, |
||||
|
std::vector<std::string>* chars) { |
||||
|
chars->clear(); |
||||
|
int num_bytes = 1; |
||||
|
for (size_t i = 0; i < str.length(); i += num_bytes) { |
||||
|
num_bytes = UTF8CharLength(str[i]); |
||||
|
chars->push_back(str.substr(i, num_bytes)); |
||||
|
} |
||||
|
} |
||||
|
|
||||
|
std::string Ltrim(const std::string& str) { |
||||
|
size_t start = str.find_first_not_of(WHITESPACE); |
||||
|
return (start == std::string::npos) ? "" : str.substr(start); |
||||
|
} |
||||
|
|
||||
|
std::string Rtrim(const std::string& str) { |
||||
|
size_t end = str.find_last_not_of(WHITESPACE); |
||||
|
return end == std::string::npos ? "" : str.substr(0, end + 1); |
||||
|
} |
||||
|
|
||||
|
// Returns `str` with both leading and trailing whitespace removed.
std::string Trim(const std::string& str) {
  const std::string left_trimmed = Ltrim(str);
  return Rtrim(left_trimmed);
}
||||
|
|
||||
|
// Splits `str` on every occurrence of `delim` and appends the pieces to
// `output`. Existing elements of `output` are kept.
//
// Adjacent delimiters produce empty pieces, and the text after the last
// delimiter (possibly empty) is always appended. An empty `delim` cannot
// split anything, so `str` is appended unchanged; the previous
// implementation never terminated in that case.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output) {
  if (delim.empty()) {
    output->emplace_back(str);
    return;
  }

  // Scan with a moving offset instead of copying and erasing the string.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = str.find(delim, start)) != std::string::npos) {
    output->emplace_back(str.substr(start, pos - start));
    start = pos + delim.length();
  }
  output->emplace_back(str.substr(start));
}
||||
|
|
||||
|
} // namespace wetext
|
@ -0,0 +1,42 @@ |
|||||
|
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
||||
|
// |
||||
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
||||
|
// you may not use this file except in compliance with the License. |
||||
|
// You may obtain a copy of the License at |
||||
|
// |
||||
|
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
// |
||||
|
// Unless required by applicable law or agreed to in writing, software |
||||
|
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
|
// See the License for the specific language governing permissions and |
||||
|
// limitations under the License. |
||||
|
|
||||
|
#ifndef UTILS_WETEXT_STRING_H_ |
||||
|
#define UTILS_WETEXT_STRING_H_ |
||||
|
|
||||
|
#include <string> |
||||
|
#include <vector> |
||||
|
|
||||
|
namespace wetext { |
||||
|
extern const char* WHITESPACE; |
||||
|
|
||||
|
int UTF8CharLength(char ch); |
||||
|
|
||||
|
int UTF8StringLength(const std::string& str); |
||||
|
|
||||
|
void SplitUTF8StringToChars(const std::string& str, |
||||
|
std::vector<std::string>* chars); |
||||
|
|
||||
|
std::string Ltrim(const std::string& str); |
||||
|
|
||||
|
std::string Rtrim(const std::string& str); |
||||
|
|
||||
|
std::string Trim(const std::string& str); |
||||
|
|
||||
|
void Split(const std::string& str, const std::string& delim, |
||||
|
std::vector<std::string>* output); |
||||
|
|
||||
|
} // namespace wetext |
||||
|
|
||||
|
#endif // UTILS_WETEXT_STRING_H_ |