@ -1,14 +0,0 @@ |
|||
# Configure-time build of the bundled wetextprocessing runtime.
#
# Inputs : LIB_BASE_DIR (directory holding wetextprocessing-src).
# Outputs: wetext_BUILD_DIR / wetext_SOURCE_DIR variables and a directory-scope
#          include path pointing at the runtime sources.
set(wetext_BUILD_DIR "${LIB_BASE_DIR}/wetextprocessing-build")
set(wetext_SOURCE_DIR "${LIB_BASE_DIR}/wetextprocessing-src")

if(NOT EXISTS "${wetext_BUILD_DIR}")
  # Portable replacement for `mkdir -p`.
  file(MAKE_DIRECTORY "${wetext_BUILD_DIR}")
  message(STATUS "${BoldGreen}Install wetextprocessing library${ColourReset}")

  # execute_process() does not go through a shell, so `&&` cannot chain
  # commands; configure and build are therefore run as two separate steps.
  # Position-independent code is requested through the CMake variable instead
  # of passing compiler flags on the cmake command line.
  execute_process(
    COMMAND "${CMAKE_COMMAND}"
            -S "${wetext_SOURCE_DIR}/runtime"
            -B "${wetext_BUILD_DIR}"
            -DCMAKE_BUILD_TYPE=Release
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    WORKING_DIRECTORY "${wetext_BUILD_DIR}"
    RESULT_VARIABLE wetext_configure_result)
  if(NOT wetext_configure_result EQUAL 0)
    message(FATAL_ERROR
      "Configuring wetextprocessing failed (exit code: ${wetext_configure_result})")
  endif()

  execute_process(
    COMMAND "${CMAKE_COMMAND}" --build "${wetext_BUILD_DIR}" --parallel 4
    WORKING_DIRECTORY "${wetext_BUILD_DIR}"
    RESULT_VARIABLE wetext_build_result)
  if(NOT wetext_build_result EQUAL 0)
    message(FATAL_ERROR
      "Building wetextprocessing failed (exit code: ${wetext_build_result})")
  endif()
endif()

include_directories("${wetext_SOURCE_DIR}/runtime")
#add_subdirectory(${wetext_SOURCE_DIR}/runtime/utils)
#add_subdirectory(${wetext_SOURCE_DIR}/runtime/processor)
@ -1,10 +1,17 @@ |
|||
# Build script for the text post-processing libraries.
#
# NOTE(review): the previous revision linked the wetext libraries only when ITN
# was set and pulled them in through include(wetextprocessing). The sources now
# live in this directory, so each target is defined exactly once and
# post_processor always links them — confirm that dropping the ITN switch is
# intended.
message(STATUS "post_processor dir:${CMAKE_CURRENT_SOURCE_DIR}")

# String helpers shared by the processor.
add_library(wetext_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/utils/wetext_string.cc)
target_link_libraries(wetext_utils PUBLIC glog)

# FST-based tagger/verbalizer.
add_library(wetext_processor STATIC
  ${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_processor.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_token_parser.cc
)
# CMAKE_DL_LIBS expands to the platform's dlopen library (empty where none is
# needed), unlike a hard-coded `dl`.
target_link_libraries(wetext_processor PUBLIC ${CMAKE_DL_LIBS} fst wetext_utils)

# Command-line driver for the processor.
add_executable(processor_main processor_main.cc)
target_link_libraries(processor_main PUBLIC wetext_processor)

add_library(post_processor STATIC
  post_processor.cc
)
target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)
@ -0,0 +1,2 @@ |
|||
# Command-line driver that runs the tagger and verbalizer on text or a file.
add_executable(processor_main
  processor_main.cc
)
target_link_libraries(processor_main
  PUBLIC
    wetext_processor
)
@ -0,0 +1,54 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
|||
//
|
|||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|||
// you may not use this file except in compliance with the License.
|
|||
// You may obtain a copy of the License at
|
|||
//
|
|||
// http://www.apache.org/licenses/LICENSE-2.0
|
|||
//
|
|||
// Unless required by applicable law or agreed to in writing, software
|
|||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
// See the License for the specific language governing permissions and
|
|||
// limitations under the License.
|
|||
|
|||
#include <fstream>
|
|||
#include <iostream>
|
|||
#include <string>
|
|||
|
|||
#include "../processor/wetext_processor.h"
|
|||
#include "../utils/wetext_flags.h"
|
|||
|
|||
DEFINE_string(text, "", "input string"); |
|||
DEFINE_string(file, "", "input file"); |
|||
DEFINE_string(tagger, "", "tagger fst path"); |
|||
DEFINE_string(verbalizer, "", "verbalizer fst path"); |
|||
|
|||
int main(int argc, char* argv[]) { |
|||
gflags::ParseCommandLineFlags(&argc, &argv, false); |
|||
google::InitGoogleLogging(argv[0]); |
|||
|
|||
if (FLAGS_tagger.empty() || FLAGS_verbalizer.empty()) { |
|||
LOG(FATAL) << "Please provide the tagger and verbalizer fst files."; |
|||
} |
|||
wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer); |
|||
|
|||
if (!FLAGS_text.empty()) { |
|||
std::string tagged_text = processor.Tag(FLAGS_text); |
|||
std::cout << tagged_text << std::endl; |
|||
std::string normalized_text = processor.Verbalize(tagged_text); |
|||
std::cout << normalized_text << std::endl; |
|||
} |
|||
|
|||
if (!FLAGS_file.empty()) { |
|||
std::ifstream file(FLAGS_file); |
|||
std::string line; |
|||
while (getline(file, line)) { |
|||
std::string tagged_text = processor.Tag(line); |
|||
std::cout << tagged_text << std::endl; |
|||
std::string normalized_text = processor.Verbalize(tagged_text); |
|||
std::cout << normalized_text << std::endl; |
|||
} |
|||
} |
|||
return 0; |
|||
} |
@ -0,0 +1,13 @@ |
|||
# FST-based tagger/verbalizer library.
add_library(wetext_processor STATIC
  wetext_processor.cc
  wetext_token_parser.cc
)

if(ANDROID OR MSVC)
  # These platforms are linked without an explicit dynamic-loader library.
  target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
else()
  # CMAKE_DL_LIBS names the library providing dlopen/dlclose on the current
  # platform and is empty where none is required.
  target_link_libraries(wetext_processor PUBLIC ${CMAKE_DL_LIBS} fst wetext_utils)
endif()
@ -0,0 +1,79 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
|||
//
|
|||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|||
// you may not use this file except in compliance with the License.
|
|||
// You may obtain a copy of the License at
|
|||
//
|
|||
// http://www.apache.org/licenses/LICENSE-2.0
|
|||
//
|
|||
// Unless required by applicable law or agreed to in writing, software
|
|||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
// See the License for the specific language governing permissions and
|
|||
// limitations under the License.
|
|||
|
|||
#include "wetext_processor.h"
|
|||
|
|||
using fst::StringTokenType; |
|||
|
|||
namespace wetext { |
|||
// Loads the tagger and verbalizer FSTs and picks the token ordering mode.
//
// tagger_path:     path of the tagger FST; must contain "_tn_" or "_itn_",
//                  which selects ParseType::kTN or ParseType::kITN ("_tn_" is
//                  tested first).
// verbalizer_path: path of the verbalizer FST.
//
// Logs a FATAL message when either FST cannot be read or when the tagger path
// carries neither marker.
Processor::Processor(const std::string& tagger_path,
                     const std::string& verbalizer_path) {
  tagger_.reset(StdVectorFst::Read(tagger_path));
  // Read() yields a null pointer on failure; fail here rather than
  // dereferencing it later in Compose().
  if (tagger_ == nullptr) {
    LOG(FATAL) << "Failed to read tagger fst: " << tagger_path;
  }
  verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
  if (verbalizer_ == nullptr) {
    LOG(FATAL) << "Failed to read verbalizer fst: " << verbalizer_path;
  }
  compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
  printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);

  if (tagger_path.find("_tn_") != tagger_path.npos) {
    parse_type_ = ParseType::kTN;
  } else if (tagger_path.find("_itn_") != tagger_path.npos) {
    parse_type_ = ParseType::kITN;
  } else {
    LOG(FATAL) << "Invalid fst prefix, prefix should contain"
               << " either \"_tn_\" or \"_itn_\".";
  }
}
|||
|
|||
std::string Processor::ShortestPath(const StdVectorFst& lattice) { |
|||
StdVectorFst shortest_path; |
|||
fst::ShortestPath(lattice, &shortest_path, 1, true); |
|||
|
|||
std::string output; |
|||
printer_->operator()(shortest_path, &output); |
|||
return output; |
|||
} |
|||
|
|||
std::string Processor::Compose(const std::string& input, |
|||
const StdVectorFst* fst) { |
|||
StdVectorFst input_fst; |
|||
compiler_->operator()(input, &input_fst); |
|||
|
|||
StdVectorFst lattice; |
|||
fst::Compose(input_fst, *fst, &lattice); |
|||
return ShortestPath(lattice); |
|||
} |
|||
|
|||
std::string Processor::Tag(const std::string& input) { |
|||
if (input.empty()) { |
|||
return ""; |
|||
} |
|||
return Compose(input, tagger_.get()); |
|||
} |
|||
|
|||
std::string Processor::Verbalize(const std::string& input) { |
|||
if (input.empty()) { |
|||
return ""; |
|||
} |
|||
TokenParser parser(parse_type_); |
|||
std::string output = parser.Reorder(input); |
|||
|
|||
output = Compose(output, verbalizer_.get()); |
|||
output.erase(std::remove(output.begin(), output.end(), '\0'), output.end()); |
|||
return output; |
|||
} |
|||
|
|||
std::string Processor::Normalize(const std::string& input) { |
|||
return Verbalize(Tag(input)); |
|||
} |
|||
|
|||
} // namespace wetext
|
@ -0,0 +1,51 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
|||
// |
|||
// Licensed under the Apache License, Version 2.0 (the "License"); |
|||
// you may not use this file except in compliance with the License. |
|||
// You may obtain a copy of the License at |
|||
// |
|||
// http://www.apache.org/licenses/LICENSE-2.0 |
|||
// |
|||
// Unless required by applicable law or agreed to in writing, software |
|||
// distributed under the License is distributed on an "AS IS" BASIS, |
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
// See the License for the specific language governing permissions and |
|||
// limitations under the License. |
|||
|
|||
#ifndef PROCESSOR_WETEXT_PROCESSOR_H_ |
|||
#define PROCESSOR_WETEXT_PROCESSOR_H_ |
|||
|
|||
#include <memory> |
|||
#include <string> |
|||
|
|||
#include "fst/fstlib.h" |
|||
|
|||
#include "wetext_token_parser.h" |
|||
|
|||
using fst::StdArc; |
|||
using fst::StdVectorFst; |
|||
using fst::StringCompiler; |
|||
using fst::StringPrinter; |
|||
|
|||
namespace wetext { |
|||
class Processor { |
|||
public: |
|||
Processor(const std::string& tagger_path, const std::string& verbalizer_path); |
|||
std::string Tag(const std::string& input); |
|||
std::string Verbalize(const std::string& input); |
|||
std::string Normalize(const std::string& input); |
|||
|
|||
private: |
|||
std::string ShortestPath(const StdVectorFst& lattice); |
|||
std::string Compose(const std::string& input, const StdVectorFst* fst); |
|||
|
|||
ParseType parse_type_; |
|||
std::shared_ptr<StdVectorFst> tagger_ = nullptr; |
|||
std::shared_ptr<StdVectorFst> verbalizer_ = nullptr; |
|||
std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr; |
|||
std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr; |
|||
}; |
|||
|
|||
} // namespace wetext |
|||
|
|||
#endif // PROCESSOR_WETEXT_PROCESSOR_H_ |
@ -0,0 +1,153 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
|||
//
|
|||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|||
// you may not use this file except in compliance with the License.
|
|||
// You may obtain a copy of the License at
|
|||
//
|
|||
// http://www.apache.org/licenses/LICENSE-2.0
|
|||
//
|
|||
// Unless required by applicable law or agreed to in writing, software
|
|||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
// See the License for the specific language governing permissions and
|
|||
// limitations under the License.
|
|||
|
|||
#include "wetext_token_parser.h"
|
|||
|
|||
#include "../utils/wetext_log.h"
|
|||
#include "../utils/wetext_string.h"
|
|||
|
|||
namespace wetext { |
|||
// Sentinel stored in the parser's current character once input is exhausted.
const char EOS[] = "<EOS>";

// Single-character whitespace strings. Vertical tab and form feed are listed
// as separate entries so each can match one parsed character.
const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
                                               "\r", "\x0b", "\x0c"};

// Characters accepted in token and field names: ASCII letters and '_'.
const std::set<std::string> ASCII_LETTERS = {
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};

// Field output order per token name, used for text normalization.
const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"denominator", "numerator"}},
    {"measure", {"denominator", "numerator", "value"}},
    {"money", {"value", "currency"}},
    {"time", {"noon", "hour", "minute", "second"}}};

// Field output order per token name, used for inverse text normalization.
const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"sign", "numerator", "denominator"}},
    {"measure", {"numerator", "denominator", "value"}},
    {"money", {"currency", "value", "decimal"}},
    {"time", {"hour", "minute", "second", "noon"}}};
|||
|
|||
// Selects the field-order table: TN_ORDERS for kTN, ITN_ORDERS otherwise.
TokenParser::TokenParser(ParseType type) {
  const bool is_tn = (type == ParseType::kTN);
  orders_ = is_tn ? TN_ORDERS : ITN_ORDERS;
}
|||
|
|||
// Splits `input` into UTF-8 characters and positions the cursor on the first
// one. The CHECK fails when the split produces no characters.
void TokenParser::Load(const std::string& input) {
  wetext::SplitUTF8StringToChars(input, &text_);
  CHECK_GT(text_.size(), 0);
  ch_ = text_[0];
  index_ = 0;
}
|||
|
|||
bool TokenParser::Read() { |
|||
if (index_ < text_.size() - 1) { |
|||
index_ += 1; |
|||
ch_ = text_[index_]; |
|||
return true; |
|||
} |
|||
ch_ = EOS; |
|||
return false; |
|||
} |
|||
|
|||
bool TokenParser::ParseWs() { |
|||
bool not_eos = ch_ != EOS; |
|||
while (not_eos && ch_ == " ") { |
|||
not_eos = Read(); |
|||
} |
|||
return not_eos; |
|||
} |
|||
|
|||
bool TokenParser::ParseChar(const std::string& exp) { |
|||
if (ch_ == exp) { |
|||
Read(); |
|||
return true; |
|||
} |
|||
return false; |
|||
} |
|||
|
|||
bool TokenParser::ParseChars(const std::string& exp) { |
|||
bool ok = false; |
|||
std::vector<std::string> chars; |
|||
wetext::SplitUTF8StringToChars(exp, &chars); |
|||
for (const auto& x : chars) { |
|||
ok |= ParseChar(x); |
|||
} |
|||
return ok; |
|||
} |
|||
|
|||
std::string TokenParser::ParseKey() { |
|||
CHECK_NE(ch_, EOS); |
|||
CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0); |
|||
|
|||
std::string key = ""; |
|||
while (ASCII_LETTERS.count(ch_) > 0) { |
|||
key += ch_; |
|||
Read(); |
|||
} |
|||
return key; |
|||
} |
|||
|
|||
std::string TokenParser::ParseValue() { |
|||
CHECK_NE(ch_, EOS); |
|||
bool escape = false; |
|||
|
|||
std::string value = ""; |
|||
while (ch_ != "\"") { |
|||
value += ch_; |
|||
escape = ch_ == "\\" && !escape; |
|||
Read(); |
|||
if (escape) { |
|||
value += ch_; |
|||
Read(); |
|||
} |
|||
} |
|||
return value; |
|||
} |
|||
|
|||
void TokenParser::Parse(const std::string& input) { |
|||
Load(input); |
|||
while (ParseWs()) { |
|||
std::string name = ParseKey(); |
|||
ParseChars(" { "); |
|||
|
|||
Token token(name); |
|||
while (ParseWs()) { |
|||
if (ch_ == "}") { |
|||
ParseChar("}"); |
|||
break; |
|||
} |
|||
std::string key = ParseKey(); |
|||
ParseChars(": \""); |
|||
std::string value = ParseValue(); |
|||
ParseChar("\""); |
|||
token.Append(key, value); |
|||
} |
|||
tokens_.emplace_back(token); |
|||
} |
|||
} |
|||
|
|||
std::string TokenParser::Reorder(const std::string& input) { |
|||
Parse(input); |
|||
std::string output = ""; |
|||
for (auto& token : tokens_) { |
|||
output += token.String(orders_) + " "; |
|||
} |
|||
return Trim(output); |
|||
} |
|||
|
|||
} // namespace wetext
|
@ -0,0 +1,91 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
|||
// |
|||
// Licensed under the Apache License, Version 2.0 (the "License"); |
|||
// you may not use this file except in compliance with the License. |
|||
// You may obtain a copy of the License at |
|||
// |
|||
// http://www.apache.org/licenses/LICENSE-2.0 |
|||
// |
|||
// Unless required by applicable law or agreed to in writing, software |
|||
// distributed under the License is distributed on an "AS IS" BASIS, |
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
// See the License for the specific language governing permissions and |
|||
// limitations under the License. |
|||
|
|||
#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_ |
|||
#define PROCESSOR_WETEXT_TOKEN_PARSER_H_ |
|||
|
|||
#include <set> |
|||
#include <string> |
|||
#include <unordered_map> |
|||
#include <vector> |
|||
|
|||
namespace wetext { |
|||
|
|||
extern const char EOS[]; |
|||
extern const std::set<std::string> UTF8_WHITESPACE; |
|||
extern const std::set<std::string> ASCII_LETTERS; |
|||
extern const std::unordered_map<std::string, std::vector<std::string>> |
|||
TN_ORDERS; |
|||
extern const std::unordered_map<std::string, std::vector<std::string>> |
|||
ITN_ORDERS; |
|||
|
|||
// One tagged token: a name plus key/value fields and the order to print them.
struct Token {
  std::string name;
  std::vector<std::string> order;  // field keys in output order
  std::unordered_map<std::string, std::string> members;  // key -> value

  explicit Token(const std::string& name) : name(name) {}

  // Records `key` at the end of the order list and stores its value.
  void Append(const std::string& key, const std::string& value) {
    order.push_back(key);
    members[key] = value;
  }

  // Renders the token as `name { key: "value" ... }`.
  // When `orders` has an entry for this token's name, that entry replaces
  // `order` before rendering; keys without a stored value are skipped.
  std::string String(
      const std::unordered_map<std::string, std::vector<std::string>>& orders) {
    const auto preset = orders.find(name);
    if (preset != orders.end()) {
      order = preset->second;
    }

    std::string output = name;
    output += " {";
    for (const auto& key : order) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " ";
      output += key;
      output += ": \"";
      output += member->second;
      output += "\"";
    }
    output += " }";
    return output;
  }
};
|||
|
|||
// Direction of processing, which determines the field-order table in use.
enum ParseType {
  kTN = 0,  // Text Normalization
  kITN = 1  // Inverse Text Normalization
};
|||
|
|||
class TokenParser { |
|||
public: |
|||
explicit TokenParser(ParseType type); |
|||
std::string Reorder(const std::string& input); |
|||
|
|||
private: |
|||
void Load(const std::string& input); |
|||
bool Read(); |
|||
bool ParseWs(); |
|||
bool ParseChar(const std::string& exp); |
|||
bool ParseChars(const std::string& exp); |
|||
std::string ParseKey(); |
|||
std::string ParseValue(); |
|||
void Parse(const std::string& input); |
|||
|
|||
int index_; |
|||
std::string ch_; |
|||
std::vector<std::string> text_; |
|||
std::vector<Token> tokens_; |
|||
std::unordered_map<std::string, std::vector<std::string>> orders_; |
|||
}; |
|||
|
|||
} // namespace wetext |
|||
|
|||
#endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_ |
@ -0,0 +1 @@ |
|||
|
@ -0,0 +1,23 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
|||
// |
|||
// Licensed under the Apache License, Version 2.0 (the "License"); |
|||
// you may not use this file except in compliance with the License. |
|||
// You may obtain a copy of the License at |
|||
// |
|||
// http://www.apache.org/licenses/LICENSE-2.0 |
|||
// |
|||
// Unless required by applicable law or agreed to in writing, software |
|||
// distributed under the License is distributed on an "AS IS" BASIS, |
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
// See the License for the specific language governing permissions and |
|||
// limitations under the License. |
|||
|
|||
#ifndef UTILS_WETEXT_FLAGS_H_ |
|||
#define UTILS_WETEXT_FLAGS_H_ |
|||
|
|||
// Because openfst is a dynamic library compiled with gflags/glog, we must use |
|||
// the gflags/glog from openfst to avoid them linked both statically and |
|||
// dynamically into the executable. |
|||
#include "../../../utils/flags.h" |
|||
|
|||
#endif // UTILS_WETEXT_FLAGS_H_ |
@ -0,0 +1,23 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
|||
// |
|||
// Licensed under the Apache License, Version 2.0 (the "License"); |
|||
// you may not use this file except in compliance with the License. |
|||
// You may obtain a copy of the License at |
|||
// |
|||
// http://www.apache.org/licenses/LICENSE-2.0 |
|||
// |
|||
// Unless required by applicable law or agreed to in writing, software |
|||
// distributed under the License is distributed on an "AS IS" BASIS, |
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
// See the License for the specific language governing permissions and |
|||
// limitations under the License. |
|||
|
|||
#ifndef UTILS_WETEXT_LOG_H_ |
|||
#define UTILS_WETEXT_LOG_H_ |
|||
|
|||
// Because openfst is a dynamic library compiled with gflags/glog, we must use |
|||
// the gflags/glog from openfst to avoid them linked both statically and |
|||
// dynamically into the executable. |
|||
#include "../../../utils/log.h" |
|||
|
|||
#endif // UTILS_WETEXT_LOG_H_ |
@ -0,0 +1,89 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
|
|||
//
|
|||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|||
// you may not use this file except in compliance with the License.
|
|||
// You may obtain a copy of the License at
|
|||
//
|
|||
// http://www.apache.org/licenses/LICENSE-2.0
|
|||
//
|
|||
// Unless required by applicable law or agreed to in writing, software
|
|||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
// See the License for the specific language governing permissions and
|
|||
// limitations under the License.
|
|||
|
|||
#include "wetext_string.h"
|
|||
|
|||
#include "wetext_log.h"
|
|||
|
|||
namespace wetext { |
|||
const char* WHITESPACE = " \n\r\t\f\v"; |
|||
|
|||
// Returns the byte length of the UTF-8 sequence whose first byte is `ch`,
// derived from the leading bit pattern. Bytes matching none of the lead-byte
// patterns are reported as length 1.
int UTF8CharLength(char ch) {
  CHECK_LE((ch & 0xF8), 0xF0);

  // 0xxxxxxx: the first 128 characters (US-ASCII) need a single byte.
  if ((ch & 0x80) == 0x00) {
    return 1;
  }
  // 110xxxxx: two bytes, covering the remainder of almost all Latin-script
  // alphabets.
  if ((ch & 0xE0) == 0xC0) {
    return 2;
  }
  // 1110xxxx: three bytes, covering the rest of the Basic Multilingual Plane,
  // including most Chinese, Japanese and Korean characters.
  if ((ch & 0xF0) == 0xE0) {
    return 3;
  }
  // 11110xxx: four bytes, covering the other Unicode planes (less common CJK
  // characters, historic scripts, mathematical symbols, emoji).
  if ((ch & 0xF8) == 0xF0) {
    return 4;
  }
  return 1;
}
|||
|
|||
int UTF8StringLength(const std::string& str) { |
|||
int len = 0; |
|||
int num_bytes = 1; |
|||
for (size_t i = 0; i < str.length(); i += num_bytes) { |
|||
num_bytes = UTF8CharLength(str[i]); |
|||
++len; |
|||
} |
|||
return len; |
|||
} |
|||
|
|||
void SplitUTF8StringToChars(const std::string& str, |
|||
std::vector<std::string>* chars) { |
|||
chars->clear(); |
|||
int num_bytes = 1; |
|||
for (size_t i = 0; i < str.length(); i += num_bytes) { |
|||
num_bytes = UTF8CharLength(str[i]); |
|||
chars->push_back(str.substr(i, num_bytes)); |
|||
} |
|||
} |
|||
|
|||
std::string Ltrim(const std::string& str) { |
|||
size_t start = str.find_first_not_of(WHITESPACE); |
|||
return (start == std::string::npos) ? "" : str.substr(start); |
|||
} |
|||
|
|||
std::string Rtrim(const std::string& str) { |
|||
size_t end = str.find_last_not_of(WHITESPACE); |
|||
return end == std::string::npos ? "" : str.substr(0, end + 1); |
|||
} |
|||
|
|||
// Strips WHITESPACE characters from both ends of `str`.
std::string Trim(const std::string& str) {
  const std::string left_trimmed = Ltrim(str);
  return Rtrim(left_trimmed);
}
|||
|
|||
// Splits `str` at every occurrence of `delim` and appends the pieces to
// `output` (existing elements are kept). Adjacent delimiters produce empty
// pieces, and the text after the last delimiter is always appended, so at
// least one element is added per call.
//
// An empty `delim` cannot split anything: `str` is appended as a single piece.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output) {
  if (delim.empty()) {
    // find("") always matches at the search start, which would never advance.
    output->emplace_back(str);
    return;
  }

  // Walk the string with a moving start offset instead of repeatedly erasing
  // the front of a copy.
  size_t begin = 0;
  size_t pos = 0;
  while ((pos = str.find(delim, begin)) != std::string::npos) {
    output->emplace_back(str.substr(begin, pos - begin));
    begin = pos + delim.length();
  }
  output->emplace_back(str.substr(begin));
}
|||
|
|||
} // namespace wetext
|
@ -0,0 +1,42 @@ |
|||
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) |
|||
// |
|||
// Licensed under the Apache License, Version 2.0 (the "License"); |
|||
// you may not use this file except in compliance with the License. |
|||
// You may obtain a copy of the License at |
|||
// |
|||
// http://www.apache.org/licenses/LICENSE-2.0 |
|||
// |
|||
// Unless required by applicable law or agreed to in writing, software |
|||
// distributed under the License is distributed on an "AS IS" BASIS, |
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
// See the License for the specific language governing permissions and |
|||
// limitations under the License. |
|||
|
|||
#ifndef UTILS_WETEXT_STRING_H_ |
|||
#define UTILS_WETEXT_STRING_H_ |
|||
|
|||
#include <string> |
|||
#include <vector> |
|||
|
|||
namespace wetext {

// Set of characters treated as whitespace by the trim helpers.
extern const char* WHITESPACE;

// Byte length of the UTF-8 sequence that starts with lead byte `ch`.
int UTF8CharLength(char ch);

// Number of UTF-8 characters in `str`.
int UTF8StringLength(const std::string& str);

// Replaces the contents of `chars` with the UTF-8 characters of `str`.
void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars);

// Copy of `str` without leading whitespace.
std::string Ltrim(const std::string& str);

// Copy of `str` without trailing whitespace.
std::string Rtrim(const std::string& str);

// Copy of `str` without leading or trailing whitespace.
std::string Trim(const std::string& str);

// Appends the pieces of `str` separated by `delim` to `output`.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output);

}  // namespace wetext
|||
|
|||
#endif // UTILS_WETEXT_STRING_H_ |