From 7313e112ff33747ea28bbeb37b004bd0d6dab0cf Mon Sep 17 00:00:00 2001 From: Administrator Date: Fri, 10 May 2024 16:23:48 +0800 Subject: [PATCH] =?UTF-8?q?2024.5.10-=E5=8E=BB=E9=99=A4ITN=E7=9A=84?= =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=9E=84=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 1 + cmake-linux/wetextprocessing.cmake | 14 -- post_processor/CMakeLists.txt | 23 ++- post_processor/wetext/bin/CMakeLists.txt | 2 + post_processor/wetext/bin/processor_main.cc | 54 +++++++ .../wetext/processor/CMakeLists.txt | 13 ++ .../wetext/processor/wetext_processor.cc | 79 +++++++++ .../wetext/processor/wetext_processor.h | 51 ++++++ .../wetext/processor/wetext_token_parser.cc | 153 ++++++++++++++++++ .../wetext/processor/wetext_token_parser.h | 91 +++++++++++ post_processor/wetext/utils/CMakeLists.txt | 1 + post_processor/wetext/utils/wetext_flags.h | 23 +++ post_processor/wetext/utils/wetext_log.h | 23 +++ post_processor/wetext/utils/wetext_string.cc | 89 ++++++++++ post_processor/wetext/utils/wetext_string.h | 42 +++++ 15 files changed, 637 insertions(+), 22 deletions(-) delete mode 100644 cmake-linux/wetextprocessing.cmake create mode 100644 post_processor/wetext/bin/CMakeLists.txt create mode 100644 post_processor/wetext/bin/processor_main.cc create mode 100644 post_processor/wetext/processor/CMakeLists.txt create mode 100644 post_processor/wetext/processor/wetext_processor.cc create mode 100644 post_processor/wetext/processor/wetext_processor.h create mode 100644 post_processor/wetext/processor/wetext_token_parser.cc create mode 100644 post_processor/wetext/processor/wetext_token_parser.h create mode 100644 post_processor/wetext/utils/CMakeLists.txt create mode 100644 post_processor/wetext/utils/wetext_flags.h create mode 100644 post_processor/wetext/utils/wetext_log.h create mode 100644 post_processor/wetext/utils/wetext_string.cc create mode 100644 post_processor/wetext/utils/wetext_string.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 395a66e..2e72c1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ set(BoldRed "${Esc}[31m") # 运行参数 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -pthread -fPIC") set(LIB_BASE_DIR /root/projects/temp_xiaoke/asr_runtime/lib_files) +set(third_party_libraries) option(GRPC "whether to build with gRPC" ON) option(ONNX "whether to build with ONNX" ON) option(ITN "whether to build with ITN" ON) diff --git a/cmake-linux/wetextprocessing.cmake b/cmake-linux/wetextprocessing.cmake deleted file mode 100644 index 28758d9..0000000 --- a/cmake-linux/wetextprocessing.cmake +++ /dev/null @@ -1,14 +0,0 @@ -set(wetext_BUILD_DIR "${LIB_BASE_DIR}/wetextprocessing-build") -set(wetext_SOURCE_DIR "${LIB_BASE_DIR}/wetextprocessing-src") -if(NOT EXISTS ${wetext_BUILD_DIR}) - execute_process(COMMAND mkdir -p ${wetext_BUILD_DIR}) - message(STATUS "${BoldGreen}Install wetextprocessing library${ColourReset}") - execute_process( - COMMAND cmake -B ${wetext_BUILD_DIR} -S ${wetext_SOURCE_DIR}/runtime -DCMAKE_BUILD_TYPE=Release -j4 -fPIC && - cmake --build . - WORKING_DIRECTORY ${wetext_BUILD_DIR}) -endif () - -include_directories(${wetext_SOURCE_DIR}/runtime) -#add_subdirectory(${wetext_SOURCE_DIR}/runtime/utils) -#add_subdirectory(${wetext_SOURCE_DIR}/runtime/processor) \ No newline at end of file diff --git a/post_processor/CMakeLists.txt b/post_processor/CMakeLists.txt index 3fff4ca..0328d49 100644 --- a/post_processor/CMakeLists.txt +++ b/post_processor/CMakeLists.txt @@ -1,10 +1,17 @@ -add_library(post_processor STATIC - post_processor.cc +message(STATUS "post_processor dir:${CMAKE_CURRENT_SOURCE_DIR}") +add_library(wetext_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/utils/wetext_string.cc) +target_link_libraries(wetext_utils PUBLIC glog) + +add_library(wetext_processor STATIC + ${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_processor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_token_parser.cc ) -if(ITN) - include(wetextprocessing) - target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils) -else() - target_link_libraries(post_processor PUBLIC utils) -endif() +target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils) + +add_executable(processor_main processor_main.cc) +target_link_libraries(processor_main PUBLIC wetext_processor) +add_library(post_processor STATIC + post_processor.cc +) +target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils) diff --git a/post_processor/wetext/bin/CMakeLists.txt b/post_processor/wetext/bin/CMakeLists.txt new file mode 100644 index 0000000..b06482e --- /dev/null +++ b/post_processor/wetext/bin/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(processor_main processor_main.cc) +target_link_libraries(processor_main PUBLIC wetext_processor) diff --git a/post_processor/wetext/bin/processor_main.cc b/post_processor/wetext/bin/processor_main.cc new file mode 100644 index 0000000..24add7f --- /dev/null +++ b/post_processor/wetext/bin/processor_main.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "../processor/wetext_processor.h" +#include "../utils/wetext_flags.h" + +DEFINE_string(text, "", "input string"); +DEFINE_string(file, "", "input file"); +DEFINE_string(tagger, "", "tagger fst path"); +DEFINE_string(verbalizer, "", "verbalizer fst path"); + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + if (FLAGS_tagger.empty() || FLAGS_verbalizer.empty()) { + LOG(FATAL) << "Please provide the tagger and verbalizer fst files."; + } + wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer); + + if (!FLAGS_text.empty()) { + std::string tagged_text = processor.Tag(FLAGS_text); + std::cout << tagged_text << std::endl; + std::string normalized_text = processor.Verbalize(tagged_text); + std::cout << normalized_text << std::endl; + } + + if (!FLAGS_file.empty()) { + std::ifstream file(FLAGS_file); + std::string line; + while (getline(file, line)) { + std::string tagged_text = processor.Tag(line); + std::cout << tagged_text << std::endl; + std::string normalized_text = processor.Verbalize(tagged_text); + std::cout << normalized_text << std::endl; + } + } + return 0; +} diff --git a/post_processor/wetext/processor/CMakeLists.txt b/post_processor/wetext/processor/CMakeLists.txt new file mode 100644 index 0000000..8e87ea2 --- /dev/null +++ b/post_processor/wetext/processor/CMakeLists.txt @@ -0,0 +1,13 @@ +add_library(wetext_processor STATIC + wetext_processor.cc + wetext_token_parser.cc +) +if(ANDROID) + target_link_libraries(wetext_processor PUBLIC fst wetext_utils) +else() + if(MSVC) + target_link_libraries(wetext_processor PUBLIC fst wetext_utils) + else() + target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils) + endif() +endif() diff --git a/post_processor/wetext/processor/wetext_processor.cc b/post_processor/wetext/processor/wetext_processor.cc new file mode 100644 index 0000000..b096bac --- /dev/null +++ b/post_processor/wetext/processor/wetext_processor.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "wetext_processor.h" + +using fst::StringTokenType; + +namespace wetext { +Processor::Processor(const std::string& tagger_path, + const std::string& verbalizer_path) { + tagger_.reset(StdVectorFst::Read(tagger_path)); + verbalizer_.reset(StdVectorFst::Read(verbalizer_path)); + compiler_ = std::make_shared>(StringTokenType::BYTE); + printer_ = std::make_shared>(StringTokenType::BYTE); + + if (tagger_path.find("_tn_") != tagger_path.npos) { + parse_type_ = ParseType::kTN; + } else if (tagger_path.find("_itn_") != tagger_path.npos) { + parse_type_ = ParseType::kITN; + } else { + LOG(FATAL) << "Invalid fst prefix, prefix should contain" + << " either \"_tn_\" or \"_itn_\"."; + } +} + +std::string Processor::ShortestPath(const StdVectorFst& lattice) { + StdVectorFst shortest_path; + fst::ShortestPath(lattice, &shortest_path, 1, true); + + std::string output; + printer_->operator()(shortest_path, &output); + return output; +} + +std::string Processor::Compose(const std::string& input, + const StdVectorFst* fst) { + StdVectorFst input_fst; + compiler_->operator()(input, &input_fst); + + StdVectorFst lattice; + fst::Compose(input_fst, *fst, &lattice); + return ShortestPath(lattice); +} + +std::string Processor::Tag(const std::string& input) { + if (input.empty()) { + return ""; + } + return Compose(input, tagger_.get()); +} + +std::string Processor::Verbalize(const std::string& input) { + if (input.empty()) { + return ""; + } + TokenParser parser(parse_type_); + std::string output = parser.Reorder(input); + + output = Compose(output, verbalizer_.get()); + output.erase(std::remove(output.begin(), output.end(), '\0'), output.end()); + return output; +} + +std::string Processor::Normalize(const std::string& input) { + return Verbalize(Tag(input)); +} + +} // namespace wetext diff --git a/post_processor/wetext/processor/wetext_processor.h b/post_processor/wetext/processor/wetext_processor.h new file mode 100644 index 0000000..2397125 --- /dev/null +++ b/post_processor/wetext/processor/wetext_processor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PROCESSOR_WETEXT_PROCESSOR_H_ +#define PROCESSOR_WETEXT_PROCESSOR_H_ + +#include +#include + +#include "fst/fstlib.h" + +#include "wetext_token_parser.h" + +using fst::StdArc; +using fst::StdVectorFst; +using fst::StringCompiler; +using fst::StringPrinter; + +namespace wetext { +class Processor { + public: + Processor(const std::string& tagger_path, const std::string& verbalizer_path); + std::string Tag(const std::string& input); + std::string Verbalize(const std::string& input); + std::string Normalize(const std::string& input); + + private: + std::string ShortestPath(const StdVectorFst& lattice); + std::string Compose(const std::string& input, const StdVectorFst* fst); + + ParseType parse_type_; + std::shared_ptr tagger_ = nullptr; + std::shared_ptr verbalizer_ = nullptr; + std::shared_ptr> compiler_ = nullptr; + std::shared_ptr> printer_ = nullptr; +}; + +} // namespace wetext + +#endif // PROCESSOR_WETEXT_PROCESSOR_H_ diff --git a/post_processor/wetext/processor/wetext_token_parser.cc b/post_processor/wetext/processor/wetext_token_parser.cc new file mode 100644 index 0000000..f64f9f3 --- /dev/null +++ b/post_processor/wetext/processor/wetext_token_parser.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "wetext_token_parser.h" + +#include "../utils/wetext_log.h" +#include "../utils/wetext_string.h" + +namespace wetext { +const char EOS[] = ""; +const std::set UTF8_WHITESPACE = {" ", "\t", "\n", "\r", + "\x0b\x0c"}; +const std::set ASCII_LETTERS = { + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", + "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", + "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", + "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"}; +const std::unordered_map> TN_ORDERS = { + {"date", {"year", "month", "day"}}, + {"fraction", {"denominator", "numerator"}}, + {"measure", {"denominator", "numerator", "value"}}, + {"money", {"value", "currency"}}, + {"time", {"noon", "hour", "minute", "second"}}}; +const std::unordered_map> ITN_ORDERS = { + {"date", {"year", "month", "day"}}, + {"fraction", {"sign", "numerator", "denominator"}}, + {"measure", {"numerator", "denominator", "value"}}, + {"money", {"currency", "value", "decimal"}}, + {"time", {"hour", "minute", "second", "noon"}}}; + +TokenParser::TokenParser(ParseType type) { + if (type == ParseType::kTN) { + orders_ = TN_ORDERS; + } else { + orders_ = ITN_ORDERS; + } +} + +void TokenParser::Load(const std::string& input) { + wetext::SplitUTF8StringToChars(input, &text_); + CHECK_GT(text_.size(), 0); + index_ = 0; + ch_ = text_[0]; +} + +bool TokenParser::Read() { + if (index_ < text_.size() - 1) { + index_ += 1; + ch_ = text_[index_]; + return true; + } + ch_ = EOS; + return false; +} + +bool TokenParser::ParseWs() { + bool not_eos = ch_ != EOS; + while (not_eos && ch_ == " ") { + not_eos = Read(); + } + return not_eos; +} + +bool TokenParser::ParseChar(const std::string& exp) { + if (ch_ == exp) { + Read(); + return true; + } + return false; +} + +bool TokenParser::ParseChars(const std::string& exp) { + bool ok = false; + std::vector chars; + wetext::SplitUTF8StringToChars(exp, &chars); + for (const auto& x : chars) { + ok |= ParseChar(x); + } + return ok; +} + +std::string TokenParser::ParseKey() { + CHECK_NE(ch_, EOS); + CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0); + + std::string key = ""; + while (ASCII_LETTERS.count(ch_) > 0) { + key += ch_; + Read(); + } + return key; +} + +std::string TokenParser::ParseValue() { + CHECK_NE(ch_, EOS); + bool escape = false; + + std::string value = ""; + while (ch_ != "\"") { + value += ch_; + escape = ch_ == "\\" && !escape; + Read(); + if (escape) { + value += ch_; + Read(); + } + } + return value; +} + +void TokenParser::Parse(const std::string& input) { + Load(input); + while (ParseWs()) { + std::string name = ParseKey(); + ParseChars(" { "); + + Token token(name); + while (ParseWs()) { + if (ch_ == "}") { + ParseChar("}"); + break; + } + std::string key = ParseKey(); + ParseChars(": \""); + std::string value = ParseValue(); + ParseChar("\""); + token.Append(key, value); + } + tokens_.emplace_back(token); + } +} + +std::string TokenParser::Reorder(const std::string& input) { + Parse(input); + std::string output = ""; + for (auto& token : tokens_) { + output += token.String(orders_) + " "; + } + return Trim(output); +} + +} // namespace wetext diff --git a/post_processor/wetext/processor/wetext_token_parser.h b/post_processor/wetext/processor/wetext_token_parser.h new file mode 100644 index 0000000..766ea7a --- /dev/null +++ b/post_processor/wetext/processor/wetext_token_parser.h @@ -0,0 +1,91 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_ +#define PROCESSOR_WETEXT_TOKEN_PARSER_H_ + +#include +#include +#include +#include + +namespace wetext { + +extern const char EOS[]; +extern const std::set UTF8_WHITESPACE; +extern const std::set ASCII_LETTERS; +extern const std::unordered_map> + TN_ORDERS; +extern const std::unordered_map> + ITN_ORDERS; + +struct Token { + std::string name; + std::vector order; + std::unordered_map members; + + explicit Token(const std::string& name) : name(name) {} + + void Append(const std::string& key, const std::string& value) { + order.emplace_back(key); + members[key] = value; + } + + std::string String( + const std::unordered_map>& orders) { + std::string output = name + " {"; + if (orders.count(name) > 0) { + order = orders.at(name); + } + + for (const auto& key : order) { + if (members.count(key) == 0) { + continue; + } + output += " " + key + ": \"" + members[key] + "\""; + } + return output + " }"; + } +}; + +enum ParseType { + kTN = 0x00, // Text Normalization + kITN = 0x01 // Inverse Text Normalization +}; + +class TokenParser { + public: + explicit TokenParser(ParseType type); + std::string Reorder(const std::string& input); + + private: + void Load(const std::string& input); + bool Read(); + bool ParseWs(); + bool ParseChar(const std::string& exp); + bool ParseChars(const std::string& exp); + std::string ParseKey(); + std::string ParseValue(); + void Parse(const std::string& input); + + int index_; + std::string ch_; + std::vector text_; + std::vector tokens_; + std::unordered_map> orders_; +}; + +} // namespace wetext + +#endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_ diff --git a/post_processor/wetext/utils/CMakeLists.txt b/post_processor/wetext/utils/CMakeLists.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/post_processor/wetext/utils/CMakeLists.txt @@ -0,0 +1 @@ + diff --git a/post_processor/wetext/utils/wetext_flags.h b/post_processor/wetext/utils/wetext_flags.h new file mode 100644 index 0000000..0bea9c7 --- /dev/null +++ b/post_processor/wetext/utils/wetext_flags.h @@ -0,0 +1,23 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTILS_WETEXT_FLAGS_H_ +#define UTILS_WETEXT_FLAGS_H_ + +// Because openfst is a dynamic library compiled with gflags/glog, we must use +// the gflags/glog from openfst to avoid them linked both statically and +// dynamically into the executable. +#include "../../../utils/flags.h" + +#endif // UTILS_WETEXT_FLAGS_H_ diff --git a/post_processor/wetext/utils/wetext_log.h b/post_processor/wetext/utils/wetext_log.h new file mode 100644 index 0000000..be5b804 --- /dev/null +++ b/post_processor/wetext/utils/wetext_log.h @@ -0,0 +1,23 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTILS_WETEXT_LOG_H_ +#define UTILS_WETEXT_LOG_H_ + +// Because openfst is a dynamic library compiled with gflags/glog, we must use +// the gflags/glog from openfst to avoid them linked both statically and +// dynamically into the executable. +#include "../../../utils/log.h" + +#endif // UTILS_WETEXT_LOG_H_ diff --git a/post_processor/wetext/utils/wetext_string.cc b/post_processor/wetext/utils/wetext_string.cc new file mode 100644 index 0000000..65bed35 --- /dev/null +++ b/post_processor/wetext/utils/wetext_string.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "wetext_string.h" + +#include "wetext_log.h" + +namespace wetext { +const char* WHITESPACE = " \n\r\t\f\v"; + +int UTF8CharLength(char ch) { + int num_bytes = 1; + CHECK_LE((ch & 0xF8), 0xF0); + if ((ch & 0x80) == 0x00) { + // The first 128 characters (US-ASCII) in UTF-8 format only need one byte. + num_bytes = 1; + } else if ((ch & 0xE0) == 0xC0) { + // The next 1,920 characters need two bytes to encode, + // which covers the remainder of almost all Latin-script alphabets. + num_bytes = 2; + } else if ((ch & 0xF0) == 0xE0) { + // Three bytes are needed for characters in the rest of + // the Basic Multilingual Plane, which contains virtually all characters + // in common use, including most Chinese, Japanese and Korean characters. + num_bytes = 3; + } else if ((ch & 0xF8) == 0xF0) { + // Four bytes are needed for characters in the other planes of Unicode, + // which include less common CJK characters, various historic scripts, + // mathematical symbols, and emoji (pictographic symbols). + num_bytes = 4; + } + return num_bytes; +} + +int UTF8StringLength(const std::string& str) { + int len = 0; + int num_bytes = 1; + for (size_t i = 0; i < str.length(); i += num_bytes) { + num_bytes = UTF8CharLength(str[i]); + ++len; + } + return len; +} + +void SplitUTF8StringToChars(const std::string& str, + std::vector* chars) { + chars->clear(); + int num_bytes = 1; + for (size_t i = 0; i < str.length(); i += num_bytes) { + num_bytes = UTF8CharLength(str[i]); + chars->push_back(str.substr(i, num_bytes)); + } +} + +std::string Ltrim(const std::string& str) { + size_t start = str.find_first_not_of(WHITESPACE); + return (start == std::string::npos) ? "" : str.substr(start); +} + +std::string Rtrim(const std::string& str) { + size_t end = str.find_last_not_of(WHITESPACE); + return end == std::string::npos ? "" : str.substr(0, end + 1); +} + +std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); } + +void Split(const std::string& str, const std::string& delim, + std::vector* output) { + std::string s = str; + size_t pos = 0; + while ((pos = s.find(delim)) != std::string::npos) { + output->emplace_back(s.substr(0, pos)); + s.erase(0, pos + delim.length()); + } + output->emplace_back(s); +} + +} // namespace wetext diff --git a/post_processor/wetext/utils/wetext_string.h b/post_processor/wetext/utils/wetext_string.h new file mode 100644 index 0000000..ae890d6 --- /dev/null +++ b/post_processor/wetext/utils/wetext_string.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTILS_WETEXT_STRING_H_ +#define UTILS_WETEXT_STRING_H_ + +#include +#include + +namespace wetext { +extern const char* WHITESPACE; + +int UTF8CharLength(char ch); + +int UTF8StringLength(const std::string& str); + +void SplitUTF8StringToChars(const std::string& str, + std::vector* chars); + +std::string Ltrim(const std::string& str); + +std::string Rtrim(const std::string& str); + +std::string Trim(const std::string& str); + +void Split(const std::string& str, const std::string& delim, + std::vector* output); + +} // namespace wetext + +#endif // UTILS_WETEXT_STRING_H_