From 7313e112ff33747ea28bbeb37b004bd0d6dab0cf Mon Sep 17 00:00:00 2001
From: Administrator <xbirdman@126.com>
Date: Fri, 10 May 2024 16:23:48 +0800
Subject: [PATCH] =?UTF-8?q?2024.5.10-=E5=8E=BB=E9=99=A4ITN=E7=9A=84?=
 =?UTF-8?q?=E4=BE=9D=E8=B5=96=E6=9E=84=E5=BB=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CMakeLists.txt                                |   1 +
 cmake-linux/wetextprocessing.cmake            |  14 --
 post_processor/CMakeLists.txt                 |  23 ++-
 post_processor/wetext/bin/CMakeLists.txt      |   2 +
 post_processor/wetext/bin/processor_main.cc   |  54 +++++++
 .../wetext/processor/CMakeLists.txt           |  13 ++
 .../wetext/processor/wetext_processor.cc      |  79 +++++++++
 .../wetext/processor/wetext_processor.h       |  51 ++++++
 .../wetext/processor/wetext_token_parser.cc   | 153 ++++++++++++++++++
 .../wetext/processor/wetext_token_parser.h    |  91 +++++++++++
 post_processor/wetext/utils/CMakeLists.txt    |   1 +
 post_processor/wetext/utils/wetext_flags.h    |  23 +++
 post_processor/wetext/utils/wetext_log.h      |  23 +++
 post_processor/wetext/utils/wetext_string.cc  |  89 ++++++++++
 post_processor/wetext/utils/wetext_string.h   |  42 +++++
 15 files changed, 637 insertions(+), 22 deletions(-)
 delete mode 100644 cmake-linux/wetextprocessing.cmake
 create mode 100644 post_processor/wetext/bin/CMakeLists.txt
 create mode 100644 post_processor/wetext/bin/processor_main.cc
 create mode 100644 post_processor/wetext/processor/CMakeLists.txt
 create mode 100644 post_processor/wetext/processor/wetext_processor.cc
 create mode 100644 post_processor/wetext/processor/wetext_processor.h
 create mode 100644 post_processor/wetext/processor/wetext_token_parser.cc
 create mode 100644 post_processor/wetext/processor/wetext_token_parser.h
 create mode 100644 post_processor/wetext/utils/CMakeLists.txt
 create mode 100644 post_processor/wetext/utils/wetext_flags.h
 create mode 100644 post_processor/wetext/utils/wetext_log.h
 create mode 100644 post_processor/wetext/utils/wetext_string.cc
 create mode 100644 post_processor/wetext/utils/wetext_string.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 395a66e..2e72c1f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,6 +9,7 @@ set(BoldRed     "${Esc}[31m")
 # 运行参数
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -pthread -fPIC")
 set(LIB_BASE_DIR /root/projects/temp_xiaoke/asr_runtime/lib_files)
+set(third_party_libraries)
 option(GRPC "whether to build with gRPC" ON)
 option(ONNX "whether to build with ONNX" ON)
 option(ITN "whether to build with ITN" ON)
diff --git a/cmake-linux/wetextprocessing.cmake b/cmake-linux/wetextprocessing.cmake
deleted file mode 100644
index 28758d9..0000000
--- a/cmake-linux/wetextprocessing.cmake
+++ /dev/null
@@ -1,14 +0,0 @@
-set(wetext_BUILD_DIR "${LIB_BASE_DIR}/wetextprocessing-build")
-set(wetext_SOURCE_DIR "${LIB_BASE_DIR}/wetextprocessing-src")
-if(NOT EXISTS ${wetext_BUILD_DIR})
-    execute_process(COMMAND mkdir -p ${wetext_BUILD_DIR})
-    message(STATUS "${BoldGreen}Install wetextprocessing library${ColourReset}")
-    execute_process(
-            COMMAND cmake -B ${wetext_BUILD_DIR} -S ${wetext_SOURCE_DIR}/runtime -DCMAKE_BUILD_TYPE=Release -j4 -fPIC &&
-            cmake --build .
-            WORKING_DIRECTORY ${wetext_BUILD_DIR})
-endif ()
-
-include_directories(${wetext_SOURCE_DIR}/runtime)
-#add_subdirectory(${wetext_SOURCE_DIR}/runtime/utils)
-#add_subdirectory(${wetext_SOURCE_DIR}/runtime/processor)
\ No newline at end of file
diff --git a/post_processor/CMakeLists.txt b/post_processor/CMakeLists.txt
index 3fff4ca..0328d49 100644
--- a/post_processor/CMakeLists.txt
+++ b/post_processor/CMakeLists.txt
@@ -1,10 +1,17 @@
-add_library(post_processor STATIC
-  post_processor.cc
+message(STATUS "post_processor dir:${CMAKE_CURRENT_SOURCE_DIR}")
+add_library(wetext_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/wetext/utils/wetext_string.cc)  # sources are added under wetext/
+target_link_libraries(wetext_utils PUBLIC glog)
+
+add_library(wetext_processor STATIC
+        ${CMAKE_CURRENT_SOURCE_DIR}/wetext/processor/wetext_processor.cc
+        ${CMAKE_CURRENT_SOURCE_DIR}/wetext/processor/wetext_token_parser.cc
 )
-if(ITN)
-  include(wetextprocessing)
-  target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)
-else()
-  target_link_libraries(post_processor PUBLIC utils)
-endif()
+target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
+
+add_executable(processor_main ${CMAKE_CURRENT_SOURCE_DIR}/wetext/bin/processor_main.cc)
+target_link_libraries(processor_main PUBLIC wetext_processor)
 
+add_library(post_processor STATIC
+        post_processor.cc
+)
+target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)
diff --git a/post_processor/wetext/bin/CMakeLists.txt b/post_processor/wetext/bin/CMakeLists.txt
new file mode 100644
index 0000000..b06482e
--- /dev/null
+++ b/post_processor/wetext/bin/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(processor_main processor_main.cc)  # command-line driver for wetext::Processor
+target_link_libraries(processor_main PUBLIC wetext_processor)
diff --git a/post_processor/wetext/bin/processor_main.cc b/post_processor/wetext/bin/processor_main.cc
new file mode 100644
index 0000000..24add7f
--- /dev/null
+++ b/post_processor/wetext/bin/processor_main.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "../processor/wetext_processor.h"
+#include "../utils/wetext_flags.h"
+
+DEFINE_string(text, "", "input string");
+DEFINE_string(file, "", "input file");
+DEFINE_string(tagger, "", "tagger fst path");
+DEFINE_string(verbalizer, "", "verbalizer fst path");
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, false);
+  google::InitGoogleLogging(argv[0]);
+
+  if (FLAGS_tagger.empty() || FLAGS_verbalizer.empty()) {
+    LOG(FATAL) << "Please provide the tagger and verbalizer fst files.";
+  }
+  wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer);
+
+  // Prints the tagged form of `text`, then its verbalized form.
+  const auto process = [&processor](const std::string& text) {
+    const std::string tagged_text = processor.Tag(text);
+    std::cout << tagged_text << std::endl;
+    std::cout << processor.Verbalize(tagged_text) << std::endl;
+  };
+
+  if (!FLAGS_text.empty()) {
+    process(FLAGS_text);
+  }
+  if (!FLAGS_file.empty()) {
+    std::ifstream file(FLAGS_file);
+    // Report an unreadable input file instead of silently printing nothing.
+    if (!file.is_open()) LOG(FATAL) << "Cannot open file: " << FLAGS_file;
+    std::string line;
+    while (std::getline(file, line)) process(line);
+  }
+  return 0;
+}
diff --git a/post_processor/wetext/processor/CMakeLists.txt b/post_processor/wetext/processor/CMakeLists.txt
new file mode 100644
index 0000000..8e87ea2
--- /dev/null
+++ b/post_processor/wetext/processor/CMakeLists.txt
@@ -0,0 +1,13 @@
+add_library(wetext_processor STATIC
+        wetext_processor.cc
+        wetext_token_parser.cc
+)
+if(ANDROID)  # Android and MSVC builds link only fst and wetext_utils.
+  target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
+else()
+  if(MSVC)
+    target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
+  else()  # Every other toolchain additionally links dl.
+    target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
+  endif()
+endif()
diff --git a/post_processor/wetext/processor/wetext_processor.cc b/post_processor/wetext/processor/wetext_processor.cc
new file mode 100644
index 0000000..b096bac
--- /dev/null
+++ b/post_processor/wetext/processor/wetext_processor.cc
@@ -0,0 +1,79 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "wetext_processor.h"
+
+using fst::StringTokenType;
+
+namespace wetext {
+Processor::Processor(const std::string& tagger_path,
+                     const std::string& verbalizer_path) {
+  // StdVectorFst::Read() returns nullptr on failure; abort before first use.
+  tagger_.reset(StdVectorFst::Read(tagger_path));
+  if (!tagger_) LOG(FATAL) << "Failed to read tagger: " << tagger_path;
+  verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
+  if (!verbalizer_) LOG(FATAL) << "Failed to read: " << verbalizer_path;
+  compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
+  printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);
+  const bool is_tn = tagger_path.find("_tn_") != tagger_path.npos;
+  if (!is_tn && tagger_path.find("_itn_") == tagger_path.npos) {
+    LOG(FATAL) << "Invalid fst prefix, prefix should contain"
+               << " either \"_tn_\" or \"_itn_\".";
+  }
+  parse_type_ = is_tn ? ParseType::kTN : ParseType::kITN;
+}
+
+std::string Processor::ShortestPath(const StdVectorFst& lattice) {
+  // Extract the best path of the lattice and print it back to a string.
+  StdVectorFst best;
+  fst::ShortestPath(lattice, &best, 1, true);
+  std::string text;
+  (*printer_)(best, &text);
+  return text;
+}
+
+std::string Processor::Compose(const std::string& input,
+                               const StdVectorFst* fst) {
+  // Compile `input` into an FST, compose it with `fst`, decode the best path.
+  StdVectorFst acceptor;
+  (*compiler_)(input, &acceptor);
+  StdVectorFst composed;
+  fst::Compose(acceptor, *fst, &composed);
+  return ShortestPath(composed);
+}
+
+std::string Processor::Tag(const std::string& input) {
+  // An empty input is returned as an empty string without running the
+  // tagger; anything else is composed with the tagger FST.
+  const bool has_text = !input.empty();
+  return has_text ? Compose(input, tagger_.get()) : std::string();
+}
+
+std::string Processor::Verbalize(const std::string& input) {
+  if (input.empty()) return "";
+  // Put each token's fields into the order configured for parse_type_.
+  TokenParser parser(parse_type_);
+  const std::string reordered = parser.Reorder(input);
+  // Run the verbalizer FST, then strip any NUL bytes from its output.
+  std::string verbalized = Compose(reordered, verbalizer_.get());
+  const auto first_nul = std::remove(verbalized.begin(), verbalized.end(), '\0');
+  verbalized.erase(first_nul, verbalized.end());
+  return verbalized;
+}
+
+std::string Processor::Normalize(const std::string& input) {
+  return Verbalize(Tag(input));  // tag first, then verbalize the tagged text
+}
+
+}  // namespace wetext
diff --git a/post_processor/wetext/processor/wetext_processor.h b/post_processor/wetext/processor/wetext_processor.h
new file mode 100644
index 0000000..2397125
--- /dev/null
+++ b/post_processor/wetext/processor/wetext_processor.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PROCESSOR_WETEXT_PROCESSOR_H_
+#define PROCESSOR_WETEXT_PROCESSOR_H_
+
+#include <memory>
+#include <string>
+
+#include "fst/fstlib.h"
+
+#include "wetext_token_parser.h"
+
+using fst::StdArc;
+using fst::StdVectorFst;
+using fst::StringCompiler;
+using fst::StringPrinter;
+
+namespace wetext {
+class Processor {
+ public:
+  Processor(const std::string& tagger_path, const std::string& verbalizer_path);
+  std::string Tag(const std::string& input);        // input composed with the tagger FST
+  std::string Verbalize(const std::string& input);  // tagged text -> reorder -> verbalizer FST
+  std::string Normalize(const std::string& input);  // Verbalize(Tag(input))
+
+ private:
+  std::string ShortestPath(const StdVectorFst& lattice);
+  std::string Compose(const std::string& input, const StdVectorFst* fst);
+
+  ParseType parse_type_;  // chosen in the constructor from the tagger file name
+  std::shared_ptr<StdVectorFst> tagger_ = nullptr;
+  std::shared_ptr<StdVectorFst> verbalizer_ = nullptr;
+  std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr;  // string -> FST (BYTE tokens)
+  std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr;    // FST -> string (BYTE tokens)
+};
+
+}  // namespace wetext
+
+#endif  // PROCESSOR_WETEXT_PROCESSOR_H_
diff --git a/post_processor/wetext/processor/wetext_token_parser.cc b/post_processor/wetext/processor/wetext_token_parser.cc
new file mode 100644
index 0000000..f64f9f3
--- /dev/null
+++ b/post_processor/wetext/processor/wetext_token_parser.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "wetext_token_parser.h"
+
+#include "../utils/wetext_log.h"
+#include "../utils/wetext_string.h"
+
+namespace wetext {
+const char EOS[] = "<EOS>";  // sentinel held by ch_ once the input is exhausted
+const std::set<std::string> UTF8_WHITESPACE = {" ", "\t", "\n", "\r",
+                                               "\x0b", "\x0c"};  // \v and \f are separate one-byte entries
+const std::set<std::string> ASCII_LETTERS = {
+    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
+    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
+    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
+    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};
+const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
+    {"date", {"year", "month", "day"}},
+    {"fraction", {"denominator", "numerator"}},
+    {"measure", {"denominator", "numerator", "value"}},
+    {"money", {"value", "currency"}},
+    {"time", {"noon", "hour", "minute", "second"}}};  // token name -> field order (TN)
+const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
+    {"date", {"year", "month", "day"}},
+    {"fraction", {"sign", "numerator", "denominator"}},
+    {"measure", {"numerator", "denominator", "value"}},
+    {"money", {"currency", "value", "decimal"}},
+    {"time", {"hour", "minute", "second", "noon"}}};  // token name -> field order (ITN)
+
+TokenParser::TokenParser(ParseType type) {
+  // Select the field-order table for the requested mode: TN_ORDERS for
+  // kTN, ITN_ORDERS for any other value.
+  const bool is_tn = (type == ParseType::kTN);
+  const auto& table = is_tn ? TN_ORDERS : ITN_ORDERS;
+  orders_ = table;
+}
+
+void TokenParser::Load(const std::string& input) {
+  SplitUTF8StringToChars(input, &text_);
+  CHECK_GT(text_.size(), 0);  // an empty input is a fatal error
+  ch_ = text_.front();
+  index_ = 0;
+}
+
+bool TokenParser::Read() {
+  // Test `index_ + 1 < size`: `size() - 1` would wrap when text_ is empty.
+  if (static_cast<size_t>(index_) + 1 < text_.size()) {
+    ch_ = text_[++index_];
+    return true;
+  }
+  ch_ = EOS;
+  return false;
+}
+
+bool TokenParser::ParseWs() {
+  if (ch_ == EOS) return false;  // nothing left to read
+  while (ch_ == " ") {
+    if (!Read()) return false;
+  }
+  return true;  // cursor rests on a non-space character
+}
+
+bool TokenParser::ParseChar(const std::string& exp) {
+  // Consume the current character only when it equals `exp`; the return
+  // value tells the caller whether that happened.
+  const bool matched = (ch_ == exp);
+  if (matched) Read();
+  return matched;
+}
+
+bool TokenParser::ParseChars(const std::string& exp) {
+  // Offer every character of `exp` to ParseChar() in order; each one is
+  // optional. Returns true when at least one of them was consumed.
+  std::vector<std::string> expected;
+  SplitUTF8StringToChars(exp, &expected);
+  bool consumed_any = false;
+  for (const std::string& c : expected) consumed_any = ParseChar(c) || consumed_any;
+  return consumed_any;
+}
+
+std::string TokenParser::ParseKey() {
+  // Preconditions: input remains and ch_ is not listed in UTF8_WHITESPACE.
+  CHECK_NE(ch_, EOS);
+  CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);
+  // Collect the run of characters listed in ASCII_LETTERS (letters and '_').
+  std::string name;
+  for (; ASCII_LETTERS.find(ch_) != ASCII_LETTERS.end(); Read()) {
+    name.append(ch_);
+  }
+  return name;
+}
+
+std::string TokenParser::ParseValue() {
+  CHECK_NE(ch_, EOS);
+  std::string value = "";
+  // Stop at the closing quote, or at EOS so an unterminated value cannot loop.
+  while (ch_ != "\"" && ch_ != EOS) {
+    // A backslash escapes the next character (flag is reset every iteration).
+    const bool escape = (ch_ == "\\");
+    value += ch_;
+    Read();
+    if (escape && ch_ != EOS) {
+      value += ch_;
+      Read();
+    }
+  }
+  return value;
+}
+
+void TokenParser::Parse(const std::string& input) {
+  // Start from a clean slate so a parser instance can be reused.
+  tokens_.clear();
+  Load(input);
+  while (ParseWs()) {
+    std::string name = ParseKey();
+    ParseChars(" { ");
+
+    Token token(name);
+    while (ParseWs()) {
+      // A closing brace ends the current token.
+      if (ParseChar("}")) break;
+      std::string key = ParseKey();
+      ParseChars(": \"");
+      std::string value = ParseValue();
+      ParseChar("\"");
+      token.Append(key, value);
+    }
+    tokens_.emplace_back(token);
+  }
+}
+
+std::string TokenParser::Reorder(const std::string& input) {
+  // Parse `input` into tokens_, print every token with its fields in the
+  // configured order, and strip the surrounding whitespace of the result.
+  Parse(input);
+  std::string joined;
+  for (Token& token : tokens_) joined += token.String(orders_) + " ";
+  return Trim(joined);
+}
+
+}  // namespace wetext
diff --git a/post_processor/wetext/processor/wetext_token_parser.h b/post_processor/wetext/processor/wetext_token_parser.h
new file mode 100644
index 0000000..766ea7a
--- /dev/null
+++ b/post_processor/wetext/processor/wetext_token_parser.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_
+#define PROCESSOR_WETEXT_TOKEN_PARSER_H_
+
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace wetext {
+
+extern const char EOS[];
+extern const std::set<std::string> UTF8_WHITESPACE;
+extern const std::set<std::string> ASCII_LETTERS;
+extern const std::unordered_map<std::string, std::vector<std::string>>
+    TN_ORDERS;
+extern const std::unordered_map<std::string, std::vector<std::string>>
+    ITN_ORDERS;
+
+struct Token {
+  std::string name;
+  std::vector<std::string> order;  // keys in the order they will be printed
+  std::unordered_map<std::string, std::string> members;
+
+  explicit Token(const std::string& name) : name(name) {}
+
+  // Stores `value` under `key` and remembers the order keys were added in.
+  void Append(const std::string& key, const std::string& value) {
+    order.push_back(key);
+    members[key] = value;
+  }
+
+  // Serializes as `name { key: "value" ... }`. When `orders` has an entry for
+  // this token's name it replaces `order`; keys without a value are skipped.
+  std::string String(
+      const std::unordered_map<std::string, std::vector<std::string>>& orders) {
+    const auto canonical = orders.find(name);
+    if (canonical != orders.end()) order = canonical->second;
+    std::string output = name + " {";
+    for (const auto& key : order) {
+      const auto member = members.find(key);
+      if (member == members.end()) continue;
+      output += " " + key + ": \"" + member->second + "\"";
+    }
+    return output + " }";
+  }
+};
+
+enum ParseType {  // selects TN_ORDERS or ITN_ORDERS in TokenParser
+  kTN = 0x00,  // Text Normalization
+  kITN = 0x01  // Inverse Text Normalization
+};
+
+class TokenParser {
+ public:
+  explicit TokenParser(ParseType type);
+  std::string Reorder(const std::string& input);  // parse, then reserialize
+
+ private:
+  void Load(const std::string& input);
+  bool Read();
+  bool ParseWs();
+  bool ParseChar(const std::string& exp);
+  bool ParseChars(const std::string& exp);
+  std::string ParseKey();
+  std::string ParseValue();
+  void Parse(const std::string& input);
+
+  int index_ = 0;                  // position of ch_ within text_ (was uninitialized)
+  std::string ch_;                 // current character, or EOS at end of input
+  std::vector<std::string> text_;  // input split into UTF-8 characters
+  std::vector<Token> tokens_;      // tokens produced by Parse()
+  std::unordered_map<std::string, std::vector<std::string>> orders_;
+};
+
+}  // namespace wetext
+
+#endif  // PROCESSOR_WETEXT_TOKEN_PARSER_H_
diff --git a/post_processor/wetext/utils/CMakeLists.txt b/post_processor/wetext/utils/CMakeLists.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/post_processor/wetext/utils/CMakeLists.txt
@@ -0,0 +1 @@
+
diff --git a/post_processor/wetext/utils/wetext_flags.h b/post_processor/wetext/utils/wetext_flags.h
new file mode 100644
index 0000000..0bea9c7
--- /dev/null
+++ b/post_processor/wetext/utils/wetext_flags.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTILS_WETEXT_FLAGS_H_
+#define UTILS_WETEXT_FLAGS_H_
+
+// Because openfst is a dynamic library compiled with gflags/glog, we must use
+// the gflags/glog from openfst to avoid them linked both statically and
+// dynamically into the executable.
+#include "../../../utils/flags.h"
+
+#endif  // UTILS_WETEXT_FLAGS_H_
diff --git a/post_processor/wetext/utils/wetext_log.h b/post_processor/wetext/utils/wetext_log.h
new file mode 100644
index 0000000..be5b804
--- /dev/null
+++ b/post_processor/wetext/utils/wetext_log.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTILS_WETEXT_LOG_H_
+#define UTILS_WETEXT_LOG_H_
+
+// Because openfst is a dynamic library compiled with gflags/glog, we must use
+// the gflags/glog from openfst to avoid them linked both statically and
+// dynamically into the executable.
+#include "../../../utils/log.h"
+
+#endif  // UTILS_WETEXT_LOG_H_
diff --git a/post_processor/wetext/utils/wetext_string.cc b/post_processor/wetext/utils/wetext_string.cc
new file mode 100644
index 0000000..65bed35
--- /dev/null
+++ b/post_processor/wetext/utils/wetext_string.cc
@@ -0,0 +1,89 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "wetext_string.h"
+
+#include "wetext_log.h"
+
+namespace wetext {
+const char* WHITESPACE = " \n\r\t\f\v";
+
+int UTF8CharLength(char ch) {
+  // Lead bytes 0xF8..0xFF are not valid UTF-8 and fail this check.
+  CHECK_LE((ch & 0xF8), 0xF0);
+  if ((ch & 0x80) == 0x00) {
+    // 0xxxxxxx: one byte, the first 128 characters (US-ASCII).
+    return 1;
+  }
+  if ((ch & 0xE0) == 0xC0) {
+    // 110xxxxx: two bytes, the remainder of almost all Latin-script alphabets.
+    return 2;
+  }
+  if ((ch & 0xF0) == 0xE0) {
+    // 1110xxxx: three bytes, the rest of the Basic Multilingual Plane
+    // (most Chinese, Japanese and Korean characters).
+    return 3;
+  }
+  if ((ch & 0xF8) == 0xF0) {
+    // 11110xxx: four bytes, the other planes of Unicode (e.g. emoji).
+    return 4;
+  }
+  // Any other byte (10xxxxxx) is treated as a single-byte character.
+  return 1;
+}
+
+int UTF8StringLength(const std::string& str) {
+  // Count characters by hopping from one lead byte to the next; each hop
+  // is as long as UTF8CharLength() reports for the byte at `offset`.
+  int count = 0;
+  for (size_t offset = 0; offset < str.size(); ++count) {
+    offset += UTF8CharLength(str[offset]);
+  }
+  return count;
+}
+
+void SplitUTF8StringToChars(const std::string& str,
+                            std::vector<std::string>* chars) {
+  chars->clear();  // the output is replaced, not appended to
+  for (size_t offset = 0; offset < str.size();) {
+    const int width = UTF8CharLength(str[offset]);
+    chars->push_back(str.substr(offset, width));
+    offset += width;
+  }
+}
+
+std::string Ltrim(const std::string& str) {
+  const size_t first = str.find_first_not_of(WHITESPACE);  // npos: all blank
+  return first != std::string::npos ? str.substr(first) : std::string();
+}
+
+std::string Rtrim(const std::string& str) {
+  const size_t last = str.find_last_not_of(WHITESPACE);  // npos: all blank
+  return last != std::string::npos ? str.substr(0, last + 1) : std::string();
+}
+
+std::string Trim(const std::string& str) { return Rtrim(Ltrim(str)); }  // strips both ends
+
+void Split(const std::string& str, const std::string& delim,
+           std::vector<std::string>* output) {
+  // Appends the pieces of `str`; an empty `delim` (never advances) = no split.
+  size_t start = 0, pos = delim.empty() ? std::string::npos : str.find(delim);
+  for (; pos != std::string::npos; pos = str.find(delim, start)) {
+    output->emplace_back(str.substr(start, pos - start));
+    start = pos + delim.length();
+  }
+  output->emplace_back(str.substr(start));
+}
+
+}  // namespace wetext
diff --git a/post_processor/wetext/utils/wetext_string.h b/post_processor/wetext/utils/wetext_string.h
new file mode 100644
index 0000000..ae890d6
--- /dev/null
+++ b/post_processor/wetext/utils/wetext_string.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTILS_WETEXT_STRING_H_
+#define UTILS_WETEXT_STRING_H_
+
+#include <string>
+#include <vector>
+
+namespace wetext {
+extern const char* WHITESPACE;
+
+int UTF8CharLength(char ch);
+
+int UTF8StringLength(const std::string& str);
+
+void SplitUTF8StringToChars(const std::string& str,
+                            std::vector<std::string>* chars);
+
+std::string Ltrim(const std::string& str);
+
+std::string Rtrim(const std::string& str);
+
+std::string Trim(const std::string& str);
+
+void Split(const std::string& str, const std::string& delim,
+           std::vector<std::string>* output);
+
+}  // namespace wetext
+
+#endif  // UTILS_WETEXT_STRING_H_