2024.5.10-去除ITN的依赖构建

1 year ago · 7313e112ff
15 changed files with 637 additions and 22 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -9,6 +9,7 @@ set(BoldRed     "${Esc}[31m")
 # Build parameters: compiler flags, third-party library location, feature switches.
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -pthread -fPIC")
 # Base directory under which third-party library sources/builds are looked up
 # (hard-coded absolute path).
 set(LIB_BASE_DIR /root/projects/temp_xiaoke/asr_runtime/lib_files)
 # set() with no value leaves the variable unset, i.e. the list starts out empty.
 set(third_party_libraries)
 option(GRPC "whether to build with gRPC" ON)
 option(ONNX "whether to build with ONNX" ON)
 option(ITN "whether to build with ITN" ON)
--- a/cmake-linux/wetextprocessing.cmake
+++ b/cmake-linux/wetextprocessing.cmake
@ -1,14 +0,0 @@
 # Locations of the wetextprocessing checkout and of its out-of-tree build directory.
 set(wetext_BUILD_DIR "${LIB_BASE_DIR}/wetextprocessing-build")
 set(wetext_SOURCE_DIR "${LIB_BASE_DIR}/wetextprocessing-src")

 # Configure and build wetextprocessing the first time the build directory is missing.
 if(NOT EXISTS "${wetext_BUILD_DIR}")
    file(MAKE_DIRECTORY "${wetext_BUILD_DIR}")
    message(STATUS "${BoldGreen}Install wetextprocessing library${ColourReset}")

    # execute_process() does not go through a shell, so "&&" cannot chain
    # commands: configure and build are issued as two separate invocations and
    # each exit code is checked. Position-independent code and parallelism are
    # requested through the options CMake actually understands.
    execute_process(
            COMMAND ${CMAKE_COMMAND}
                    -S "${wetext_SOURCE_DIR}/runtime"
                    -B "${wetext_BUILD_DIR}"
                    -DCMAKE_BUILD_TYPE=Release
                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
            RESULT_VARIABLE wetext_configure_result)
    if(NOT wetext_configure_result EQUAL 0)
        message(FATAL_ERROR "Configuring wetextprocessing failed: ${wetext_configure_result}")
    endif()

    execute_process(
            COMMAND ${CMAKE_COMMAND} --build "${wetext_BUILD_DIR}" --parallel 4
            RESULT_VARIABLE wetext_build_result)
    if(NOT wetext_build_result EQUAL 0)
        message(FATAL_ERROR "Building wetextprocessing failed: ${wetext_build_result}")
    endif()
 endif()

 # Make the wetextprocessing runtime headers visible to targets defined after this include.
 include_directories(${wetext_SOURCE_DIR}/runtime)
 #add_subdirectory(${wetext_SOURCE_DIR}/runtime/utils)
 #add_subdirectory(${wetext_SOURCE_DIR}/runtime/processor)
--- a/post_processor/CMakeLists.txt
+++ b/post_processor/CMakeLists.txt
@ -1,10 +1,17 @@
 # NOTE(review): this hunk appears to interleave removed and added lines of the
 # diff with the +/- markers stripped (post_processor is declared twice and the
 # first add_library is never closed) - confirm against the real file.
 add_library(post_processor STATIC
  post_processor.cc
 message(STATUS "post_processor dir:${CMAKE_CURRENT_SOURCE_DIR}")
 # String helpers of the vendored wetext code, built as their own static library.
 add_library(wetext_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/utils/wetext_string.cc)
 target_link_libraries(wetext_utils PUBLIC glog)

 # Tagger/verbalizer processor and its token parser.
 add_library(wetext_processor STATIC
        ${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_processor.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_token_parser.cc
 )
 if(ITN)
  include(wetextprocessing)
  target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)
 else()
  target_link_libraries(post_processor PUBLIC utils)
 endif()
 target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)

 # Command-line driver linked against the processor library.
 add_executable(processor_main processor_main.cc)
 target_link_libraries(processor_main PUBLIC wetext_processor)

 # Post-processing library; links the in-tree wetext libraries unconditionally.
 add_library(post_processor STATIC
        post_processor.cc
 )
 target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)
--- a/post_processor/wetext/bin/CMakeLists.txt
+++ b/post_processor/wetext/bin/CMakeLists.txt
@ -0,0 +1,2 @@
 # Command-line driver built from processor_main.cc and linked against the
 # wetext_processor library.
 add_executable(processor_main
        processor_main.cc
 )
 target_link_libraries(processor_main
        PUBLIC
        wetext_processor
 )
--- a/post_processor/wetext/bin/processor_main.cc
+++ b/post_processor/wetext/bin/processor_main.cc
@ -0,0 +1,54 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include <fstream>
 #include <iostream>
 #include <string>

 #include "../processor/wetext_processor.h"
 #include "../utils/wetext_flags.h"

 // Command-line flags. --text and --file each supply input to normalize (both
 // may be given); --tagger and --verbalizer are required, main() logs FATAL
 // when either one is empty.
 DEFINE_string(text, "", "input string");
 DEFINE_string(file, "", "input file");
 DEFINE_string(tagger, "", "tagger fst path");
 DEFINE_string(verbalizer, "", "verbalizer fst path");

 int main(int argc, char* argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, false);
  google::InitGoogleLogging(argv[0]);

  if (FLAGS_tagger.empty() || FLAGS_verbalizer.empty()) {
    LOG(FATAL) << "Please provide the tagger and verbalizer fst files.";
  }
  wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer);

  if (!FLAGS_text.empty()) {
    std::string tagged_text = processor.Tag(FLAGS_text);
    std::cout << tagged_text << std::endl;
    std::string normalized_text = processor.Verbalize(tagged_text);
    std::cout << normalized_text << std::endl;
  }

  if (!FLAGS_file.empty()) {
    std::ifstream file(FLAGS_file);
    std::string line;
    while (getline(file, line)) {
      std::string tagged_text = processor.Tag(line);
      std::cout << tagged_text << std::endl;
      std::string normalized_text = processor.Verbalize(tagged_text);
      std::cout << normalized_text << std::endl;
    }
  }
  return 0;
 }
--- a/post_processor/wetext/processor/CMakeLists.txt
+++ b/post_processor/wetext/processor/CMakeLists.txt
@ -0,0 +1,13 @@
 # Static library holding the wetext processor and its token parser.
 add_library(wetext_processor STATIC wetext_processor.cc wetext_token_parser.cc)

 # Android and MSVC builds link only fst and wetext_utils; every other
 # toolchain additionally links dl.
 if(ANDROID OR MSVC)
  target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
 else()
  target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
 endif()
--- a/post_processor/wetext/processor/wetext_processor.cc
+++ b/post_processor/wetext/processor/wetext_processor.cc
@ -0,0 +1,79 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "wetext_processor.h"

 using fst::StringTokenType;

 namespace wetext {
 // Loads the tagger and verbalizer FSTs from disk, creates the byte-level
 // string compiler/printer, and selects TN or ITN token ordering depending on
 // whether `tagger_path` contains "_tn_" or "_itn_".
 //
 // Logs FATAL when either FST cannot be read or when the tagger path contains
 // neither marker.
 Processor::Processor(const std::string& tagger_path,
                     const std::string& verbalizer_path) {
  tagger_.reset(StdVectorFst::Read(tagger_path));
  // Read() yields a null pointer on failure; fail here instead of
  // dereferencing it later inside Compose().
  if (tagger_ == nullptr) {
    LOG(FATAL) << "Failed to read tagger fst: " << tagger_path;
  }
  verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
  if (verbalizer_ == nullptr) {
    LOG(FATAL) << "Failed to read verbalizer fst: " << verbalizer_path;
  }
  compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
  printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);

  if (tagger_path.find("_tn_") != tagger_path.npos) {
    parse_type_ = ParseType::kTN;
  } else if (tagger_path.find("_itn_") != tagger_path.npos) {
    parse_type_ = ParseType::kITN;
  } else {
    LOG(FATAL) << "Invalid fst prefix, prefix should contain"
               << " either \"_tn_\" or \"_itn_\".";
  }
 }

 std::string Processor::ShortestPath(const StdVectorFst& lattice) {
  StdVectorFst shortest_path;
  fst::ShortestPath(lattice, &shortest_path, 1, true);

  std::string output;
  printer_->operator()(shortest_path, &output);
  return output;
 }

 std::string Processor::Compose(const std::string& input,
                               const StdVectorFst* fst) {
  StdVectorFst input_fst;
  compiler_->operator()(input, &input_fst);

  StdVectorFst lattice;
  fst::Compose(input_fst, *fst, &lattice);
  return ShortestPath(lattice);
 }

 // Returns `input` composed with the tagger FST; empty input maps to an empty
 // string without touching the FST.
 std::string Processor::Tag(const std::string& input) {
  return input.empty() ? std::string() : Compose(input, tagger_.get());
 }

 std::string Processor::Verbalize(const std::string& input) {
  if (input.empty()) {
    return "";
  }
  TokenParser parser(parse_type_);
  std::string output = parser.Reorder(input);

  output = Compose(output, verbalizer_.get());
  output.erase(std::remove(output.begin(), output.end(), '\0'), output.end());
  return output;
 }

 // Full pipeline: tag `input`, then verbalize the tagged text.
 std::string Processor::Normalize(const std::string& input) {
  const std::string tagged = Tag(input);
  return Verbalize(tagged);
 }

 }  // namespace wetext
--- a/post_processor/wetext/processor/wetext_processor.h
+++ b/post_processor/wetext/processor/wetext_processor.h
@ -0,0 +1,51 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef PROCESSOR_WETEXT_PROCESSOR_H_
 #define PROCESSOR_WETEXT_PROCESSOR_H_

 #include <memory>
 #include <string>

 #include "fst/fstlib.h"

 #include "wetext_token_parser.h"

 using fst::StdArc;
 using fst::StdVectorFst;
 using fst::StringCompiler;
 using fst::StringPrinter;

 namespace wetext {
 // Text (inverse-)normalization pipeline backed by two FSTs: a tagger and a
 // verbalizer.
 class Processor {
 public:
  // Reads both FSTs from the given paths. The token ordering (TN vs ITN) is
  // chosen from whether `tagger_path` contains "_tn_" or "_itn_"; any other
  // path is reported with LOG(FATAL).
  Processor(const std::string& tagger_path, const std::string& verbalizer_path);
  // Composes `input` with the tagger FST and returns the shortest path as a
  // string; returns "" for empty input.
  std::string Tag(const std::string& input);
  // Reorders the tagged tokens, composes them with the verbalizer FST and
  // removes NUL bytes from the result; returns "" for empty input.
  std::string Verbalize(const std::string& input);
  // Equivalent to Verbalize(Tag(input)).
  std::string Normalize(const std::string& input);

 private:
  // Renders the single shortest path of `lattice` as a string.
  std::string ShortestPath(const StdVectorFst& lattice);
  // Compiles `input` to an FST, composes it with `fst`, returns the shortest path.
  std::string Compose(const std::string& input, const StdVectorFst* fst);

  // Set by the constructor; selects the key ordering used by TokenParser.
  ParseType parse_type_;
  std::shared_ptr<StdVectorFst> tagger_ = nullptr;
  std::shared_ptr<StdVectorFst> verbalizer_ = nullptr;
  // Both are created with StringTokenType::BYTE in the constructor.
  std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr;
  std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr;
 };

 }  // namespace wetext

 #endif  // PROCESSOR_WETEXT_PROCESSOR_H_
--- a/post_processor/wetext/processor/wetext_token_parser.cc
+++ b/post_processor/wetext/processor/wetext_token_parser.cc
@ -0,0 +1,153 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "wetext_token_parser.h"

 #include "../utils/wetext_log.h"
 #include "../utils/wetext_string.h"

 namespace wetext {
 // Sentinel stored as the parser's current character once input is exhausted.
 const char EOS[] = "<EOS>";
 // Whitespace characters, one character per entry so that each can be matched
 // against a single split UTF-8 character (vertical tab and form feed used to
 // be fused into one two-byte entry that could never match).
 const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
                                                "\r", "\x0b", "\x0c"};
 // Characters accepted inside a token name or member key: a-z, A-Z and "_".
 const std::set<std::string> ASCII_LETTERS = {
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};
 // Member emission order per token name when parsing in TN mode.
 const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"denominator", "numerator"}},
    {"measure", {"denominator", "numerator", "value"}},
    {"money", {"value", "currency"}},
    {"time", {"noon", "hour", "minute", "second"}}};
 // Member emission order per token name when parsing in ITN mode.
 const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"sign", "numerator", "denominator"}},
    {"measure", {"numerator", "denominator", "value"}},
    {"money", {"currency", "value", "decimal"}},
    {"time", {"hour", "minute", "second", "noon"}}};

 // Picks the member-ordering table: TN_ORDERS for kTN, ITN_ORDERS for any
 // other parse type.
 TokenParser::TokenParser(ParseType type) {
  orders_ = (type == ParseType::kTN) ? TN_ORDERS : ITN_ORDERS;
 }

 void TokenParser::Load(const std::string& input) {
  wetext::SplitUTF8StringToChars(input, &text_);
  CHECK_GT(text_.size(), 0);
  index_ = 0;
  ch_ = text_[0];
 }

 // Advances the cursor by one character. Returns true and updates `ch_` when
 // another character exists; otherwise sets `ch_` to EOS and returns false.
 bool TokenParser::Read() {
  // Compare "next index < size" rather than "index < size - 1": the latter
  // wraps around when text_ is empty and mixes signed with unsigned operands.
  const size_t next = static_cast<size_t>(index_) + 1;
  if (next < text_.size()) {
    index_ = static_cast<int>(next);
    ch_ = text_[next];
    return true;
  }
  ch_ = EOS;
  return false;
 }

 // Skips consecutive space characters. Returns false when the cursor is at
 // (or reaches) end of input, true otherwise.
 bool TokenParser::ParseWs() {
  if (ch_ == EOS) {
    return false;
  }
  while (ch_ == " ") {
    if (!Read()) {
      return false;
    }
  }
  return true;
 }

 // Consumes the current character if it equals `exp`; reports whether it did.
 bool TokenParser::ParseChar(const std::string& exp) {
  if (ch_ != exp) {
    return false;
  }
  Read();
  return true;
 }

 // Tries to consume each UTF-8 character of `exp` in order (every character is
 // attempted, matched or not). Returns true if at least one was consumed.
 bool TokenParser::ParseChars(const std::string& exp) {
  std::vector<std::string> expected;
  wetext::SplitUTF8StringToChars(exp, &expected);

  bool consumed_any = false;
  for (const auto& piece : expected) {
    if (ParseChar(piece)) {
      consumed_any = true;
    }
  }
  return consumed_any;
 }

 // Reads a run of ASCII letters/underscores starting at the cursor and
 // returns it. CHECK-fails if the cursor is at end of input or on a
 // whitespace character.
 std::string TokenParser::ParseKey() {
  CHECK_NE(ch_, EOS);
  CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);

  std::string key;
  for (; ASCII_LETTERS.count(ch_) != 0; Read()) {
    key += ch_;
  }
  return key;
 }

 // Reads a member value up to (not including) the closing double quote and
 // returns it. A backslash keeps the following character as part of the
 // value. CHECK-fails if the cursor is already at end of input.
 //
 // If the input ends before a closing quote is seen, the characters read so
 // far are returned; previously the loop never terminated in that case
 // because the EOS sentinel is never equal to "\"".
 std::string TokenParser::ParseValue() {
  CHECK_NE(ch_, EOS);
  bool escape = false;

  std::string value = "";
  while (ch_ != "\"" && ch_ != EOS) {
    value += ch_;
    escape = ch_ == "\\" && !escape;
    Read();
    // Do not append the EOS sentinel when the input ends right after a
    // backslash.
    if (escape && ch_ != EOS) {
      value += ch_;
      Read();
    }
  }
  return value;
 }

 // Parses `input` into tokens_ (appending). Each token is read as a name
 // followed by a braced list of members, i.e. text shaped like
 //   name { key: "value" key: "value" }
 // and stored with its members in the order they were encountered.
 void TokenParser::Parse(const std::string& input) {
  Load(input);
  // Outer loop: one iteration per token, until end of input.
  while (ParseWs()) {
    std::string name = ParseKey();
    // Consumes the optional " ", "{", " " that separate the name from members.
    ParseChars(" { ");

    Token token(name);
    // Inner loop: one iteration per member, until "}" or end of input.
    while (ParseWs()) {
      if (ch_ == "}") {
        ParseChar("}");
        break;
      }
      std::string key = ParseKey();
      // Consumes the ":", " " and opening quote between key and value.
      ParseChars(": \"");
      std::string value = ParseValue();
      // Consumes the closing quote, if present.
      ParseChar("\"");
      token.Append(key, value);
    }
    tokens_.emplace_back(token);
  }
 }

 // Parses `input` and re-serializes every token with its members in the
 // configured order, separated by single spaces and trimmed at both ends.
 std::string TokenParser::Reorder(const std::string& input) {
  Parse(input);

  std::string joined;
  for (auto& token : tokens_) {
    joined += token.String(orders_);
    joined += " ";
  }
  return Trim(joined);
 }

 }  // namespace wetext
--- a/post_processor/wetext/processor/wetext_token_parser.h
+++ b/post_processor/wetext/processor/wetext_token_parser.h
@ -0,0 +1,91 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_
 #define PROCESSOR_WETEXT_TOKEN_PARSER_H_

 #include <set>
 #include <string>
 #include <unordered_map>
 #include <vector>

 namespace wetext {

 // Sentinel the parser stores as its current character at end of input.
 extern const char EOS[];
 // Characters that must not start a key (checked by TokenParser::ParseKey).
 extern const std::set<std::string> UTF8_WHITESPACE;
 // Characters accepted inside token names and member keys.
 extern const std::set<std::string> ASCII_LETTERS;
 // Token name -> member emission order, used for ParseType::kTN.
 extern const std::unordered_map<std::string, std::vector<std::string>>
    TN_ORDERS;
 // Token name -> member emission order, used for ParseType::kITN.
 extern const std::unordered_map<std::string, std::vector<std::string>>
    ITN_ORDERS;

 // One parsed token: a name plus key/value members and the order in which the
 // members should be emitted.
 struct Token {
  std::string name;
  std::vector<std::string> order;
  std::unordered_map<std::string, std::string> members;

  explicit Token(const std::string& name) : name(name) {}

  // Records `key` at the end of the emission order and stores (or overwrites)
  // its value.
  void Append(const std::string& key, const std::string& value) {
    order.push_back(key);
    members[key] = value;
  }

  // Serializes the token as `name { key: "value" ... }`. When `orders` has an
  // entry for this token's name, that entry replaces `order` (the member is
  // modified) and keys without a stored value are skipped.
  std::string String(
      const std::unordered_map<std::string, std::vector<std::string>>& orders) {
    const auto canonical = orders.find(name);
    if (canonical != orders.end()) {
      order = canonical->second;
    }

    std::string output = name;
    output += " {";
    for (const auto& key : order) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " ";
      output += key;
      output += ": \"";
      output += member->second;
      output += "\"";
    }
    output += " }";
    return output;
  }
 };

 // Direction of processing; TokenParser uses it to choose between the
 // TN_ORDERS and ITN_ORDERS member-ordering tables.
 enum ParseType {
  kTN = 0x00,  // Text Normalization
  kITN = 0x01  // Inverse Text Normalization
 };

 // Parses tagger output shaped like `name { key: "value" ... }` and
 // re-serializes it with members in the order required by the parse type.
 class TokenParser {
 public:
  // Selects TN_ORDERS for kTN and ITN_ORDERS otherwise.
  explicit TokenParser(ParseType type);
  // Parses `input` and returns the tokens re-serialized in configured order,
  // joined by spaces and trimmed.
  std::string Reorder(const std::string& input);

 private:
  // Splits `input` into UTF-8 characters and places the cursor on the first.
  void Load(const std::string& input);
  // Advances the cursor; returns false (and sets ch_ to EOS) at end of input.
  bool Read();
  // Skips spaces; returns false once end of input is reached.
  bool ParseWs();
  // Consumes the current character if it equals `exp`.
  bool ParseChar(const std::string& exp);
  // Attempts ParseChar for each character of `exp`; true if any matched.
  bool ParseChars(const std::string& exp);
  // Reads a run of ASCII letters/underscores.
  std::string ParseKey();
  // Reads characters up to the next double quote, honoring backslashes.
  std::string ParseValue();
  // Parses every token of `input` into tokens_.
  void Parse(const std::string& input);

  // Position of the current character within text_.
  int index_;
  // Current character, or EOS once input is exhausted.
  std::string ch_;
  // Input split into UTF-8 characters.
  std::vector<std::string> text_;
  // Tokens collected by Parse().
  std::vector<Token> tokens_;
  // Token name -> member emission order for the selected parse type.
  std::unordered_map<std::string, std::vector<std::string>> orders_;
 };

 }  // namespace wetext

 #endif  // PROCESSOR_WETEXT_TOKEN_PARSER_H_
--- a/post_processor/wetext/utils/CMakeLists.txt
+++ b/post_processor/wetext/utils/CMakeLists.txt
@ -0,0 +1 @@

--- a/post_processor/wetext/utils/wetext_flags.h
+++ b/post_processor/wetext/utils/wetext_flags.h
@ -0,0 +1,23 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef UTILS_WETEXT_FLAGS_H_
 #define UTILS_WETEXT_FLAGS_H_

 // Because openfst is a dynamic library compiled with gflags/glog, we must use
 // the gflags/glog from openfst to avoid them linked both statically and
 // dynamically into the executable.
 #include "../../../utils/flags.h"

 #endif  // UTILS_WETEXT_FLAGS_H_
--- a/post_processor/wetext/utils/wetext_log.h
+++ b/post_processor/wetext/utils/wetext_log.h
@ -0,0 +1,23 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef UTILS_WETEXT_LOG_H_
 #define UTILS_WETEXT_LOG_H_

 // Because openfst is a dynamic library compiled with gflags/glog, we must use
 // the gflags/glog from openfst to avoid them linked both statically and
 // dynamically into the executable.
 #include "../../../utils/log.h"

 #endif  // UTILS_WETEXT_LOG_H_
--- a/post_processor/wetext/utils/wetext_string.cc
+++ b/post_processor/wetext/utils/wetext_string.cc
@ -0,0 +1,89 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "wetext_string.h"

 #include "wetext_log.h"

 namespace wetext {
 // Characters stripped by Ltrim/Rtrim/Trim: space, newline, carriage return,
 // tab, form feed and vertical tab.
 const char* WHITESPACE = " \n\r\t\f\v";

 // Returns how many bytes the UTF-8 sequence starting with lead byte `ch`
 // occupies (1 to 4). Bytes matching none of the multi-byte lead patterns
 // count as 1. CHECK-fails when the top five bits exceed 0xF0.
 int UTF8CharLength(char ch) {
  CHECK_LE((ch & 0xF8), 0xF0);

  // 11110xxx: four-byte sequence (characters outside the Basic Multilingual
  // Plane, e.g. rare CJK characters, historic scripts, emoji).
  if ((ch & 0xF8) == 0xF0) {
    return 4;
  }
  // 1110xxxx: three-byte sequence (rest of the Basic Multilingual Plane,
  // including most Chinese, Japanese and Korean characters).
  if ((ch & 0xF0) == 0xE0) {
    return 3;
  }
  // 110xxxxx: two-byte sequence (remainder of most Latin-script alphabets).
  if ((ch & 0xE0) == 0xC0) {
    return 2;
  }
  // 0xxxxxxx (US-ASCII) and anything else: a single byte.
  return 1;
 }

 // Counts the UTF-8 characters in `str` by hopping from lead byte to lead byte.
 int UTF8StringLength(const std::string& str) {
  int count = 0;
  size_t pos = 0;
  while (pos < str.length()) {
    pos += UTF8CharLength(str[pos]);
    ++count;
  }
  return count;
 }

 // Replaces the contents of `chars` with the UTF-8 characters of `str`, one
 // string per character.
 void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars) {
  chars->clear();

  const size_t total = str.length();
  size_t offset = 0;
  while (offset < total) {
    const int width = UTF8CharLength(str[offset]);
    chars->push_back(str.substr(offset, width));
    offset += width;
  }
 }

 // Returns `str` without leading WHITESPACE characters; "" when `str`
 // consists only of whitespace.
 std::string Ltrim(const std::string& str) {
  const size_t first = str.find_first_not_of(WHITESPACE);
  if (first == std::string::npos) {
    return std::string();
  }
  return str.substr(first);
 }

 // Returns `str` without trailing WHITESPACE characters; "" when `str`
 // consists only of whitespace.
 std::string Rtrim(const std::string& str) {
  const size_t last = str.find_last_not_of(WHITESPACE);
  if (last == std::string::npos) {
    return std::string();
  }
  return str.substr(0, last + 1);
 }

 // Strips WHITESPACE characters from both ends of `str`.
 std::string Trim(const std::string& str) {
  const std::string left_trimmed = Ltrim(str);
  return Rtrim(left_trimmed);
 }

 // Splits `str` on every occurrence of `delim` and appends the pieces to
 // `output` (existing elements are kept). Adjacent delimiters produce empty
 // pieces, and the text after the last delimiter is always appended, so at
 // least one element is added.
 //
 // An empty `delim` appends `str` as a single piece; searching for an empty
 // delimiter would otherwise match at position 0 forever.
 void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output) {
  if (delim.empty()) {
    output->emplace_back(str);
    return;
  }

  // Walk the string with a moving start offset instead of repeatedly erasing
  // the consumed prefix from a copy.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = str.find(delim, start)) != std::string::npos) {
    output->emplace_back(str.substr(start, pos - start));
    start = pos + delim.length();
  }
  output->emplace_back(str.substr(start));
 }

 }  // namespace wetext
--- a/post_processor/wetext/utils/wetext_string.h
+++ b/post_processor/wetext/utils/wetext_string.h
@ -0,0 +1,42 @@
 // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef UTILS_WETEXT_STRING_H_
 #define UTILS_WETEXT_STRING_H_

 #include <string>
 #include <vector>

 namespace wetext {
 // Set of characters removed by Ltrim/Rtrim/Trim.
 extern const char* WHITESPACE;

 // Byte length (1-4) of the UTF-8 sequence introduced by lead byte `ch`.
 int UTF8CharLength(char ch);

 // Number of UTF-8 characters in `str`.
 int UTF8StringLength(const std::string& str);

 // Clears `chars` and fills it with the UTF-8 characters of `str`.
 void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars);

 // Returns `str` with leading whitespace removed.
 std::string Ltrim(const std::string& str);

 // Returns `str` with trailing whitespace removed.
 std::string Rtrim(const std::string& str);

 // Returns `str` with whitespace removed from both ends.
 std::string Trim(const std::string& str);

 // Splits `str` on `delim` and appends the pieces to `output` without
 // clearing it first.
 void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output);

 }  // namespace wetext

 #endif  // UTILS_WETEXT_STRING_H_