Browse Source

2024.5.10-去除ITN的依赖构建

master
Administrator 1 year ago
parent
commit
7313e112ff
15 changed files with 637 additions and 22 deletions
  1. +1
    -0
      CMakeLists.txt
  2. +0
    -14
      cmake-linux/wetextprocessing.cmake
  3. +15
    -8
      post_processor/CMakeLists.txt
  4. +2
    -0
      post_processor/wetext/bin/CMakeLists.txt
  5. +54
    -0
      post_processor/wetext/bin/processor_main.cc
  6. +13
    -0
      post_processor/wetext/processor/CMakeLists.txt
  7. +79
    -0
      post_processor/wetext/processor/wetext_processor.cc
  8. +51
    -0
      post_processor/wetext/processor/wetext_processor.h
  9. +153
    -0
      post_processor/wetext/processor/wetext_token_parser.cc
  10. +91
    -0
      post_processor/wetext/processor/wetext_token_parser.h
  11. +1
    -0
      post_processor/wetext/utils/CMakeLists.txt
  12. +23
    -0
      post_processor/wetext/utils/wetext_flags.h
  13. +23
    -0
      post_processor/wetext/utils/wetext_log.h
  14. +89
    -0
      post_processor/wetext/utils/wetext_string.cc
  15. +42
    -0
      post_processor/wetext/utils/wetext_string.h

+ 1
- 0
CMakeLists.txt

@ -9,6 +9,7 @@ set(BoldRed "${Esc}[31m")
#
# Compile everything as C++17 with pthread support and position-independent code.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -pthread -fPIC")
# Base directory used to locate library sources/builds.
# NOTE(review): absolute, machine-specific path — presumably meant to be
# overridden per host; confirm.
set(LIB_BASE_DIR /root/projects/temp_xiaoke/asr_runtime/lib_files)
# Initialised empty here; consumers are not visible in this excerpt.
set(third_party_libraries)
# Feature switches, all ON by default.
option(GRPC "whether to build with gRPC" ON)
option(ONNX "whether to build with ONNX" ON)
option(ITN "whether to build with ITN" ON)

+ 0
- 14
cmake-linux/wetextprocessing.cmake

@ -1,14 +0,0 @@
# Configures and builds the external wetextprocessing runtime the first time the
# build directory is missing, then exposes its runtime headers.
set(wetext_BUILD_DIR "${LIB_BASE_DIR}/wetextprocessing-build")
set(wetext_SOURCE_DIR "${LIB_BASE_DIR}/wetextprocessing-src")
# Only build once: an existing build directory is taken as "already installed".
if(NOT EXISTS ${wetext_BUILD_DIR})
execute_process(COMMAND mkdir -p ${wetext_BUILD_DIR})
message(STATUS "${BoldGreen}Install wetextprocessing library${ColourReset}")
# NOTE(review): execute_process does not run through a shell, so the `&&` below
# is presumably passed to cmake as a literal argument and the build step never
# runs; `-j4 -fPIC` also do not look like configure-step options — verify.
execute_process(
COMMAND cmake -B ${wetext_BUILD_DIR} -S ${wetext_SOURCE_DIR}/runtime -DCMAKE_BUILD_TYPE=Release -j4 -fPIC &&
cmake --build .
WORKING_DIRECTORY ${wetext_BUILD_DIR})
endif ()
# Make the runtime headers of wetextprocessing visible to all targets.
include_directories(${wetext_SOURCE_DIR}/runtime)
#add_subdirectory(${wetext_SOURCE_DIR}/runtime/utils)
#add_subdirectory(${wetext_SOURCE_DIR}/runtime/processor)

+ 15
- 8
post_processor/CMakeLists.txt

@ -1,10 +1,17 @@
# Targets declared in this file: wetext_utils, wetext_processor, processor_main
# and post_processor.
# NOTE(review): as captured here the statements from the old and the new version
# of this file appear interleaved (post_processor is declared twice and the
# first add_library is not closed before message()); confirm against the real
# file before relying on the order below.
# NOTE(review): sources are referenced as utils/..., processor/... and
# processor_main.cc relative to this directory — confirm these match the actual
# on-disk layout.
add_library(post_processor STATIC
post_processor.cc
message(STATUS "post_processor dir:${CMAKE_CURRENT_SOURCE_DIR}")
# String helpers; glog is propagated to dependants through PUBLIC linkage.
add_library(wetext_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/utils/wetext_string.cc)
target_link_libraries(wetext_utils PUBLIC glog)
# FST-based processor together with its token parser.
add_library(wetext_processor STATIC
${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_processor.cc
${CMAKE_CURRENT_SOURCE_DIR}/processor/wetext_token_parser.cc
)
if(ITN)
include(wetextprocessing)
target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)
else()
target_link_libraries(post_processor PUBLIC utils)
endif()
target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
# Command-line driver for the processor.
add_executable(processor_main processor_main.cc)
target_link_libraries(processor_main PUBLIC wetext_processor)
add_library(post_processor STATIC
post_processor.cc
)
# post_processor links the wetext libraries unconditionally.
target_link_libraries(post_processor PUBLIC utils wetext_processor wetext_utils)

+ 2
- 0
post_processor/wetext/bin/CMakeLists.txt

@ -0,0 +1,2 @@
# Command-line driver built from processor_main.cc and linked against the
# wetext_processor library.
add_executable(processor_main processor_main.cc)
target_link_libraries(processor_main PUBLIC wetext_processor)

+ 54
- 0
post_processor/wetext/bin/processor_main.cc

@ -0,0 +1,54 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <fstream>
#include <iostream>
#include <string>
#include "../processor/wetext_processor.h"
#include "../utils/wetext_flags.h"
// Command-line flags read by main(); all default to the empty string.
// --text: a single input string to process.
DEFINE_string(text, "", "input string");
// --file: path of a text file processed line by line.
DEFINE_string(file, "", "input file");
// --tagger / --verbalizer: FST paths; main() aborts when either is empty.
DEFINE_string(tagger, "", "tagger fst path");
DEFINE_string(verbalizer, "", "verbalizer fst path");
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
if (FLAGS_tagger.empty() || FLAGS_verbalizer.empty()) {
LOG(FATAL) << "Please provide the tagger and verbalizer fst files.";
}
wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer);
if (!FLAGS_text.empty()) {
std::string tagged_text = processor.Tag(FLAGS_text);
std::cout << tagged_text << std::endl;
std::string normalized_text = processor.Verbalize(tagged_text);
std::cout << normalized_text << std::endl;
}
if (!FLAGS_file.empty()) {
std::ifstream file(FLAGS_file);
std::string line;
while (getline(file, line)) {
std::string tagged_text = processor.Tag(line);
std::cout << tagged_text << std::endl;
std::string normalized_text = processor.Verbalize(tagged_text);
std::cout << normalized_text << std::endl;
}
}
return 0;
}

+ 13
- 0
post_processor/wetext/processor/CMakeLists.txt

@ -0,0 +1,13 @@
# Static library bundling the FST-based processor and its token parser.
add_library(wetext_processor STATIC
  wetext_processor.cc
  wetext_token_parser.cc
)

# libdl is linked on every platform except Android and MSVC builds.
if(ANDROID OR MSVC)
  target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
else()
  target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
endif()

+ 79
- 0
post_processor/wetext/processor/wetext_processor.cc

@ -0,0 +1,79 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "wetext_processor.h"
using fst::StringTokenType;
namespace wetext {
// Loads the tagger and verbalizer FSTs, creates the byte-level string
// compiler/printer, and infers the parse type from the tagger path.
//
// tagger_path:     path of the tagger FST; must contain "_tn_" or "_itn_".
// verbalizer_path: path of the verbalizer FST.
//
// Aborts via LOG(FATAL) when either FST cannot be read, or when the tagger
// path carries neither marker.
Processor::Processor(const std::string& tagger_path,
                     const std::string& verbalizer_path) {
  // Read() hands back a null pointer on failure; fail here with a clear
  // message rather than dereferencing null later in Compose().
  tagger_.reset(StdVectorFst::Read(tagger_path));
  if (tagger_ == nullptr) {
    LOG(FATAL) << "Failed to read tagger fst: " << tagger_path;
  }
  verbalizer_.reset(StdVectorFst::Read(verbalizer_path));
  if (verbalizer_ == nullptr) {
    LOG(FATAL) << "Failed to read verbalizer fst: " << verbalizer_path;
  }

  compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
  printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);

  if (tagger_path.find("_tn_") != std::string::npos) {
    parse_type_ = ParseType::kTN;
  } else if (tagger_path.find("_itn_") != std::string::npos) {
    parse_type_ = ParseType::kITN;
  } else {
    LOG(FATAL) << "Invalid fst prefix, prefix should contain"
               << " either \"_tn_\" or \"_itn_\".";
  }
}
std::string Processor::ShortestPath(const StdVectorFst& lattice) {
StdVectorFst shortest_path;
fst::ShortestPath(lattice, &shortest_path, 1, true);
std::string output;
printer_->operator()(shortest_path, &output);
return output;
}
std::string Processor::Compose(const std::string& input,
const StdVectorFst* fst) {
StdVectorFst input_fst;
compiler_->operator()(input, &input_fst);
StdVectorFst lattice;
fst::Compose(input_fst, *fst, &lattice);
return ShortestPath(lattice);
}
// Runs `input` through the tagger FST; an empty input yields an empty string
// without touching the FST.
std::string Processor::Tag(const std::string& input) {
  return input.empty() ? std::string() : Compose(input, tagger_.get());
}
std::string Processor::Verbalize(const std::string& input) {
if (input.empty()) {
return "";
}
TokenParser parser(parse_type_);
std::string output = parser.Reorder(input);
output = Compose(output, verbalizer_.get());
output.erase(std::remove(output.begin(), output.end(), '\0'), output.end());
return output;
}
// Full pipeline: tag the input, then verbalize the tagged text.
std::string Processor::Normalize(const std::string& input) {
  const std::string tagged = Tag(input);
  return Verbalize(tagged);
}
} // namespace wetext

+ 51
- 0
post_processor/wetext/processor/wetext_processor.h

@ -0,0 +1,51 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PROCESSOR_WETEXT_PROCESSOR_H_
#define PROCESSOR_WETEXT_PROCESSOR_H_
#include <memory>
#include <string>
#include "fst/fstlib.h"
#include "wetext_token_parser.h"
using fst::StdArc;
using fst::StdVectorFst;
using fst::StringCompiler;
using fst::StringPrinter;
namespace wetext {
// Text processor driven by two FSTs: a tagger that produces tagged text and a
// verbalizer that turns tagged text into the final string.
class Processor {
public:
// Loads both FSTs from disk; the parse type (TN or ITN) is derived from
// whether `tagger_path` contains "_tn_" or "_itn_".
Processor(const std::string& tagger_path, const std::string& verbalizer_path);
// Returns the tagged form of `input` ("" for empty input).
std::string Tag(const std::string& input);
// Returns the verbalized form of already-tagged text ("" for empty input).
std::string Verbalize(const std::string& input);
// Convenience wrapper: Verbalize(Tag(input)).
std::string Normalize(const std::string& input);
private:
// Prints the single shortest path of `lattice` as a string.
std::string ShortestPath(const StdVectorFst& lattice);
// Compiles `input` to an FST, composes it with `fst`, and returns the
// shortest path of the composition.
std::string Compose(const std::string& input, const StdVectorFst* fst);
// Set by the constructor; selects the field ordering used when verbalizing.
ParseType parse_type_;
std::shared_ptr<StdVectorFst> tagger_ = nullptr;
std::shared_ptr<StdVectorFst> verbalizer_ = nullptr;
// Byte-level string <-> FST converters created by the constructor.
std::shared_ptr<StringCompiler<StdArc>> compiler_ = nullptr;
std::shared_ptr<StringPrinter<StdArc>> printer_ = nullptr;
};
} // namespace wetext
#endif // PROCESSOR_WETEXT_PROCESSOR_H_

+ 153
- 0
post_processor/wetext/processor/wetext_token_parser.cc

@ -0,0 +1,153 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "wetext_token_parser.h"
#include "../utils/wetext_log.h"
#include "../utils/wetext_string.h"
namespace wetext {
// Sentinel the parser stores as its current character once input is exhausted.
const char EOS[] = "<EOS>";

// Whitespace characters, one set entry per character. Vertical tab (\x0b) and
// form feed (\x0c) are separate entries: lookups are made with single
// characters, so a combined two-byte entry could never match.
const std::set<std::string> UTF8_WHITESPACE = {" ",  "\t",   "\n",
                                               "\r", "\x0b", "\x0c"};

// Characters allowed in token names and field keys: ASCII letters plus '_'.
const std::set<std::string> ASCII_LETTERS = {
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
    "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
    "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};

// Field emission order per token type for text normalization (TN). Token types
// not listed keep the order in which their fields were parsed.
const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"denominator", "numerator"}},
    {"measure", {"denominator", "numerator", "value"}},
    {"money", {"value", "currency"}},
    {"time", {"noon", "hour", "minute", "second"}}};

// Field emission order per token type for inverse text normalization (ITN).
const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
    {"date", {"year", "month", "day"}},
    {"fraction", {"sign", "numerator", "denominator"}},
    {"measure", {"numerator", "denominator", "value"}},
    {"money", {"currency", "value", "decimal"}},
    {"time", {"hour", "minute", "second", "noon"}}};
// Selects the field-order table: TN_ORDERS for kTN, ITN_ORDERS for anything
// else.
TokenParser::TokenParser(ParseType type) {
  orders_ = (type == ParseType::kTN) ? TN_ORDERS : ITN_ORDERS;
}
// Splits `input` into UTF-8 characters and positions the cursor on the first
// one. The input must contain at least one character (CHECK-enforced).
void TokenParser::Load(const std::string& input) {
  wetext::SplitUTF8StringToChars(input, &text_);
  CHECK_GT(text_.size(), 0);
  ch_ = text_.front();
  index_ = 0;
}
// Advances the cursor by one character. Returns true when a new character was
// loaded; once the last character has been passed, sets the current character
// to EOS and returns false.
bool TokenParser::Read() {
  const bool has_next = index_ < text_.size() - 1;
  if (!has_next) {
    ch_ = EOS;
    return false;
  }
  ++index_;
  ch_ = text_[index_];
  return true;
}
// Skips a run of space characters. Returns false when the input is (or
// becomes) exhausted, true when the cursor rests on a non-space character.
bool TokenParser::ParseWs() {
  if (ch_ == EOS) {
    return false;
  }
  while (ch_ == " ") {
    if (!Read()) {
      return false;
    }
  }
  return true;
}
// Consumes the current character if it equals `exp`; reports whether it did.
bool TokenParser::ParseChar(const std::string& exp) {
  if (ch_ != exp) {
    return false;
  }
  Read();
  return true;
}
// Tries to consume each character of `exp` in turn; a character that does not
// match is simply skipped. Returns true when at least one was consumed.
bool TokenParser::ParseChars(const std::string& exp) {
  std::vector<std::string> expected;
  wetext::SplitUTF8StringToChars(exp, &expected);

  bool any_matched = false;
  for (size_t i = 0; i < expected.size(); ++i) {
    if (ParseChar(expected[i])) {
      any_matched = true;
    }
  }
  return any_matched;
}
// Reads the longest run of ASCII_LETTERS characters starting at the cursor and
// returns it (possibly empty). The cursor must not be at EOS nor on a
// UTF8_WHITESPACE character (both CHECK-enforced).
std::string TokenParser::ParseKey() {
  CHECK_NE(ch_, EOS);
  CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);

  std::string key;
  for (; ASCII_LETTERS.count(ch_) > 0; Read()) {
    key += ch_;
  }
  return key;
}
// Reads a field value up to, but not including, the closing double quote.
// A backslash escapes the character after it, so an escaped quote stays part
// of the value; the backslash itself is kept in the returned text.
//
// The cursor must not be at EOS on entry (CHECK-enforced). If the input ends
// before a closing quote is found, the characters read so far are returned.
std::string TokenParser::ParseValue() {
  CHECK_NE(ch_, EOS);
  bool escape = false;
  std::string value;
  // Read() pins ch_ to EOS once the text is exhausted, so the loop must stop
  // on EOS too; otherwise an unterminated value would spin forever appending
  // "<EOS>" to `value`.
  while (ch_ != EOS && ch_ != "\"") {
    value += ch_;
    escape = ch_ == "\\" && !escape;
    Read();
    // Copy the escaped character verbatim, unless the input ended right after
    // the backslash.
    if (escape && ch_ != EOS) {
      value += ch_;
      Read();
    }
  }
  return value;
}
// Parses tagged text of the form `name { key: "value" ... } name { ... }`
// into tokens_, replacing whatever an earlier call stored there.
void TokenParser::Parse(const std::string& input) {
  // Start clean so that reusing one parser for several inputs does not carry
  // tokens over from a previous call.
  tokens_.clear();
  Load(input);
  while (ParseWs()) {
    std::string name = ParseKey();
    ParseChars(" { ");
    Token token(name);
    // Collect `key: "value"` pairs until the closing brace or end of input.
    while (ParseWs()) {
      if (ch_ == "}") {
        ParseChar("}");
        break;
      }
      std::string key = ParseKey();
      ParseChars(": \"");
      std::string value = ParseValue();
      ParseChar("\"");
      token.Append(key, value);
    }
    tokens_.emplace_back(token);
  }
}
std::string TokenParser::Reorder(const std::string& input) {
Parse(input);
std::string output = "";
for (auto& token : tokens_) {
output += token.String(orders_) + " ";
}
return Trim(output);
}
} // namespace wetext

+ 91
- 0
post_processor/wetext/processor/wetext_token_parser.h

@ -0,0 +1,91 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PROCESSOR_WETEXT_TOKEN_PARSER_H_
#define PROCESSOR_WETEXT_TOKEN_PARSER_H_
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
namespace wetext {
// Constants defined in wetext_token_parser.cc.
// Sentinel used as the parser's current character at end of input.
extern const char EOS[];
// Whitespace characters, stored as strings.
extern const std::set<std::string> UTF8_WHITESPACE;
// ASCII letters plus '_', stored as one-character strings.
extern const std::set<std::string> ASCII_LETTERS;
// Per-token-type field order used for text normalization (TN).
extern const std::unordered_map<std::string, std::vector<std::string>>
TN_ORDERS;
// Per-token-type field order used for inverse text normalization (ITN).
extern const std::unordered_map<std::string, std::vector<std::string>>
ITN_ORDERS;
// One tagged token: a type name plus its key/value fields.
struct Token {
  // Token type, e.g. "date" or "money".
  std::string name;
  // Field keys in the order they were appended.
  std::vector<std::string> order;
  // Field values by key; appending an existing key overwrites its value.
  std::unordered_map<std::string, std::string> members;

  explicit Token(const std::string& name) : name(name) {}

  // Records `key` as the next field and stores (or overwrites) its value.
  void Append(const std::string& key, const std::string& value) {
    order.emplace_back(key);
    members[key] = value;
  }

  // Serializes the token as `name { key: "value" ... }`.
  //
  // When `orders` has an entry for this token's name, fields are emitted in
  // that order and keys the token does not have are skipped; otherwise the
  // append order is used. The token itself is left untouched, so the result
  // does not depend on earlier calls.
  std::string String(
      const std::unordered_map<std::string, std::vector<std::string>>& orders)
      const {
    const auto preferred = orders.find(name);
    const std::vector<std::string>& keys =
        preferred != orders.end() ? preferred->second : order;

    std::string output = name + " {";
    for (const auto& key : keys) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " " + key + ": \"" + member->second + "\"";
    }
    return output + " }";
  }
};
// Selects which field-order table a TokenParser uses.
enum ParseType {
kTN = 0x00, // Text Normalization
kITN = 0x01 // Inverse Text Normalization
};
// Parser for tagged text of the form `name { key: "value" ... }` that can
// re-emit the tokens with their fields in a type-specific order.
class TokenParser {
public:
// Picks TN_ORDERS for kTN and ITN_ORDERS otherwise.
explicit TokenParser(ParseType type);
// Parses `input` and returns the tokens re-serialized with reordered fields,
// separated by spaces and trimmed.
std::string Reorder(const std::string& input);
private:
// Splits `input` into UTF-8 characters and resets the cursor to the start.
void Load(const std::string& input);
// Advances the cursor; returns false (and sets ch_ to EOS) at end of input.
bool Read();
// Skips spaces; returns false when the input is exhausted.
bool ParseWs();
// Consumes the current character if it equals `exp`.
bool ParseChar(const std::string& exp);
// Tries ParseChar for each character of `exp`; true if any matched.
bool ParseChars(const std::string& exp);
// Reads a run of ASCII_LETTERS characters.
std::string ParseKey();
// Reads characters up to the closing double quote, honouring backslash escapes.
std::string ParseValue();
// Fills tokens_ from `input`.
void Parse(const std::string& input);
// Position of the current character within text_.
int index_;
// Current character, or EOS once the input is exhausted.
std::string ch_;
// Input split into UTF-8 characters.
std::vector<std::string> text_;
std::vector<Token> tokens_;
// Field-order table chosen by the constructor.
std::unordered_map<std::string, std::vector<std::string>> orders_;
};
} // namespace wetext
#endif // PROCESSOR_WETEXT_TOKEN_PARSER_H_

+ 1
- 0
post_processor/wetext/utils/CMakeLists.txt

@ -0,0 +1 @@

+ 23
- 0
post_processor/wetext/utils/wetext_flags.h

@ -0,0 +1,23 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef UTILS_WETEXT_FLAGS_H_
#define UTILS_WETEXT_FLAGS_H_
// Because openfst is a dynamic library compiled with gflags/glog, we must use
// the gflags/glog from openfst to avoid linking them both statically and
// dynamically into the executable.
#include "../../../utils/flags.h"
#endif // UTILS_WETEXT_FLAGS_H_

+ 23
- 0
post_processor/wetext/utils/wetext_log.h

@ -0,0 +1,23 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef UTILS_WETEXT_LOG_H_
#define UTILS_WETEXT_LOG_H_
// Because openfst is a dynamic library compiled with gflags/glog, we must use
// the gflags/glog from openfst to avoid linking them both statically and
// dynamically into the executable.
#include "../../../utils/log.h"
#endif // UTILS_WETEXT_LOG_H_

+ 89
- 0
post_processor/wetext/utils/wetext_string.cc

@ -0,0 +1,89 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "wetext_string.h"
#include "wetext_log.h"
namespace wetext {
// Characters stripped by Ltrim/Rtrim: space, newline, carriage return, tab,
// form feed and vertical tab.
const char* WHITESPACE = " \n\r\t\f\v";
// Returns how many bytes the UTF-8 sequence introduced by `ch` occupies
// (1 to 4). Bytes matching none of the lead-byte patterns count as one byte.
// CHECK-fails when the top five bits of `ch` are all set.
int UTF8CharLength(char ch) {
  CHECK_LE((ch & 0xF8), 0xF0);

  // 0xxxxxxx: US-ASCII, a single byte.
  if ((ch & 0x80) == 0x00) {
    return 1;
  }
  // 110xxxxx: two-byte sequence (remainder of almost all Latin-script
  // alphabets).
  if ((ch & 0xE0) == 0xC0) {
    return 2;
  }
  // 1110xxxx: three-byte sequence (rest of the Basic Multilingual Plane,
  // including most Chinese, Japanese and Korean characters).
  if ((ch & 0xF0) == 0xE0) {
    return 3;
  }
  // 11110xxx: four-byte sequence (other Unicode planes, e.g. emoji).
  if ((ch & 0xF8) == 0xF0) {
    return 4;
  }
  // Anything else is treated as a one-byte unit.
  return 1;
}
// Counts the UTF-8 characters in `str` by hopping from lead byte to lead byte.
int UTF8StringLength(const std::string& str) {
  int count = 0;
  size_t pos = 0;
  while (pos < str.length()) {
    pos += UTF8CharLength(str[pos]);
    ++count;
  }
  return count;
}
// Replaces the contents of `chars` with the UTF-8 characters of `str`, one
// string per character, in order.
void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars) {
  chars->clear();
  size_t pos = 0;
  while (pos < str.length()) {
    const int width = UTF8CharLength(str[pos]);
    chars->push_back(str.substr(pos, width));
    pos += width;
  }
}
// Returns `str` without leading WHITESPACE characters; "" when it contains
// nothing else.
std::string Ltrim(const std::string& str) {
  const size_t first = str.find_first_not_of(WHITESPACE);
  if (first == std::string::npos) {
    return "";
  }
  return str.substr(first);
}
// Returns `str` without trailing WHITESPACE characters; "" when it contains
// nothing else.
std::string Rtrim(const std::string& str) {
  const size_t last = str.find_last_not_of(WHITESPACE);
  if (last == std::string::npos) {
    return "";
  }
  return str.substr(0, last + 1);
}
// Strips WHITESPACE characters from both ends of `str`.
std::string Trim(const std::string& str) {
  const std::string without_leading = Ltrim(str);
  return Rtrim(without_leading);
}
// Splits `str` on every occurrence of `delim` and appends the pieces to
// `output`; existing elements of `output` are kept.
//
// Adjacent delimiters produce empty pieces, and a string with no delimiter
// (including the empty string) produces exactly one piece. An empty `delim`
// cannot split anything, so `str` is appended as a single piece.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output) {
  // An empty delimiter matches at every position without consuming input;
  // searching for it would never advance.
  if (delim.empty()) {
    output->emplace_back(str);
    return;
  }

  // Walk the string with an offset instead of repeatedly erasing its front.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = str.find(delim, start)) != std::string::npos) {
    output->emplace_back(str.substr(start, pos - start));
    start = pos + delim.length();
  }
  output->emplace_back(str.substr(start));
}
} // namespace wetext

+ 42
- 0
post_processor/wetext/utils/wetext_string.h

@ -0,0 +1,42 @@
// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef UTILS_WETEXT_STRING_H_
#define UTILS_WETEXT_STRING_H_
#include <string>
#include <vector>
namespace wetext {
// Characters treated as whitespace by the trim helpers below.
extern const char* WHITESPACE;
// Byte length (1-4) of the UTF-8 sequence introduced by lead byte `ch`.
int UTF8CharLength(char ch);
// Number of UTF-8 characters in `str`.
int UTF8StringLength(const std::string& str);
// Clears `chars`, then fills it with the UTF-8 characters of `str` in order.
void SplitUTF8StringToChars(const std::string& str,
std::vector<std::string>* chars);
// Remove WHITESPACE characters from the left end, the right end, or both.
std::string Ltrim(const std::string& str);
std::string Rtrim(const std::string& str);
std::string Trim(const std::string& str);
// Appends to `output` the pieces of `str` separated by `delim`.
void Split(const std::string& str, const std::string& delim,
std::vector<std::string>* output);
} // namespace wetext
#endif // UTILS_WETEXT_STRING_H_

Loading…
Cancel
Save