// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn // 2023 Jing Du (thuduj12@163.com) // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License #include "post_processor.h" #include #include #include "wetext_processor.h" #include "../../utils/wn_string.h" namespace wenet { void PostProcessor::InitITNResource(const std::string& tagger_path, const std::string& verbalizer_path) { auto itn_processor = std::make_shared(tagger_path, verbalizer_path); itn_resource = itn_processor; } std::string PostProcessor::ProcessSpace(const std::string& str) { std::string result = str; // 1. remove ' ' if needed // only spaces between mandarin words need to be removed, please note that // if str contains '_', we assume that the decoding type must be // `CtcPrefixBeamSearch` and this branch will do nothing since str must be // obtained via "".join() (in function `AsrDecoder::UpdateResult()`) if (opts_.language_type == kMandarinEnglish && !str.empty()) { result.clear(); // split str by ' ' std::vector words; std::stringstream ss(str); std::string tmp; while (ss >> tmp) { words.push_back(tmp); } // check english word bool is_englishword_prev = false; bool is_englishword_now = false; for (std::string& w : words) { is_englishword_now = CheckEnglishWord(w); if (is_englishword_prev && is_englishword_now) { result += (' ' + w); } else { result += (w); } is_englishword_prev = is_englishword_now; } } // 2. replace '_' with ' ' // this should be done for all cases (both kMandarinEnglish and kIndoEuropean) result = ProcessBlank(result, opts_.lowercase); return result; } std::string del_substr(const std::string& str, const std::string& sub) { std::string result = str; int pos = 0; while (std::string::npos != (pos = result.find(sub))) { result.erase(pos, sub.size()); } return result; } std::string PostProcessor::ProcessSymbols(const std::string& str) { std::string result = str; result = del_substr(result, ""); result = del_substr(result, ""); result = del_substr(result, ""); return result; } std::string PostProcessor::Process(const std::string& str, bool finish) { std::string result; // remove symbols with "<>" first result = ProcessSymbols(str); result = ProcessSpace(result); // TODO(xcsong): do punctuation if finish == true if (finish == true && opts_.itn) { if (nullptr != itn_resource) { result = itn_resource->Normalize(result); } } return result; } } // namespace wenet