|
// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn
|
|
// 2023 Jing Du (thuduj12@163.com)
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "post_processor/post_processor.h"
|
|
#include <sstream>
|
|
#include <vector>
|
|
//#include "processor/wetext_processor.h"
|
|
#include "utils/string.h"
|
|
|
|
namespace wenet {
|
|
void PostProcessor::InitITNResource(const std::string& tagger_path,
|
|
const std::string& verbalizer_path) {
|
|
auto itn_processor =
|
|
std::make_shared<wetext::Processor>(tagger_path, verbalizer_path);
|
|
itn_resource = itn_processor;
|
|
}
|
|
|
|
std::string PostProcessor::ProcessSpace(const std::string& str) {
|
|
std::string result = str;
|
|
// 1. remove ' ' if needed
|
|
// only spaces between mandarin words need to be removed, please note that
|
|
// if str contains '_', we assume that the decoding type must be
|
|
// `CtcPrefixBeamSearch` and this branch will do nothing since str must be
|
|
// obtained via "".join() (in function `AsrDecoder::UpdateResult()`)
|
|
if (opts_.language_type == kMandarinEnglish && !str.empty()) {
|
|
result.clear();
|
|
// split str by ' '
|
|
std::vector<std::string> words;
|
|
std::stringstream ss(str);
|
|
std::string tmp;
|
|
while (ss >> tmp) {
|
|
words.push_back(tmp);
|
|
}
|
|
// check english word
|
|
bool is_englishword_prev = false;
|
|
bool is_englishword_now = false;
|
|
for (std::string& w : words) {
|
|
is_englishword_now = CheckEnglishWord(w);
|
|
if (is_englishword_prev && is_englishword_now) {
|
|
result += (' ' + w);
|
|
} else {
|
|
result += (w);
|
|
}
|
|
is_englishword_prev = is_englishword_now;
|
|
}
|
|
}
|
|
// 2. replace '_' with ' '
|
|
// this should be done for all cases (both kMandarinEnglish and kIndoEuropean)
|
|
result = ProcessBlank(result, opts_.lowercase);
|
|
return result;
|
|
}
|
|
|
|
// Returns a copy of `str` with every occurrence of `sub` removed.
//
// The search restarts from the beginning after each removal, so an occurrence
// that only appears once the pieces around a removed match are joined is
// removed as well (e.g. removing "<unk>" from "<un<unk>k>" yields "").
// An empty `sub` matches nothing and `str` is returned unchanged; without
// this guard find("") would return 0 forever and the loop would never end.
std::string del_substr(const std::string& str, const std::string& sub) {
  std::string result = str;
  if (sub.empty()) {
    return result;
  }
  // Keep the position in the string's own size_type: storing it in an int
  // narrows the value and only compares equal to npos by conversion accident.
  std::string::size_type pos = 0;
  while ((pos = result.find(sub)) != std::string::npos) {
    result.erase(pos, sub.size());
  }
  return result;
}
|
|
|
|
// Strips the special tokens "<unk>", "<context>" and "</context>" from `str`,
// in that order, and returns the cleaned string.
std::string PostProcessor::ProcessSymbols(const std::string& str) {
  const std::string without_unk = del_substr(str, "<unk>");
  const std::string without_open_tag = del_substr(without_unk, "<context>");
  return del_substr(without_open_tag, "</context>");
}
|
|
|
|
std::string PostProcessor::Process(const std::string& str, bool finish) {
|
|
std::string result;
|
|
// remove symbols with "<>" first
|
|
result = ProcessSymbols(str);
|
|
result = ProcessSpace(result);
|
|
// TODO(xcsong): do punctuation if finish == true
|
|
if (finish == true && opts_.itn) {
|
|
if (nullptr != itn_resource) {
|
|
result = itn_resource->Normalize(result);
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
} // namespace wenet
|