You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

96 lines
3.1 KiB

// Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn
// 2023 Jing Du (thuduj12@163.com)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "post_processor.h"
#include <sstream>
#include <vector>
#include "processor/wetext_processor.h"
#include "utils/string.h"
namespace wenet {
// Builds a wetext::Processor from the given tagger and verbalizer paths and
// stores it in `itn_resource`, replacing whatever was held there before.
void PostProcessor::InitITNResource(const std::string& tagger_path,
                                    const std::string& verbalizer_path) {
  itn_resource =
      std::make_shared<wetext::Processor>(tagger_path, verbalizer_path);
}
// Normalizes the whitespace of a decoded string.
//
// Step 1 (only when the language type is kMandarinEnglish and `str` is not
// empty): `str` is split on whitespace and the pieces are glued back
// together. A single ' ' is kept only between two consecutive pieces that
// CheckEnglishWord() both accepts; every other boundary is joined directly.
// Per the original author's note, a string containing '_' is assumed to come
// from `CtcPrefixBeamSearch` (built via "".join() in
// `AsrDecoder::UpdateResult()`), in which case this step is expected to have
// nothing to remove.
//
// Step 2 (always): the text is handed to ProcessBlank() together with
// `opts_.lowercase`; per the original comment this replaces '_' with ' ' and
// applies to both kMandarinEnglish and kIndoEuropean.
std::string PostProcessor::ProcessSpace(const std::string& str) {
  std::string joined;
  if (opts_.language_type != kMandarinEnglish || str.empty()) {
    // Nothing to strip: pass the input through to step 2 untouched.
    joined = str;
  } else {
    std::istringstream tokenizer(str);
    std::string token;
    bool prev_was_english = false;
    while (tokenizer >> token) {
      const bool cur_is_english = CheckEnglishWord(token);
      // A separating space survives only between two English words.
      if (prev_was_english && cur_is_english) {
        joined += ' ';
      }
      joined += token;
      prev_was_english = cur_is_english;
    }
  }
  return ProcessBlank(joined, opts_.lowercase);
}
// Returns a copy of `str` with every occurrence of `sub` removed.
//
// Removal is repeated until no occurrence is left, so a match that only comes
// into existence after an earlier removal joins its two neighbours (e.g.
// "<u<unk>nk>" with sub "<unk>") is removed as well.
//
// An empty `sub` removes nothing and returns `str` unchanged; without this
// guard find("") would match at index 0 forever.
std::string del_substr(const std::string& str, const std::string& sub) {
  std::string result = str;
  if (sub.empty()) {
    return result;
  }
  // A newly formed match must straddle the erase point, so it can start at
  // most `overlap` characters before it; nothing earlier needs rescanning.
  const std::string::size_type overlap = sub.size() - 1;
  std::string::size_type pos = 0;
  while ((pos = result.find(sub, pos)) != std::string::npos) {
    result.erase(pos, sub.size());
    pos = (pos >= overlap) ? pos - overlap : 0;
  }
  return result;
}
// Strips the special marker tokens "<unk>", "<context>" and "</context>"
// from `str`, in that order, and returns the cleaned text.
std::string PostProcessor::ProcessSymbols(const std::string& str) {
  static const char* const kMarkers[] = {"<unk>", "<context>", "</context>"};
  std::string cleaned = str;
  for (const char* marker : kMarkers) {
    cleaned = del_substr(cleaned, marker);
  }
  return cleaned;
}
// Runs the full post-processing pipeline on a decoded string:
//   1. ProcessSymbols() drops the "<...>" marker tokens,
//   2. ProcessSpace() normalizes spacing,
//   3. when `finish` is true, `opts_.itn` is set and an ITN resource has been
//      initialized, the text is passed through itn_resource->Normalize().
// Returns the processed text.
std::string PostProcessor::Process(const std::string& str, bool finish) {
  // Markers with "<>" go first so they do not interfere with spacing.
  std::string output = ProcessSpace(ProcessSymbols(str));
  // TODO(xcsong): do punctuation if finish == true
  const bool run_itn = finish && opts_.itn && itn_resource != nullptr;
  if (run_itn) {
    output = itn_resource->Normalize(output);
  }
  return output;
}
} // namespace wenet