You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

96 lines
3.1 KiB

  1. // Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn
  2. // 2023 Jing Du (thuduj12@163.com)
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #include "post_processor.h"
  16. #include <sstream>
  17. #include <vector>
  18. #include "wetext_processor.h"
  19. #include "../../utils/wn_string.h"
  20. namespace wenet {
  21. void PostProcessor::InitITNResource(const std::string& tagger_path,
  22. const std::string& verbalizer_path) {
  23. auto itn_processor =
  24. std::make_shared<wetext::Processor>(tagger_path, verbalizer_path);
  25. itn_resource = itn_processor;
  26. }
  27. std::string PostProcessor::ProcessSpace(const std::string& str) {
  28. std::string result = str;
  29. // 1. remove ' ' if needed
  30. // only spaces between mandarin words need to be removed, please note that
  31. // if str contains '_', we assume that the decoding type must be
  32. // `CtcPrefixBeamSearch` and this branch will do nothing since str must be
  33. // obtained via "".join() (in function `AsrDecoder::UpdateResult()`)
  34. if (opts_.language_type == kMandarinEnglish && !str.empty()) {
  35. result.clear();
  36. // split str by ' '
  37. std::vector<std::string> words;
  38. std::stringstream ss(str);
  39. std::string tmp;
  40. while (ss >> tmp) {
  41. words.push_back(tmp);
  42. }
  43. // check english word
  44. bool is_englishword_prev = false;
  45. bool is_englishword_now = false;
  46. for (std::string& w : words) {
  47. is_englishword_now = CheckEnglishWord(w);
  48. if (is_englishword_prev && is_englishword_now) {
  49. result += (' ' + w);
  50. } else {
  51. result += (w);
  52. }
  53. is_englishword_prev = is_englishword_now;
  54. }
  55. }
  56. // 2. replace '_' with ' '
  57. // this should be done for all cases (both kMandarinEnglish and kIndoEuropean)
  58. result = ProcessBlank(result, opts_.lowercase);
  59. return result;
  60. }
  // Returns a copy of `str` with every occurrence of `sub` removed.
  //
  // Removal is repeated until no occurrence is left, so an occurrence that is
  // only formed once an inner one has been erased (e.g. "<un<unk>k>" with
  // sub == "<unk>") is removed as well.
  //
  // An empty `sub` matches at every position without ever shrinking the
  // string, so it is treated as "nothing to remove" and `str` is returned
  // unchanged instead of looping forever.
  std::string del_substr(const std::string& str, const std::string& sub) {
    std::string result = str;
    if (sub.empty()) {
      return result;
    }
    // After an erase at `pos`, a newly formed match can start at most
    // sub.size() - 1 characters before `pos`; everything earlier has already
    // been scanned and is match-free. Resuming there yields the same result as
    // rescanning from index 0, without the quadratic cost.
    const std::string::size_type overlap = sub.size() - 1;
    std::string::size_type pos = 0;
    while ((pos = result.find(sub, pos)) != std::string::npos) {
      result.erase(pos, sub.size());
      pos = pos > overlap ? pos - overlap : 0;
    }
    return result;
  }
  69. std::string PostProcessor::ProcessSymbols(const std::string& str) {
  70. std::string result = str;
  71. result = del_substr(result, "<unk>");
  72. result = del_substr(result, "<context>");
  73. result = del_substr(result, "</context>");
  74. return result;
  75. }
  76. std::string PostProcessor::Process(const std::string& str, bool finish) {
  77. std::string result;
  78. // remove symbols with "<>" first
  79. result = ProcessSymbols(str);
  80. result = ProcessSpace(result);
  81. // TODO(xcsong): do punctuation if finish == true
  82. if (finish == true && opts_.itn) {
  83. if (nullptr != itn_resource) {
  84. result = itn_resource->Normalize(result);
  85. }
  86. }
  87. return result;
  88. }
  89. } // namespace wenet