You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

89 lines
2.9 KiB

  1. // Copyright (c) 2021 Xingchen Song sxc19@mails.tsinghua.edu.cn
  2. // 2023 Jing Du (thuduj12@163.com)
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #include "post_processor/post_processor.h"
  16. #include <sstream>
  17. #include <vector>
  18. #include "utils/string.h"
  19. namespace wenet {
  20. std::string PostProcessor::ProcessSpace(const std::string& str) {
  21. std::string result = str;
  22. // 1. remove ' ' if needed
  23. // only spaces between mandarin words need to be removed, please note that
  24. // if str contains '_', we assume that the decoding type must be
  25. // `CtcPrefixBeamSearch` and this branch will do nothing since str must be
  26. // obtained via "".join() (in function `AsrDecoder::UpdateResult()`)
  27. if (opts_.language_type == kMandarinEnglish && !str.empty()) {
  28. result.clear();
  29. // split str by ' '
  30. std::vector<std::string> words;
  31. std::stringstream ss(str);
  32. std::string tmp;
  33. while (ss >> tmp) {
  34. words.push_back(tmp);
  35. }
  36. // check english word
  37. bool is_englishword_prev = false;
  38. bool is_englishword_now = false;
  39. for (std::string& w : words) {
  40. is_englishword_now = CheckEnglishWord(w);
  41. if (is_englishword_prev && is_englishword_now) {
  42. result += (' ' + w);
  43. } else {
  44. result += (w);
  45. }
  46. is_englishword_prev = is_englishword_now;
  47. }
  48. }
  49. // 2. replace '_' with ' '
  50. // this should be done for all cases (both kMandarinEnglish and kIndoEuropean)
  51. result = ProcessBlank(result, opts_.lowercase);
  52. return result;
  53. }
  // Returns a copy of `str` with every occurrence of `sub` removed.
  //
  // Removal is repeated until no occurrence is left, so a match that only
  // comes into existence once an earlier removal has joined its two halves
  // (for example "<un<unk>k>" with sub == "<unk>") is removed as well.
  // An empty `sub` leaves `str` unchanged.
  std::string del_substr(const std::string& str, const std::string& sub) {
    std::string result = str;
    // An empty pattern is found at offset 0 every time and erasing it removes
    // nothing, so looping on it would never terminate.
    if (sub.empty()) {
      return result;
    }
    // Largest distance before an erase point at which a new match can start.
    const std::string::size_type reach = sub.size() - 1;
    std::string::size_type pos = result.find(sub);
    while (pos != std::string::npos) {
      result.erase(pos, sub.size());
      // The text before `pos` contained no match, so a newly formed match has
      // to span the erase point. Resuming `reach` characters earlier therefore
      // finds the same occurrence as a rescan from offset 0 would.
      const std::string::size_type resume = pos > reach ? pos - reach : 0;
      pos = result.find(sub, resume);
    }
    return result;
  }
  62. std::string PostProcessor::ProcessSymbols(const std::string& str) {
  63. std::string result = str;
  64. result = del_substr(result, "<unk>");
  65. result = del_substr(result, "<context>");
  66. result = del_substr(result, "</context>");
  67. return result;
  68. }
  69. std::string PostProcessor::Process(const std::string& str, bool finish) {
  70. std::string result;
  71. // remove symbols with "<>" first
  72. result = ProcessSymbols(str);
  73. result = ProcessSpace(result);
  74. // TODO(xcsong): do punctuation if finish == true
  75. // if (finish == true && opts_.itn) {
  76. // if (nullptr != itn_resource) {
  77. // result = itn_resource->Normalize(result);
  78. // }
  79. // }
  80. return result;
  81. }
  82. } // namespace wenet