// bin/arpa2fst.cc
//
// Copyright 2009-2011 Gilles Boulianne.
//
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>

#include "lm/arpa-lm-compiler.h"
#include "util/kaldi-io.h"
#include "util/parse-options.h"

  23. int main(int argc, char* argv[]) {
  24. using namespace kaldi; // NOLINT
  25. try {
  26. const char* usage =
  27. "Convert an ARPA format language model into an FST\n"
  28. "Usage: arpa2fst [opts] <input-arpa> <output-fst>\n"
  29. " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table="
  30. "data/lang/words.txt lm/input.arpa G.fst\n\n"
  31. "Note: When called without switches, the output G.fst will contain\n"
  32. "an embedded symbol table. This is compatible with the way a previous\n"
  33. "version of arpa2fst worked.\n";
  34. ParseOptions po(usage);
  35. ArpaParseOptions options;
  36. options.Register(&po);
  37. // Option flags.
  38. std::string bos_symbol = "<s>";
  39. std::string eos_symbol = "</s>";
  40. std::string disambig_symbol;
  41. std::string read_syms_filename;
  42. std::string write_syms_filename;
  43. bool keep_symbols = false;
  44. bool ilabel_sort = true;
  45. po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol");
  46. po.Register("eos-symbol", &eos_symbol, "End of sentence symbol");
  47. po.Register("disambig-symbol", &disambig_symbol,
  48. "Disambiguator. If provided (e. g. #0), used on input side of "
  49. "backoff links, and <s> and </s> are replaced with epsilons");
  50. po.Register("read-symbol-table", &read_syms_filename,
  51. "Use existing symbol table");
  52. po.Register("write-symbol-table", &write_syms_filename,
  53. "Write generated symbol table to a file");
  54. po.Register("keep-symbols", &keep_symbols,
  55. "Store symbol table with FST. Symbols always saved to FST if "
  56. "symbol tables are neither read or written (otherwise symbols "
  57. "would be lost entirely)");
  58. po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST");
  59. po.Read(argc, argv);
  60. if (po.NumArgs() != 1 && po.NumArgs() != 2) {
  61. po.PrintUsage();
  62. exit(1);
  63. }
  64. std::string arpa_rxfilename = po.GetArg(1),
  65. fst_wxfilename = po.GetOptArg(2);
  66. int64 disambig_symbol_id = 0;
  67. fst::SymbolTable* symbols;
  68. if (!read_syms_filename.empty()) {
  69. // Use existing symbols. Required symbols must be in the table.
  70. kaldi::Input kisym(read_syms_filename);
  71. symbols = fst::SymbolTable::ReadText(
  72. kisym.Stream(), PrintableWxfilename(read_syms_filename));
  73. if (symbols == NULL)
  74. KALDI_ERR << "Could not read symbol table from file "
  75. << read_syms_filename;
  76. options.oov_handling = ArpaParseOptions::kSkipNGram;
  77. if (!disambig_symbol.empty()) {
  78. disambig_symbol_id = symbols->Find(disambig_symbol);
  79. if (disambig_symbol_id == -1) // fst::kNoSymbol
  80. KALDI_ERR << "Symbol table " << read_syms_filename
  81. << " has no symbol for " << disambig_symbol;
  82. }
  83. } else {
  84. // Create a new symbol table and populate it from ARPA file.
  85. symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename));
  86. options.oov_handling = ArpaParseOptions::kAddToSymbols;
  87. symbols->AddSymbol("<eps>", 0);
  88. if (!disambig_symbol.empty()) {
  89. disambig_symbol_id = symbols->AddSymbol(disambig_symbol);
  90. }
  91. }
  92. // Add or use existing BOS and EOS.
  93. options.bos_symbol = symbols->AddSymbol(bos_symbol);
  94. options.eos_symbol = symbols->AddSymbol(eos_symbol);
  95. // If producing new (not reading existing) symbols and not saving them,
  96. // need to keep symbols with FST, otherwise they would be lost.
  97. if (read_syms_filename.empty() && write_syms_filename.empty())
  98. keep_symbols = true;
  99. // Actually compile LM.
  100. KALDI_ASSERT(symbols != NULL);
  101. ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols);
  102. {
  103. Input ki(arpa_rxfilename);
  104. lm_compiler.Read(ki.Stream());
  105. }
  106. // Sort the FST in-place if requested by options.
  107. if (ilabel_sort) {
  108. fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare());
  109. }
  110. // Write symbols if requested.
  111. if (!write_syms_filename.empty()) {
  112. kaldi::Output kosym(write_syms_filename, false);
  113. symbols->WriteText(kosym.Stream());
  114. }
  115. // Write LM FST.
  116. bool write_binary = true, write_header = false;
  117. kaldi::Output kofst(fst_wxfilename, write_binary, write_header);
  118. fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename));
  119. wopts.write_isymbols = wopts.write_osymbols = keep_symbols;
  120. lm_compiler.Fst().Write(kofst.Stream(), wopts);
  121. delete symbols;
  122. } catch (const std::exception& e) {
  123. std::cerr << e.what();
  124. return -1;
  125. }
  126. }