You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

92 lines
3.6 KiB

  1. // Copyright 2005-2024 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the 'License');
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an 'AS IS' BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. //
  15. // See www.openfst.org for extensive documentation on this weighted
  16. // finite-state transducer library.
  17. #ifndef FST_SYMBOL_TABLE_OPS_H_
  18. #define FST_SYMBOL_TABLE_OPS_H_
  19. #include <cstdint>
  20. #include <memory>
  21. #include <string>
  22. #include <vector>
  23. #include <fst/fst.h>
  24. #include <fst/symbol-table.h>
  25. #include <unordered_set>
  26. namespace fst {
  27. // Returns a minimal symbol table containing only symbols referenced by the
  28. // passed fst. Symbols preserve their original numbering, so fst does not
  29. // require relabeling.
  30. template <class Arc>
  31. SymbolTable *PruneSymbolTable(const Fst<Arc> &fst, const SymbolTable &syms,
  32. bool input) {
  33. std::unordered_set<typename Arc::Label> seen;
  34. seen.insert(0); // Always keep epsilon.
  35. for (StateIterator<Fst<Arc>> siter(fst); !siter.Done(); siter.Next()) {
  36. for (ArcIterator<Fst<Arc>> aiter(fst, siter.Value()); !aiter.Done();
  37. aiter.Next()) {
  38. const auto sym = (input) ? aiter.Value().ilabel : aiter.Value().olabel;
  39. seen.insert(sym);
  40. }
  41. }
  42. auto pruned = std::make_unique<SymbolTable>(syms.Name() + "_pruned");
  43. for (const auto &stitem : syms) {
  44. const auto label = stitem.Label();
  45. if (seen.count(label)) pruned->AddSymbol(stitem.Symbol(), label);
  46. }
  47. return pruned.release();
  48. }
  49. // Relabels a symbol table to make it a contiguous mapping.
  50. SymbolTable *CompactSymbolTable(const SymbolTable &syms);
  51. // Merges two SymbolTables, all symbols from left will be merged into right
  52. // with the same IDs. Symbols in right that have conflicting IDs with those
  53. // in left will be assigned to value assigned from the left SymbolTable.
  54. // The returned symbol table will never modify symbol assignments from the left
  55. // side, but may do so on the right. If right_relabel_output is non-null, it
  56. // will be assigned true if the symbols from the right table needed to be
  57. // reassigned.
  58. //
  59. // A potential use case is to compose two FSTs that have different symbol
  60. // tables. You can reconcile them in the following way:
  61. //
  62. // Fst<Arc> a, b;
  63. // bool relabel;
  64. // std::unique_ptr<SymbolTable> bnew(MergeSymbolTable(a.OutputSymbols(),
  65. // b.InputSymbols(), &relabel);
  66. // if (relabel) Relabel(b, bnew.get(), nullptr);
  67. // b.SetInputSymbols(bnew);
  68. SymbolTable *MergeSymbolTable(const SymbolTable &left, const SymbolTable &right,
  69. bool *right_relabel_output = nullptr);
  70. // Read the symbol table from any Fst::Read()able file, without loading the
  71. // corresponding FST. Returns nullptr if the FST does not contain a symbol
  72. // table or the symbol table cannot be read.
  73. SymbolTable * FstReadSymbols(const std::string &source,
  74. bool input);
  75. // Adds a contiguous range of symbols to a symbol table using a simple prefix
  76. // for the string, returning false if the inserted symbol string clashes with
  77. // any currently present.
  78. bool AddAuxiliarySymbols(const std::string &prefix, int64_t start_label,
  79. int64_t nlabels, SymbolTable *syms);
  80. } // namespace fst
  81. #endif // FST_SYMBOL_TABLE_OPS_H_