You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

498 lines
17 KiB

  1. // lat/kaldi-lattice.cc
  2. // Copyright 2009-2011 Microsoft Corporation
  3. // 2013 Johns Hopkins University (author: Daniel Povey)
  4. // See ../../COPYING for clarification regarding multiple authors
  5. //
  6. // Licensed under the Apache License, Version 2.0 (the "License");
  7. // you may not use this file except in compliance with the License.
  8. // You may obtain a copy of the License at
  9. //
  10. // http://www.apache.org/licenses/LICENSE-2.0
  11. //
  12. // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  13. // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  14. // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  15. // MERCHANTABLITY OR NON-INFRINGEMENT.
  16. // See the Apache 2 License for the specific language governing permissions and
  17. // limitations under the License.
  18. #include "lat/kaldi-lattice.h"
  19. #include "fst/script/print-impl.h"
  20. namespace kaldi {
  21. /// Converts lattice types if necessary, deleting its input.
  22. template <class OrigWeightType>
  23. CompactLattice* ConvertToCompactLattice(fst::VectorFst<OrigWeightType>* ifst) {
  24. if (!ifst) return NULL;
  25. CompactLattice* ofst = new CompactLattice();
  26. ConvertLattice(*ifst, ofst);
  27. delete ifst;
  28. return ofst;
  29. }
  30. // This overrides the template if there is no type conversion going on
  31. // (for efficiency).
  32. template <>
  33. CompactLattice* ConvertToCompactLattice(CompactLattice* ifst) {
  34. return ifst;
  35. }
  36. /// Converts lattice types if necessary, deleting its input.
  37. template <class OrigWeightType>
  38. Lattice* ConvertToLattice(fst::VectorFst<OrigWeightType>* ifst) {
  39. if (!ifst) return NULL;
  40. Lattice* ofst = new Lattice();
  41. ConvertLattice(*ifst, ofst);
  42. delete ifst;
  43. return ofst;
  44. }
  45. // This overrides the template if there is no type conversion going on
  46. // (for efficiency).
  47. template <>
  48. Lattice* ConvertToLattice(Lattice* ifst) {
  49. return ifst;
  50. }
  51. bool WriteCompactLattice(std::ostream& os, bool binary,
  52. const CompactLattice& t) {
  53. if (binary) {
  54. fst::FstWriteOptions opts;
  55. // Leave all the options default. Normally these lattices wouldn't have any
  56. // osymbols/isymbols so no point directing it not to write them (who knows
  57. // what we'd want to if we had them).
  58. return t.Write(os, opts);
  59. } else {
  60. // Text-mode output. Note: we expect that t.InputSymbols() and
  61. // t.OutputSymbols() would always return NULL. The corresponding input
  62. // routine would not work if the FST actually had symbols attached.
  63. // Write a newline after the key, so the first line of the FST appears
  64. // on its own line.
  65. os << '\n';
  66. bool acceptor = true, write_one = false;
  67. fst::FstPrinter<CompactLatticeArc> printer(t, t.InputSymbols(),
  68. t.OutputSymbols(), NULL,
  69. acceptor, write_one, "\t");
  70. printer.Print(&os, "<unknown>");
  71. if (os.fail()) KALDI_WARN << "Stream failure detected.";
  72. // Write another newline as a terminating character. The read routine will
  73. // detect this [this is a Kaldi mechanism, not somethig in the original
  74. // OpenFst code].
  75. os << '\n';
  76. return os.good();
  77. }
  78. }
  79. /// LatticeReader provides (static) functions for reading both Lattice
  80. /// and CompactLattice, in text form.
  81. class LatticeReader {
  82. typedef LatticeArc Arc;
  83. typedef LatticeWeight Weight;
  84. typedef CompactLatticeArc CArc;
  85. typedef CompactLatticeWeight CWeight;
  86. typedef Arc::Label Label;
  87. typedef Arc::StateId StateId;
  88. public:
  89. // everything is static in this class.
  90. /** This function reads from the FST text format; it does not know in advance
  91. whether it's a Lattice or CompactLattice in the stream so it tries to
  92. read both formats until it becomes clear which is the correct one.
  93. */
  94. static std::pair<Lattice*, CompactLattice*> ReadText(std::istream& is) {
  95. typedef std::pair<Lattice*, CompactLattice*> PairT;
  96. using std::string;
  97. using std::vector;
  98. Lattice* fst = new Lattice();
  99. CompactLattice* cfst = new CompactLattice();
  100. string line;
  101. size_t nline = 0;
  102. string separator = FLAGS_fst_field_separator + "\r\n";
  103. while (std::getline(is, line)) {
  104. nline++;
  105. vector<string> col;
  106. // on Windows we'll write in text and read in binary mode.
  107. SplitStringToVector(line, separator.c_str(), true, &col);
  108. if (col.size() == 0) break; // Empty line is a signal to stop, in our
  109. // archive format.
  110. if (col.size() > 5) {
  111. KALDI_WARN << "Reading lattice: bad line in FST: " << line;
  112. delete fst;
  113. delete cfst;
  114. return PairT(static_cast<Lattice*>(NULL),
  115. static_cast<CompactLattice*>(NULL));
  116. }
  117. StateId s;
  118. if (!ConvertStringToInteger(col[0], &s)) {
  119. KALDI_WARN << "FstCompiler: bad line in FST: " << line;
  120. delete fst;
  121. delete cfst;
  122. return PairT(static_cast<Lattice*>(NULL),
  123. static_cast<CompactLattice*>(NULL));
  124. }
  125. if (fst)
  126. while (s >= fst->NumStates()) fst->AddState();
  127. if (cfst)
  128. while (s >= cfst->NumStates()) cfst->AddState();
  129. if (nline == 1) {
  130. if (fst) fst->SetStart(s);
  131. if (cfst) cfst->SetStart(s);
  132. }
  133. if (fst) { // we still have fst; try to read that arc.
  134. bool ok = true;
  135. Arc arc;
  136. Weight w;
  137. StateId d = s;
  138. switch (col.size()) {
  139. case 1:
  140. fst->SetFinal(s, Weight::One());
  141. break;
  142. case 2:
  143. if (!StrToWeight(col[1], true, &w))
  144. ok = false;
  145. else
  146. fst->SetFinal(s, w);
  147. break;
  148. case 3: // 3 columns not ok for Lattice format; it's not an acceptor.
  149. ok = false;
  150. break;
  151. case 4:
  152. ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
  153. ConvertStringToInteger(col[2], &arc.ilabel) &&
  154. ConvertStringToInteger(col[3], &arc.olabel);
  155. if (ok) {
  156. d = arc.nextstate;
  157. arc.weight = Weight::One();
  158. fst->AddArc(s, arc);
  159. }
  160. break;
  161. case 5:
  162. ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
  163. ConvertStringToInteger(col[2], &arc.ilabel) &&
  164. ConvertStringToInteger(col[3], &arc.olabel) &&
  165. StrToWeight(col[4], false, &arc.weight);
  166. if (ok) {
  167. d = arc.nextstate;
  168. fst->AddArc(s, arc);
  169. }
  170. break;
  171. default:
  172. ok = false;
  173. }
  174. while (d >= fst->NumStates()) fst->AddState();
  175. if (!ok) {
  176. delete fst;
  177. fst = NULL;
  178. }
  179. }
  180. if (cfst) {
  181. bool ok = true;
  182. CArc arc;
  183. CWeight w;
  184. StateId d = s;
  185. switch (col.size()) {
  186. case 1:
  187. cfst->SetFinal(s, CWeight::One());
  188. break;
  189. case 2:
  190. if (!StrToCWeight(col[1], true, &w))
  191. ok = false;
  192. else
  193. cfst->SetFinal(s, w);
  194. break;
  195. case 3: // compact-lattice is acceptor format: state, next-state,
  196. // label.
  197. ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
  198. ConvertStringToInteger(col[2], &arc.ilabel);
  199. if (ok) {
  200. d = arc.nextstate;
  201. arc.olabel = arc.ilabel;
  202. arc.weight = CWeight::One();
  203. cfst->AddArc(s, arc);
  204. }
  205. break;
  206. case 4:
  207. ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
  208. ConvertStringToInteger(col[2], &arc.ilabel) &&
  209. StrToCWeight(col[3], false, &arc.weight);
  210. if (ok) {
  211. d = arc.nextstate;
  212. arc.olabel = arc.ilabel;
  213. cfst->AddArc(s, arc);
  214. }
  215. break;
  216. case 5:
  217. default:
  218. ok = false;
  219. }
  220. while (d >= cfst->NumStates()) cfst->AddState();
  221. if (!ok) {
  222. delete cfst;
  223. cfst = NULL;
  224. }
  225. }
  226. if (!fst && !cfst) {
  227. KALDI_WARN << "Bad line in lattice text format: " << line;
  228. // read until we get an empty line, so at least we
  229. // have a chance to read the next one (although this might
  230. // be a bit futile since the calling code will get unhappy
  231. // about failing to read this one.
  232. while (std::getline(is, line)) {
  233. SplitStringToVector(line, separator.c_str(), true, &col);
  234. if (col.empty()) break;
  235. }
  236. return PairT(static_cast<Lattice*>(NULL),
  237. static_cast<CompactLattice*>(NULL));
  238. }
  239. }
  240. return PairT(fst, cfst);
  241. }
  242. static bool StrToWeight(const std::string& s, bool allow_zero, Weight* w) {
  243. std::istringstream strm(s);
  244. strm >> *w;
  245. if (!strm || (!allow_zero && *w == Weight::Zero())) {
  246. return false;
  247. }
  248. return true;
  249. }
  250. static bool StrToCWeight(const std::string& s, bool allow_zero, CWeight* w) {
  251. std::istringstream strm(s);
  252. strm >> *w;
  253. if (!strm || (!allow_zero && *w == CWeight::Zero())) {
  254. return false;
  255. }
  256. return true;
  257. }
  258. };
  259. CompactLattice* ReadCompactLatticeText(std::istream& is) {
  260. std::pair<Lattice*, CompactLattice*> lat_pair = LatticeReader::ReadText(is);
  261. if (lat_pair.second != NULL) {
  262. delete lat_pair.first;
  263. return lat_pair.second;
  264. } else if (lat_pair.first != NULL) {
  265. // note: ConvertToCompactLattice frees its input.
  266. return ConvertToCompactLattice(lat_pair.first);
  267. } else {
  268. return NULL;
  269. }
  270. }
  271. Lattice* ReadLatticeText(std::istream& is) {
  272. std::pair<Lattice*, CompactLattice*> lat_pair = LatticeReader::ReadText(is);
  273. if (lat_pair.first != NULL) {
  274. delete lat_pair.second;
  275. return lat_pair.first;
  276. } else if (lat_pair.second != NULL) {
  277. // note: ConvertToLattice frees its input.
  278. return ConvertToLattice(lat_pair.second);
  279. } else {
  280. return NULL;
  281. }
  282. }
  283. bool ReadCompactLattice(std::istream& is, bool binary, CompactLattice** clat) {
  284. KALDI_ASSERT(*clat == NULL);
  285. if (binary) {
  286. fst::FstHeader hdr;
  287. if (!hdr.Read(is, "<unknown>")) {
  288. KALDI_WARN << "Reading compact lattice: error reading FST header.";
  289. return false;
  290. }
  291. if (hdr.FstType() != "vector") {
  292. KALDI_WARN << "Reading compact lattice: unsupported FST type: "
  293. << hdr.FstType();
  294. return false;
  295. }
  296. fst::FstReadOptions ropts("<unspecified>", &hdr);
  297. typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<float>, int32>
  298. T1;
  299. typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<double>, int32>
  300. T2;
  301. typedef fst::LatticeWeightTpl<float> T3;
  302. typedef fst::LatticeWeightTpl<double> T4;
  303. typedef fst::VectorFst<fst::ArcTpl<T1> > F1;
  304. typedef fst::VectorFst<fst::ArcTpl<T2> > F2;
  305. typedef fst::VectorFst<fst::ArcTpl<T3> > F3;
  306. typedef fst::VectorFst<fst::ArcTpl<T4> > F4;
  307. CompactLattice* ans = NULL;
  308. if (hdr.ArcType() == T1::Type()) {
  309. ans = ConvertToCompactLattice(F1::Read(is, ropts));
  310. } else if (hdr.ArcType() == T2::Type()) {
  311. ans = ConvertToCompactLattice(F2::Read(is, ropts));
  312. } else if (hdr.ArcType() == T3::Type()) {
  313. ans = ConvertToCompactLattice(F3::Read(is, ropts));
  314. } else if (hdr.ArcType() == T4::Type()) {
  315. ans = ConvertToCompactLattice(F4::Read(is, ropts));
  316. } else {
  317. KALDI_WARN << "FST with arc type " << hdr.ArcType()
  318. << " cannot be converted to CompactLattice.\n";
  319. return false;
  320. }
  321. if (ans == NULL) {
  322. KALDI_WARN << "Error reading compact lattice (after reading header).";
  323. return false;
  324. }
  325. *clat = ans;
  326. return true;
  327. } else {
  328. // The next line would normally consume the \r on Windows, plus any
  329. // extra spaces that might have got in there somehow.
  330. while (std::isspace(is.peek()) && is.peek() != '\n') is.get();
  331. if (is.peek() == '\n')
  332. is.get(); // consume the newline.
  333. else { // saw spaces but no newline.. this is not expected.
  334. KALDI_WARN << "Reading compact lattice: unexpected sequence of spaces "
  335. << " at file position " << is.tellg();
  336. return false;
  337. }
  338. *clat = ReadCompactLatticeText(is); // that routine will warn on error.
  339. return (*clat != NULL);
  340. }
  341. }
  342. bool CompactLatticeHolder::Read(std::istream& is) {
  343. Clear(); // in case anything currently stored.
  344. int c = is.peek();
  345. if (c == -1) {
  346. KALDI_WARN << "End of stream detected reading CompactLattice.";
  347. return false;
  348. } else if (isspace(c)) { // The text form of the lattice begins
  349. // with space (normally, '\n'), so this means it's text (the binary form
  350. // cannot begin with space because it starts with the FST Type() which is
  351. // not space).
  352. return ReadCompactLattice(is, false, &t_);
  353. } else if (c != 214) { // 214 is first char of FST magic number,
  354. // on little-endian machines which is all we support (\326 octal)
  355. KALDI_WARN << "Reading compact lattice: does not appear to be an FST "
  356. << " [non-space but no magic number detected], file pos is "
  357. << is.tellg();
  358. return false;
  359. } else {
  360. return ReadCompactLattice(is, true, &t_);
  361. }
  362. }
  363. bool WriteLattice(std::ostream& os, bool binary, const Lattice& t) {
  364. if (binary) {
  365. fst::FstWriteOptions opts;
  366. // Leave all the options default. Normally these lattices wouldn't have any
  367. // osymbols/isymbols so no point directing it not to write them (who knows
  368. // what we'd want to do if we had them).
  369. return t.Write(os, opts);
  370. } else {
  371. // Text-mode output. Note: we expect that t.InputSymbols() and
  372. // t.OutputSymbols() would always return NULL. The corresponding input
  373. // routine would not work if the FST actually had symbols attached.
  374. // Write a newline after the key, so the first line of the FST appears
  375. // on its own line.
  376. os << '\n';
  377. bool acceptor = false, write_one = false;
  378. fst::FstPrinter<LatticeArc> printer(t, t.InputSymbols(), t.OutputSymbols(),
  379. NULL, acceptor, write_one, "\t");
  380. printer.Print(&os, "<unknown>");
  381. if (os.fail()) KALDI_WARN << "Stream failure detected.";
  382. // Write another newline as a terminating character. The read routine will
  383. // detect this [this is a Kaldi mechanism, not somethig in the original
  384. // OpenFst code].
  385. os << '\n';
  386. return os.good();
  387. }
  388. }
  389. bool ReadLattice(std::istream& is, bool binary, Lattice** lat) {
  390. KALDI_ASSERT(*lat == NULL);
  391. if (binary) {
  392. fst::FstHeader hdr;
  393. if (!hdr.Read(is, "<unknown>")) {
  394. KALDI_WARN << "Reading lattice: error reading FST header.";
  395. return false;
  396. }
  397. if (hdr.FstType() != "vector") {
  398. KALDI_WARN << "Reading lattice: unsupported FST type: " << hdr.FstType();
  399. return false;
  400. }
  401. fst::FstReadOptions ropts("<unspecified>", &hdr);
  402. typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<float>, int32>
  403. T1;
  404. typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<double>, int32>
  405. T2;
  406. typedef fst::LatticeWeightTpl<float> T3;
  407. typedef fst::LatticeWeightTpl<double> T4;
  408. typedef fst::VectorFst<fst::ArcTpl<T1> > F1;
  409. typedef fst::VectorFst<fst::ArcTpl<T2> > F2;
  410. typedef fst::VectorFst<fst::ArcTpl<T3> > F3;
  411. typedef fst::VectorFst<fst::ArcTpl<T4> > F4;
  412. Lattice* ans = NULL;
  413. if (hdr.ArcType() == T1::Type()) {
  414. ans = ConvertToLattice(F1::Read(is, ropts));
  415. } else if (hdr.ArcType() == T2::Type()) {
  416. ans = ConvertToLattice(F2::Read(is, ropts));
  417. } else if (hdr.ArcType() == T3::Type()) {
  418. ans = ConvertToLattice(F3::Read(is, ropts));
  419. } else if (hdr.ArcType() == T4::Type()) {
  420. ans = ConvertToLattice(F4::Read(is, ropts));
  421. } else {
  422. KALDI_WARN << "FST with arc type " << hdr.ArcType()
  423. << " cannot be converted to Lattice.\n";
  424. return false;
  425. }
  426. if (ans == NULL) {
  427. KALDI_WARN << "Error reading lattice (after reading header).";
  428. return false;
  429. }
  430. *lat = ans;
  431. return true;
  432. } else {
  433. // The next line would normally consume the \r on Windows, plus any
  434. // extra spaces that might have got in there somehow.
  435. while (std::isspace(is.peek()) && is.peek() != '\n') is.get();
  436. if (is.peek() == '\n')
  437. is.get(); // consume the newline.
  438. else { // saw spaces but no newline.. this is not expected.
  439. KALDI_WARN << "Reading compact lattice: unexpected sequence of spaces "
  440. << " at file position " << is.tellg();
  441. return false;
  442. }
  443. *lat = ReadLatticeText(is); // that routine will warn on error.
  444. return (*lat != NULL);
  445. }
  446. }
  447. /* Since we don't write the binary headers for this type of holder,
  448. we use a different method to work out whether we're in binary mode.
  449. */
  450. bool LatticeHolder::Read(std::istream& is) {
  451. Clear(); // in case anything currently stored.
  452. int c = is.peek();
  453. if (c == -1) {
  454. KALDI_WARN << "End of stream detected reading Lattice.";
  455. return false;
  456. } else if (isspace(c)) { // The text form of the lattice begins
  457. // with space (normally, '\n'), so this means it's text (the binary form
  458. // cannot begin with space because it starts with the FST Type() which is
  459. // not space).
  460. return ReadLattice(is, false, &t_);
  461. } else if (c != 214) { // 214 is first char of FST magic number,
  462. // on little-endian machines which is all we support (\326 octal)
  463. KALDI_WARN << "Reading compact lattice: does not appear to be an FST "
  464. << " [non-space but no magic number detected], file pos is "
  465. << is.tellg();
  466. return false;
  467. } else {
  468. return ReadLattice(is, true, &t_);
  469. }
  470. }
  471. } // end namespace kaldi