You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

511 lines
24 KiB

  1. // lat/lattice-functions.h
  2. // Copyright 2009-2012 Saarland University (author: Arnab Ghoshal)
  3. // 2012-2013 Johns Hopkins University (Author: Daniel Povey);
  4. // Bagher BabaAli
  5. // 2014 Guoguo Chen
  6. // See ../../COPYING for clarification regarding multiple authors
  7. //
  8. // Licensed under the Apache License, Version 2.0 (the "License");
  9. // you may not use this file except in compliance with the License.
  10. // You may obtain a copy of the License at
  11. //
  12. // http://www.apache.org/licenses/LICENSE-2.0
  13. //
  14. // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15. // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  16. // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  17. // MERCHANTABLITY OR NON-INFRINGEMENT.
  18. // See the Apache 2 License for the specific language governing permissions and
  19. // limitations under the License.
  20. #ifndef KALDI_LAT_LATTICE_FUNCTIONS_H_
  21. #define KALDI_LAT_LATTICE_FUNCTIONS_H_
  22. #include <map>
  23. #include <vector>
  24. #include "base/kaldi-common.h"
  25. // #include "hmm/posterior.h"
  26. #include "fstext/fstext-lib.h"
  27. // #include "hmm/transition-model.h"
  28. #include "lat/kaldi-lattice.h"
  29. // #include "itf/decodable-itf.h"
  30. namespace kaldi {
  31. // /**
  32. // This function extracts the per-frame log likelihoods from a linear
  33. // lattice (which we refer to as an 'nbest' lattice elsewhere in Kaldi code).
  34. // The dimension of *per_frame_loglikes will be set to the
  35. // number of input symbols in 'nbest'. The elements of
  36. // '*per_frame_loglikes' will be set to the .Value2() elements of the lattice
  37. // weights, which represent the acoustic costs; you may want to scale this
  38. // vector afterward by -1/acoustic_scale to get the original loglikes.
  39. // If there are acoustic costs on input-epsilon arcs or the final-prob in
  40. // 'nbest' (and this should not normally be the case in situations where it
  41. // makes sense to call this function), they will be added to the cost of
  42. // the preceding input symbol, or the following input symbol for
  43. // input-epsilons encountered prior to any input symbol. If 'nbest' has no
  44. // input symbols, 'per_frame_loglikes' will be set to the empty vector.
  45. // **/
  46. // void GetPerFrameAcousticCosts(const Lattice &nbest,
  47. // Vector<BaseFloat> *per_frame_loglikes);
  48. //
  49. // /// This function iterates over the states of a topologically sorted lattice
  50. // and
  51. // /// counts the time instance corresponding to each state. The times are
  52. // returned
  53. // /// in a vector of integers 'times' which is resized to have a size equal to
  54. // the
  55. // /// number of states in the lattice. The function also returns the maximum
  56. // time
  57. // /// in the lattice (this will equal the number of frames in the file).
  58. // int32 LatticeStateTimes(const Lattice &lat, std::vector<int32> *times);
  59. //
  60. // /// As LatticeStateTimes, but in the CompactLattice format. Note: must
  61. // /// be topologically sorted. Returns length of the utterance in frames,
  62. // which
  63. // /// might not be the same as the maximum time in the lattice, due to frames
  64. // /// in the final-prob.
  65. // int32 CompactLatticeStateTimes(const CompactLattice &clat,
  66. // std::vector<int32> *times);
  67. //
  68. // /// This function does the forward-backward over lattices and computes the
  69. // /// posterior probabilities of the arcs. It returns the total log-probability
  70. // /// of the lattice. The Posterior quantities contain pairs of
  71. // (transition-id, weight)
  72. // /// on each frame.
  73. // /// If the pointer "acoustic_like_sum" is provided, this value is set to
  74. // /// the sum over the arcs, of the posterior of the arc times the
  75. // /// acoustic likelihood [i.e. negated acoustic score] on that link.
  76. // /// This is used in combination with other quantities to work out
  77. // /// the objective function in MMI discriminative training.
  78. // BaseFloat LatticeForwardBackward(const Lattice &lat,
  79. // Posterior *arc_post,
  80. // double *acoustic_like_sum = NULL);
  81. //
  82. // // This function is something similar to LatticeForwardBackward(), but it is
  83. // on
  84. // // the CompactLattice lattice format. Also we only need the alpha in the
  85. // forward
  86. // // path, not the posteriors.
  87. // bool ComputeCompactLatticeAlphas(const CompactLattice &lat,
  88. // std::vector<double> *alpha);
  89. //
  90. // // A sibling of the function ComputeCompactLatticeAlphas()... We compute the beta
  91. // from
  92. // // the backward path here.
  93. // bool ComputeCompactLatticeBetas(const CompactLattice &lat,
  94. // std::vector<double> *beta);
  95. //
  96. //
  97. // // Computes (normal or Viterbi) alphas and betas; returns (total-prob, or
  98. // // best-path negated cost) Note: in either case, the alphas and betas are
  99. // // negated costs. Requires that lat be topologically sorted. This code
  100. // // will work for either CompactLattice or Lattice.
  101. // template<typename LatticeType>
  102. // double ComputeLatticeAlphasAndBetas(const LatticeType &lat,
  103. // bool viterbi,
  104. // std::vector<double> *alpha,
  105. // std::vector<double> *beta);
  106. //
  107. //
  108. // /// Topologically sort the compact lattice if not already topologically
  109. // sorted.
  110. // /// Will crash if the lattice cannot be topologically sorted.
  111. // void TopSortCompactLatticeIfNeeded(CompactLattice *clat);
  112. //
  113. //
  114. // /// Topologically sort the lattice if not already topologically sorted.
  115. // /// Will crash if lattice cannot be topologically sorted.
  116. // void TopSortLatticeIfNeeded(Lattice *clat);
  117. //
  118. // /// Returns the depth of the lattice, defined as the average number of arcs
  119. // (or
  120. // /// final-prob strings) crossing any given frame. Returns 1 for empty
  121. // lattices.
  122. // /// Requires that clat is topologically sorted!
  123. // BaseFloat CompactLatticeDepth(const CompactLattice &clat,
  124. // int32 *num_frames = NULL);
  125. //
  126. // /// This function returns, for each frame, the number of arcs crossing that
  127. // /// frame.
  128. // void CompactLatticeDepthPerFrame(const CompactLattice &clat,
  129. // std::vector<int32> *depth_per_frame);
  130. //
  131. //
  132. // /// This function limits the depth of the lattice, per frame: that means, it
  133. // /// does not allow more than a specified number of arcs active on any given
  134. // /// frame. This can be used to reduce the size of the "very deep" portions
  135. // of
  136. // /// the lattice.
  137. // void CompactLatticeLimitDepth(int32 max_arcs_per_frame,
  138. // CompactLattice *clat);
  139. //
  140. //
  141. // /// Given a lattice, and a transition model to map pdf-ids to phones,
  142. // /// outputs for each frame the set of phones active on that frame. If
  143. // /// sil_phones (which must be sorted and unique) is nonempty, it excludes
  144. // /// phones in this list.
  145. // void LatticeActivePhones(const Lattice &lat, const TransitionModel &trans,
  146. // const std::vector<int32> &sil_phones,
  147. // std::vector<std::set<int32> > *active_phones);
  148. //
  149. // /// Given a lattice, and a transition model to map pdf-ids to phones,
  150. // /// replace the output symbols (presumably words), with phones; we
  151. // /// use the TransitionModel to work out the phone sequence. Note
  152. // /// that the phone labels are not exactly aligned with the phone
  153. // /// boundaries. We put a phone label to coincide with any transition
  154. // /// to the final, nonemitting state of a phone (this state always exists,
  155. // /// we ensure this in HmmTopology::Check()). This would be the last
  156. // /// transition-id in the phone if reordering is not done (but typically
  157. // /// we do reorder).
  158. // /// Also see PhoneAlignLattice, in phone-align-lattice.h.
  159. // void ConvertLatticeToPhones(const TransitionModel &trans_model,
  160. // Lattice *lat);
  161. /// Prunes a lattice or compact lattice. Returns true on success, false if
  162. /// there was some kind of failure.
       ///
       /// Only the declaration is visible here; the template definition is
       /// presumably provided elsewhere -- TODO confirm where it lives.
       ///
       /// @param [in] beam  Pruning beam. NOTE(review): presumably the largest
       ///                   cost difference from the best path that is still
       ///                   kept -- confirm against the definition.
       /// @param [in,out] lat  The lattice to prune (a lattice or compact
       ///                   lattice type, per the summary above). It is passed
       ///                   by non-const pointer, so presumably it is modified
       ///                   in place -- confirm against the definition.
       /// @return true on success, false if there was some kind of failure.
  163. template <class LatticeType>
  164. bool PruneLattice(BaseFloat beam, LatticeType* lat);
  165. //
  166. // /// Given a lattice, and a transition model to map pdf-ids to phones,
  167. // /// replace the sequences of transition-ids with sequences of phones.
  168. // /// Note that this is different from ConvertLatticeToPhones, in that
  169. // /// we replace the transition-ids not the words.
  170. // void ConvertCompactLatticeToPhones(const TransitionModel &trans_model,
  171. // CompactLattice *clat);
  172. //
  173. // /// Boosts LM probabilities by b * [number of frame errors]; equivalently,
  174. // adds
  175. // /// -b*[number of frame errors] to the graph-component of the cost of each
  176. // arc/path.
  177. // /// There is a frame error if a particular transition-id on a particular
  178. // frame
  179. // /// corresponds to a phone not matching transcription's alignment for that
  180. // frame.
  181. // /// This is used in "margin-inspired" discriminative training, esp. Boosted
  182. // MMI.
  183. // /// The TransitionModel is used to map transition-ids in the lattice
  184. // /// input-side to phones; the phones appearing in
  185. // /// "silence_phones" are treated specially in that we replace the frame error
  186. // f
  187. // /// (either zero or 1) for a frame, with the minimum of f or
  188. // max_silence_error.
  189. // /// For the normal recipe, max_silence_error would be zero.
  190. // /// Returns true on success, false if there was some kind of mismatch.
  191. // /// At input, silence_phones must be sorted and unique.
  192. // bool LatticeBoost(const TransitionModel &trans,
  193. // const std::vector<int32> &alignment,
  194. // const std::vector<int32> &silence_phones,
  195. // BaseFloat b,
  196. // BaseFloat max_silence_error,
  197. // Lattice *lat);
  198. //
  199. //
  200. // /**
  201. // This function implements either the MPFE (minimum phone frame error) or
  202. // SMBR (state-level minimum bayes risk) forward-backward, depending on
  203. // whether "criterion" is "mpfe" or "smbr". It returns the MPFE criterion or
  204. // SMBR criterion for this utterance, and outputs the posteriors (which may
  205. // be positive or negative) into "post".
  206. //
  207. // @param [in] trans The transition model. Used to map the
  208. // transition-ids to phones or pdfs.
  209. // @param [in] silence_phones A list of integer ids of silence phones. The
  210. // silence frames i.e. the frames where num_ali
  211. // corresponds to a silence phone are treated
  212. // specially. The behavior is determined by
  213. // 'one_silence_class' being false (traditional
  214. // behavior) or true. Usually in our setup, several
  215. // phones including the silence, vocalized noise,
  216. // non-spoken noise and unk are treated as "silence
  217. // phones"
  218. // @param [in] lat The denominator lattice
  219. // @param [in] num_ali The numerator alignment
  220. // @param [in] criterion The objective function. Must be "mpfe" or "smbr"
  221. // for MPFE (minimum phone frame error) or sMBR
  222. // (state minimum bayes risk) training.
  223. // @param [in] one_silence_class Determines how the silence frames are
  224. // treated.
  225. // Setting this to false gives the old traditional
  226. // behavior, where the silence frames (according to
  227. // num_ali) are treated as incorrect. However, this
  228. // means that the insertions are not penalized by the
  229. // objective. Setting this to true gives the new
  230. // behaviour, where we treat silence as any other phone,
  231. // except that all pdfs of silence phones are collapsed
  232. // into a single class for the frame-error computation.
  233. // This can possibly reduce the insertions in the
  234. // trained model. This is closer to the WER metric that
  235. // we actually care about, since WER is generally
  236. // computed after filtering out noises, but does
  237. // penalize insertions.
  238. // @param [out] post The "MBR posteriors" i.e. derivatives w.r.t. the
  239. // pseudo log-likelihoods of states at each frame.
  240. // */
  241. // BaseFloat LatticeForwardBackwardMpeVariants(
  242. // const TransitionModel &trans,
  243. // const std::vector<int32> &silence_phones,
  244. // const Lattice &lat,
  245. // const std::vector<int32> &num_ali,
  246. // std::string criterion,
  247. // bool one_silence_class,
  248. // Posterior *post);
  249. //
  250. // /**
  251. // This function can be used to compute posteriors for MMI, with a positive
  252. // contribution for the numerator and a negative one for the denominator.
  253. // This function is not actually used in our normal MMI training recipes,
  254. // where it's instead done using various command line programs that each do a
  255. // part of the job. This function was written for use in neural-net MMI
  256. // training.
  257. //
  258. // @param [in] trans The transition model. Used to map the
  259. // transition-ids to phones or pdfs.
  260. // @param [in] lat The denominator lattice
  261. // @param [in] num_ali The numerator alignment
  262. // @param [in] drop_frames If "drop_frames" is true, it will not compute
  263. // any
  264. // posteriors on frames where the num and den have
  265. // disjoint pdf-ids.
  266. // @param [in] convert_to_pdf_ids If "convert_to_pdf_ids" is true, it will
  267. // convert the output to be at the level of pdf-ids, not
  268. // transition-ids.
  269. // @param [in] cancel If "cancel" is true, it will cancel out any positive
  270. // and
  271. // negative parts from the same transition-id (or
  272. // pdf-id, if convert_to_pdf_ids == true).
  273. // @param [out] arc_post The output MMI posteriors of transition-ids (or
  274. // pdf-ids if convert_to_pdf_ids == true) at each frame
  275. // i.e. the difference between the numerator
  276. // and denominator posteriors.
  277. //
  278. // It returns the forward-backward likelihood of the lattice. */
  279. // BaseFloat LatticeForwardBackwardMmi(
  280. // const TransitionModel &trans,
  281. // const Lattice &lat,
  282. // const std::vector<int32> &num_ali,
  283. // bool drop_frames,
  284. // bool convert_to_pdf_ids,
  285. // bool cancel,
  286. // Posterior *arc_post);
  287. //
  288. //
  289. // /// This function takes a CompactLattice that should only contain a single
  290. // /// linear sequence (e.g. derived from lattice-1best), and that should have
  291. // been
  292. // /// processed so that the arcs in the CompactLattice align correctly with the
  293. // /// word boundaries (e.g. by lattice-align-words). It outputs 3 vectors of
  294. // the
  295. // /// same size, which give, for each word in the lattice (in sequence), the
  296. // word
  297. // /// label and the begin time and length in frames. This is done even for
  298. // zero
  299. // /// (epsilon) words, generally corresponding to optional silence-- if you
  300. // don't
  301. // /// want them, just ignore them in the output.
  302. // /// This function will print a warning and return false, if the lattice
  303. // /// did not have the correct format (e.g. if it is empty or it is not
  304. // /// linear).
  305. // bool CompactLatticeToWordAlignment(const CompactLattice &clat,
  306. // std::vector<int32> *words,
  307. // std::vector<int32> *begin_times,
  308. // std::vector<int32> *lengths);
  309. //
  310. // /// This function takes a CompactLattice that should only contain a single
  311. // /// linear sequence (e.g. derived from lattice-1best), and that should have
  312. // been
  313. // /// processed so that the arcs in the CompactLattice align correctly with the
  314. // /// word boundaries (e.g. by lattice-align-words). It outputs 4 vectors of
  315. // the
  316. // /// same size, which give, for each word in the lattice (in sequence), the
  317. // word
  318. // /// label, the begin time and length in frames, and the pronunciation
  319. // (sequence
  320. // /// of phones). This is done even for zero words, corresponding to optional
  321. // /// silences -- if you don't want them, just ignore them in the output.
  322. // /// This function will print a warning and return false, if the lattice
  323. // /// did not have the correct format (e.g. if it is empty or it is not
  324. // /// linear).
  325. // bool CompactLatticeToWordProns(
  326. // const TransitionModel &tmodel,
  327. // const CompactLattice &clat,
  328. // std::vector<int32> *words,
  329. // std::vector<int32> *begin_times,
  330. // std::vector<int32> *lengths,
  331. // std::vector<std::vector<int32> > *prons,
  332. // std::vector<std::vector<int32> > *phone_lengths);
  333. //
  334. //
  335. // /// A form of the shortest-path/best-path algorithm that's specially coded
  336. // for
  337. // /// CompactLattice. Requires that clat be acyclic.
  338. // void CompactLatticeShortestPath(const CompactLattice &clat,
  339. // CompactLattice *shortest_path);
  340. //
  341. // /// This function expands a CompactLattice to ensure high-probability paths
  342. // /// have unique histories. Arcs with posteriors larger than epsilon get split.
  343. // void ExpandCompactLattice(const CompactLattice &clat,
  344. // double epsilon,
  345. // CompactLattice *expand_clat);
  346. //
  347. // /// For each state, compute forward and backward best (viterbi) costs and its
  348. // /// traceback states (for generating best paths later). The forward best cost
  349. // /// for a state is the cost of the best path from the start state to the
  350. // state.
  351. // /// The traceback state of this state is its predecessor state in the best
  352. // path.
  353. // /// The backward best cost for a state is the cost of the best path from the
  354. // /// state to a final one. Its traceback state is the successor state in the
  355. // best
  356. // /// path in the forward direction.
  357. // /// Note: final weights of states are in backward_best_cost_and_pred.
  358. // /// Requires the input CompactLattice clat be acyclic.
  359. // typedef std::vector<std::pair<double,
  360. // CompactLatticeArc::StateId> > CostTraceType;
  361. // void CompactLatticeBestCostsAndTracebacks(
  362. // const CompactLattice &clat,
  363. // CostTraceType *forward_best_cost_and_pred,
  364. // CostTraceType *backward_best_cost_and_pred);
  365. //
  366. // /// This function adds estimated neural language model scores of words in a
  367. // /// minimal list of hypotheses that covers a lattice, to the graph scores on
  368. // the
  369. // /// arcs. The list of hypotheses are generated by latbin/lattice-path-cover.
  370. // typedef unordered_map<std::pair<int32, int32>, double, PairHasher<int32> >
  371. // MapT; void AddNnlmScoreToCompactLattice(const MapT &nnlm_scores,
  372. // CompactLattice *clat);
  373. //
  374. // /// This function adds the word insertion penalty to the graph score of each word
  375. // /// in the compact lattice
  376. // void AddWordInsPenToCompactLattice(BaseFloat word_ins_penalty,
  377. // CompactLattice *clat);
  378. //
  379. // /// This function *adds* the negated scores obtained from the Decodable
  380. // object,
  381. // /// to the acoustic scores on the arcs. If you want to replace them, you
  382. // should
  383. // /// use ScaleCompactLattice to first set the acoustic scores to zero. Returns
  384. // /// true on success, false on error (typically some kind of mismatched inputs).
  385. // bool RescoreCompactLattice(DecodableInterface *decodable,
  386. // CompactLattice *clat);
  387. //
  388. //
  389. // /// This function returns the number of words in the longest sentence in a
  390. // /// Lattice (i.e. the maximum of any path, of the count of
  391. // /// olabels on that path).
  392. // int32 LongestSentenceLength(const Lattice &lat);
  393. //
  394. // /// This function returns the number of words in the longest sentence in a
  395. // /// CompactLattice, i.e. the maximum of any path, of the count of
  396. // /// labels on that path... note, in CompactLattice, the ilabels and olabels
  397. // /// are identical because it is an acceptor.
  398. // int32 LongestSentenceLength(const CompactLattice &lat);
  399. //
  400. //
  401. // /// This function is like RescoreCompactLattice, but it is modified to avoid
  402. // /// computing probabilities on most frames where all the pdf-ids are the
  403. // same.
  404. // /// (it needs the transition-model to work out whether two transition-ids map
  405. // to
  406. // /// the same pdf-id, and it assumes that the lattice has transition-ids on
  407. // it).
  408. // /// The naive thing would be to just set all probabilities to zero on frames
  409. // /// where all the pdf-ids are the same (because this value won't affect the
  410. // /// lattice posterior). But this would become confusing when we compute
  411. // /// corpus-level diagnostics such as the MMI objective function. Instead,
  412. // /// imagine speedup_factor = 100 (it must be >= 1.0)... with probability (1.0
  413. // /
  414. // /// speedup_factor) we compute those likelihoods and multiply them by
  415. // /// speedup_factor; otherwise we set them to zero. This gives the right
  416. // /// expected probability so our corpus-level diagnostics will be about right.
  417. // bool RescoreCompactLatticeSpeedup(
  418. // const TransitionModel &tmodel,
  419. // BaseFloat speedup_factor,
  420. // DecodableInterface *decodable,
  421. // CompactLattice *clat);
  422. //
  423. //
  424. // /// This function *adds* the negated scores obtained from the Decodable
  425. // object,
  426. // /// to the acoustic scores on the arcs. If you want to replace them, you
  427. // should
  428. // /// use ScaleCompactLattice to first set the acoustic scores to zero. Returns
  429. // /// true on success, false on error (e.g. some kind of mismatched inputs).
  430. // /// The input labels, if nonzero, are interpreted as transition-ids or
  431. // whatever
  432. // /// other index the Decodable object expects.
  433. // bool RescoreLattice(DecodableInterface *decodable,
  434. // Lattice *lat);
  435. //
  436. // /// This function Composes a CompactLattice format lattice with a
  437. // /// DeterministicOnDemandFst<fst::StdFst> format fst, and outputs another
  438. // /// CompactLattice format lattice. The first element (the one that
  439. // corresponds
  440. // /// to LM weight) in CompactLatticeWeight is used for composition.
  441. // ///
  442. // /// Note that the DeterministicOnDemandFst interface is not "const",
  443. // therefore
  444. // /// we cannot use "const" for <det_fst>.
  445. // void ComposeCompactLatticeDeterministic(
  446. // const CompactLattice& clat,
  447. // fst::DeterministicOnDemandFst<fst::StdArc>* det_fst,
  448. // CompactLattice* composed_clat);
  449. //
  450. // /// This function computes the mapping from the pair
  451. // /// (frame-index, transition-id) to the pair
  452. // /// (sum-of-acoustic-scores, num-of-occurrences) over all occurrences of the
  453. // /// transition-id in that frame, where frame-index is the index of a
  454. // /// frame in the lattice.
  455. // /// This function is useful for retaining the acoustic scores in a
  456. // /// non-compact lattice after a process like determinization where the
  457. // /// frame-level acoustic scores are typically lost.
  458. // /// The function ReplaceAcousticScoresFromMap is used to restore the
  459. // /// acoustic scores computed by this function.
  460. // ///
  461. // /// @param [in] lat Input lattice. Expected to be top-sorted. Otherwise
  462. // the
  463. // /// function will crash.
  464. // /// @param [out] acoustic_scores
  465. // /// Pointer to a map from the pair (frame-index,
  466. // /// transition-id) to a pair (sum-of-acoustic-scores,
  467. // /// num-of-occurrences).
  468. // /// Usually the acoustic scores for a pdf-id (and hence
  469. // /// transition-id) on a frame will be the same for all
  470. // the
  471. // /// occurrences of the pdf-id in that frame.
  472. // /// But if not, we will take the average of the acoustic
  473. // /// scores. Hence, we store both the
  474. // sum-of-acoustic-scores
  475. // /// and the num-of-occurrences of the transition-id in
  476. // that
  477. // /// frame.
  478. // void ComputeAcousticScoresMap(
  479. // const Lattice &lat,
  480. // unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
  481. // PairHasher<int32> >
  482. // *acoustic_scores);
  483. //
  484. // /// This function restores acoustic scores computed using the function
  485. // /// ComputeAcousticScoresMap into the lattice.
  486. // ///
  487. // /// @param [in] acoustic_scores
  488. // /// A map from the pair (frame-index, transition-id) to
  489. // a
  490. // /// pair (sum-of-acoustic-scores, num-of-occurrences) of
  491. // /// the occurrences of the transition-id in that frame.
  492. // /// See the comments for ComputeAcousticScoresMap for
  493. // /// details.
  494. // /// @param [out] lat Pointer to the output lattice.
  495. // void ReplaceAcousticScoresFromMap(
  496. // const unordered_map<std::pair<int32, int32>, std::pair<BaseFloat, int32>,
  497. // PairHasher<int32> > &acoustic_scores,
  498. // Lattice *lat);
  499. } // namespace kaldi
  500. #endif // KALDI_LAT_LATTICE_FUNCTIONS_H_