|
// Copyright 2005-2024 Google LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the 'License');
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an 'AS IS' BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
// See www.openfst.org for extensive documentation on this weighted
|
|
// finite-state transducer library.
|
|
//
|
|
// General weight set and associated semiring operation definitions.
|
|
|
|
#ifndef FST_WEIGHT_H_
|
|
#define FST_WEIGHT_H_
|
|
|
|
#include <cctype>
|
|
#include <cmath>
|
|
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <ios>
|
|
#include <iostream>
|
|
#include <istream>
|
|
#include <ostream>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <type_traits>
|
|
#include <utility>
|
|
|
|
#include <fst/compat.h>
|
|
#include <fst/log.h>
|
|
#include <fst/util.h>
|
|
|
|
DECLARE_string(fst_weight_parentheses);
|
|
DECLARE_string(fst_weight_separator);
|
|
|
|
namespace fst {
|
|
|
|
// A semiring is specified by two binary operations Plus and Times and two
|
|
// designated elements Zero and One with the following properties:
|
|
//
|
|
// Plus: associative, commutative, and has Zero as its identity.
|
|
//
|
|
// Times: associative and has identity One, distributes w.r.t. Plus, and
|
|
// has Zero as an annihilator:
|
|
// Times(Zero(), a) == Times(a, Zero()) = Zero().
|
|
//
|
|
// A left semiring distributes on the left; a right semiring is similarly
|
|
// defined.
|
|
//
|
|
// A Weight class must have binary functions Plus and Times and static member
|
|
// functions Zero() and One() and these must form (at least) a left or right
|
|
// semiring.
|
|
//
|
|
// In addition, the following should be defined for a Weight:
|
|
//
|
|
// Member: predicate on set membership.
|
|
//
|
|
// NoWeight: static member function that returns an element that is
|
|
// not a set member; used to signal an error.
|
|
//
|
|
// >>: reads textual representation of a weight.
|
|
//
|
|
// <<: prints textual representation of a weight.
|
|
//
|
|
// Read(istream &istrm): reads binary representation of a weight.
|
|
//
|
|
// Write(ostream &ostrm): writes binary representation of a weight.
|
|
//
|
|
// Hash: maps weight to size_t.
|
|
//
|
|
// ApproxEqual: approximate equality (for inexact weights)
|
|
//
|
|
// Quantize: quantizes w.r.t delta (for inexact weights)
|
|
//
|
|
// Divide:
|
|
// - In a left semiring, for all a, b, b', c:
|
|
// if Times(a, b) = c, Divide(c, a, DIVIDE_LEFT) = b' and b'.Member(),
|
|
// then Times(a, b') = c.
|
|
// - In a right semiring, for all a, a', b, c:
|
|
// if Times(a, b) = c, Divide(c, b, DIVIDE_RIGHT) = a' and a'.Member(),
|
|
// then Times(a', b) = c.
|
|
// - In a commutative semiring,
|
|
// * for all a, c:
|
|
// Divide(c, a, DIVIDE_ANY) = Divide(c, a, DIVIDE_LEFT)
|
|
// = Divide(c, a, DIVIDE_RIGHT)
|
|
// * for all a, b, b', c:
|
|
// if Times(a, b) = c, Divide(c, a, DIVIDE_ANY) = b' and b'.Member(),
|
|
// then Times(a, b') = c
|
|
// - In the case where there exist no b such that c = Times(a, b), the
|
|
// return value of Divide(c, a, DIVIDE_LEFT) is unspecified. Returning
|
|
// Weight::NoWeight() is recommended but not required in order to
|
|
// allow the most efficient implementation.
|
|
// - All algorithms in this library only call Divide(c, a) when it is
|
|
// guaranteed that there exists a b such that c = Times(a, b).
|
|
//
|
|
// ReverseWeight: the type of the corresponding reverse weight.
|
|
//
|
|
// Typically the same type as Weight for a (both left and right) semiring.
|
|
// For the left string semiring, it is the right string semiring.
|
|
//
|
|
// Reverse: a mapping from Weight to ReverseWeight s.t.
|
|
//
|
|
// --> Reverse(Reverse(a)) = a
|
|
// --> Reverse(Plus(a, b)) = Plus(Reverse(a), Reverse(b))
|
|
// --> Reverse(Times(a, b)) = Times(Reverse(b), Reverse(a))
|
|
// Typically the identity mapping in a (both left and right) semiring.
|
|
// In the left string semiring, it maps to the reverse string in the right
|
|
// string semiring.
|
|
//
|
|
// Properties: specifies additional properties that hold:
|
|
// LeftSemiring: indicates weights form a left semiring.
|
|
// RightSemiring: indicates weights form a right semiring.
|
|
// Commutative: for all a, b: Times(a,b) == Times(b, a)
|
|
// Idempotent: for all a: Plus(a, a) == a.
|
|
// Path: for all a, b: Plus(a, b) == a or Plus(a, b) == b.
|
|
//
|
|
// User-defined weights and their corresponding operations SHOULD be
|
|
// defined in the same namespace, but SHOULD NOT be defined in the fst
|
|
// namespace. Defining them in fst would make the user code fragile
|
|
// to additions in fst. They will be found in another namespace
|
|
// via argument-dependent lookup.
|
|
|
|
// CONSTANT DEFINITIONS
|
|
|
|
// A representable float near .001 (exactly 2^-10, i.e., 1/1024).
inline constexpr float kDelta = 0.0009765625F;

// Semiring property bits. Each property occupies one bit so that properties
// can be combined with bitwise OR and tested with bitwise AND.

// Bit 0: for all a, b, c: Times(c, Plus(a, b)) = Plus(Times(c, a), Times(c, b)).
inline constexpr uint64_t kLeftSemiring = uint64_t{1} << 0;

// Bit 1: for all a, b, c: Times(Plus(a, b), c) = Plus(Times(a, c), Times(b, c)).
inline constexpr uint64_t kRightSemiring = uint64_t{1} << 1;

// Both the left and the right semiring property.
inline constexpr uint64_t kSemiring = kLeftSemiring | kRightSemiring;

// Bit 2: for all a, b: Times(a, b) = Times(b, a).
inline constexpr uint64_t kCommutative = uint64_t{1} << 2;

// Bit 3: for all a: Plus(a, a) = a.
inline constexpr uint64_t kIdempotent = uint64_t{1} << 3;

// Bit 4: for all a, b: Plus(a, b) = a or Plus(a, b) = b.
inline constexpr uint64_t kPath = uint64_t{1} << 4;

// For random weight generation: default number of distinct weights.
// This is also used for a few other weight generation defaults.
inline constexpr size_t kNumRandomWeights{5};
|
|
|
|
// Weight property boolean constants needed for SFINAE.
|
|
|
|
template <class W>
|
|
using IsIdempotent = std::bool_constant<(W::Properties() & kIdempotent) != 0>;
|
|
|
|
template <class W>
|
|
using IsPath = std::bool_constant<(W::Properties() & kPath) != 0>;
|
|
|
|
// Selects the direction in which a division is performed.
enum DivideType {
  DIVIDE_LEFT = 0,   // Left division.
  DIVIDE_RIGHT = 1,  // Right division.
  DIVIDE_ANY = 2     // Division in a commutative semiring.
};
|
|
|
|
// NATURAL ORDER
|
|
//
|
|
// By definition:
|
|
//
|
|
// a <= b iff a + b = a
|
|
//
|
|
// The natural order is a negative partial order iff the semiring is
|
|
// idempotent. It is trivially monotonic for plus. It is left
|
|
// (resp. right) monotonic for times iff the semiring is left
|
|
// (resp. right) distributive. It is a total order iff the semiring
|
|
// has the path property.
|
|
//
|
|
// For more information, see:
|
|
//
|
|
// Mohri, M. 2002. Semiring framework and algorithms for shortest-distance
|
|
// problems, Journal of Automata, Languages and
|
|
// Combinatorics 7(3): 321-350, 2002.
|
|
//
|
|
// We define the strict version of this order below.
|
|
|
|
// Requires W is idempotent.
|
|
template <class W>
|
|
struct NaturalLess {
|
|
using Weight = W;
|
|
static_assert(IsIdempotent<W>::value, "W must be idempotent.");
|
|
|
|
bool operator()(const Weight &w1, const Weight &w2) const {
|
|
return w1 != w2 && Plus(w1, w2) == w1;
|
|
}
|
|
};
|
|
|
|
// Iterated product for arbitrary semirings: Power(w, 0) is Weight::One(), and
// Power(w, n) = Times(Power(w, n - 1), w). Performs exactly n calls to Times,
// always multiplying by `weight` on the right.
template <class Weight>
Weight Power(const Weight &weight, size_t n) {
  auto result = Weight::One();
  size_t remaining = n;
  while (remaining > 0) {
    result = Times(result, weight);
    --remaining;
  }
  return result;
}
|
|
|
|
// Basic accumulator that keeps a running sum combined with Plus.
// Specializations might be more complex.
template <class Weight>
class Adder {
 public:
  // Starts the running sum at Weight::Zero().
  Adder() : sum_(Weight::Zero()) {}

  // Starts the running sum at the given weight.
  explicit Adder(Weight w) : sum_(std::move(w)) {}

  // Folds w into the running sum and returns a copy of the updated sum.
  Weight Add(const Weight &w) {
    sum_ = Plus(sum_, w);
    return Sum();
  }

  // Returns a copy of the current running sum.
  Weight Sum() const {
    return sum_;
  }

  // Replaces the running sum with w (Weight::Zero() when omitted).
  void Reset(Weight w = Weight::Zero()) {
    sum_ = std::move(w);
  }

 private:
  Weight sum_;  // Running sum of everything added so far.
};
|
|
|
|
// General weight converter: raises error.
|
|
template <class W1, class W2>
|
|
struct WeightConvert {
|
|
W2 operator()(W1 w1) const {
|
|
FSTERROR() << "WeightConvert: Can't convert weight from " << W1::Type()
|
|
<< " to " << W2::Type();
|
|
return W2::NoWeight();
|
|
}
|
|
};
|
|
|
|
// Specialized weight converter to self.
|
|
template <class W>
|
|
struct WeightConvert<W, W> {
|
|
constexpr W operator()(W weight) const { return weight; }
|
|
};
|
|
|
|
// General random weight generator: raises error.
|
|
//
|
|
// The standard interface is roughly:
|
|
//
|
|
// class WeightGenerate<MyWeight> {
|
|
// public:
|
|
// explicit WeightGenerate(uint64_t seed = std::random_device()(),
|
|
// bool allow_zero = true,
|
|
// ...);
|
|
//
|
|
// MyWeight operator()() const;
|
|
// };
|
|
//
|
|
// Many weight generators also take trailing constructor arguments specifying
|
|
// the number of random (unique) weights, the length of weights (e.g., for
|
|
// string-based weights), etc. with sensible defaults
|
|
template <class W>
|
|
struct WeightGenerate {
|
|
W operator()() const {
|
|
FSTERROR() << "WeightGenerate: No random generator for " << W::Type();
|
|
return W::NoWeight();
|
|
}
|
|
};
|
|
|
|
namespace internal {
|
|
|
|
// Common state for the textual composite weight reader and writer: the
// element separator, the open/close parenthesis characters, and an error
// flag. Both constructors are defined out of line.
class CompositeWeightIO {
 public:
  CompositeWeightIO();
  CompositeWeightIO(char separator, std::pair<char, char> parentheses);

  // Returns the configured (open, close) parenthesis characters.
  std::pair<char, char> parentheses() const {
    return std::pair<char, char>(open_paren_, close_paren_);
  }

  // Returns the configured element separator character.
  char separator() const {
    return separator_;
  }

  // Returns the stored error flag.
  bool error() const {
    return error_;
  }

 protected:
  const char separator_;    // Character placed between elements.
  const char open_paren_;   // Opening parenthesis character.
  const char close_paren_;  // Closing parenthesis character.

 private:
  bool error_;  // Error flag reported by error().
};
|
|
|
|
} // namespace internal
|
|
|
|
// Helper class for writing textual composite weights.
|
|
class CompositeWeightWriter : public internal::CompositeWeightIO {
|
|
public:
|
|
// Uses configuration from flags (FST_FLAGS_fst_weight_separator,
|
|
// FST_FLAGS_fst_weight_parentheses).
|
|
explicit CompositeWeightWriter(std::ostream &ostrm);
|
|
|
|
// parentheses defines the opening and closing parenthesis characters.
|
|
// Set parentheses = {0, 0} to disable writing parenthesis.
|
|
CompositeWeightWriter(std::ostream &ostrm, char separator,
|
|
std::pair<char, char> parentheses);
|
|
|
|
CompositeWeightWriter(const CompositeWeightWriter &) = delete;
|
|
CompositeWeightWriter &operator=(const CompositeWeightWriter &) = delete;
|
|
|
|
// Writes open parenthesis to a stream if option selected.
|
|
void WriteBegin();
|
|
|
|
// Writes element to a stream.
|
|
template <class T>
|
|
void WriteElement(const T &comp) {
|
|
if (i_++ > 0) ostrm_ << separator_;
|
|
ostrm_ << comp;
|
|
}
|
|
|
|
// Writes close parenthesis to a stream if option selected.
|
|
void WriteEnd();
|
|
|
|
private:
|
|
std::ostream &ostrm_;
|
|
int i_ = 0; // Element position.
|
|
};
|
|
|
|
// Helper class for reading textual composite weights. Elements are separated by
|
|
// a separator character. There must be at least one element per textual
|
|
// representation. Parentheses characters should be set if the composite
|
|
// weights themselves contain composite weights to ensure proper parsing.
|
|
class CompositeWeightReader : public internal::CompositeWeightIO {
|
|
public:
|
|
// Uses configuration from flags (FST_FLAGS_fst_weight_separator,
|
|
// FST_FLAGS_fst_weight_parentheses).
|
|
explicit CompositeWeightReader(std::istream &istrm);
|
|
|
|
// parentheses defines the opening and closing parenthesis characters.
|
|
// Set parentheses = {0, 0} to disable reading parenthesis.
|
|
CompositeWeightReader(std::istream &istrm, char separator,
|
|
std::pair<char, char> parentheses);
|
|
|
|
CompositeWeightReader(const CompositeWeightReader &) = delete;
|
|
CompositeWeightReader &operator=(const CompositeWeightReader &) = delete;
|
|
|
|
// Reads open parenthesis from a stream if option selected.
|
|
void ReadBegin();
|
|
|
|
// Reads element from a stream. The second argument, when true, indicates that
|
|
// this will be the last element (allowing more forgiving formatting of the
|
|
// last element). Returns false when last element is read.
|
|
template <class T>
|
|
bool ReadElement(T *comp, bool last = false);
|
|
|
|
// Finalizes reading.
|
|
void ReadEnd();
|
|
|
|
private:
|
|
std::istream &istrm_; // Input stream.
|
|
int c_ = 0; // Last character read, or EOF.
|
|
int depth_ = 0; // Weight parentheses depth.
|
|
};
|
|
|
|
template <class T>
|
|
inline bool CompositeWeightReader::ReadElement(T *comp, bool last) {
|
|
std::string s;
|
|
const bool has_parens = open_paren_ != 0;
|
|
while ((c_ != std::istream::traits_type::eof()) && !std::isspace(c_) &&
|
|
(c_ != separator_ || depth_ > 1 || last) &&
|
|
(c_ != close_paren_ || depth_ != 1)) {
|
|
s += c_;
|
|
// If parentheses encountered before separator, they must be matched.
|
|
if (has_parens && c_ == open_paren_) {
|
|
++depth_;
|
|
} else if (has_parens && c_ == close_paren_) {
|
|
// Failure on unmatched parentheses.
|
|
if (depth_ == 0) {
|
|
FSTERROR() << "CompositeWeightReader: Unmatched close paren: "
|
|
<< "Is the fst_weight_parentheses flag set correctly?";
|
|
istrm_.clear(std::ios::badbit);
|
|
return false;
|
|
}
|
|
--depth_;
|
|
}
|
|
c_ = istrm_.get();
|
|
}
|
|
if (s.empty()) {
|
|
FSTERROR() << "CompositeWeightReader: Empty element: "
|
|
<< "Is the fst_weight_parentheses flag set correctly?";
|
|
istrm_.clear(std::ios::badbit);
|
|
return false;
|
|
}
|
|
std::istringstream istrm(s);
|
|
istrm >> *comp;
|
|
// Skips separator/close parenthesis.
|
|
if (c_ != std::istream::traits_type::eof() && !std::isspace(c_)) {
|
|
c_ = istrm_.get();
|
|
}
|
|
const bool is_eof = c_ == std::istream::traits_type::eof();
|
|
// Clears fail bit if just EOF.
|
|
if (is_eof && !istrm_.bad()) istrm_.clear(std::ios::eofbit);
|
|
return !is_eof && !std::isspace(c_);
|
|
}
|
|
|
|
} // namespace fst
|
|
|
|
#endif // FST_WEIGHT_H_
|