You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
507 lines
18 KiB
507 lines
18 KiB
3 years ago
|
// lat/kaldi-lattice.cc
|
||
|
|
||
|
// Copyright 2009-2011 Microsoft Corporation
|
||
|
// 2013 Johns Hopkins University (author: Daniel Povey)
|
||
|
|
||
|
// See ../../COPYING for clarification regarding multiple authors
|
||
|
//
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||
|
// MERCHANTABLITY OR NON-INFRINGEMENT.
|
||
|
// See the Apache 2 License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
|
||
|
#include "lat/kaldi-lattice.h"
|
||
|
#include "fst/script/print-impl.h"
|
||
|
|
||
|
namespace kaldi {
|
||
|
|
||
|
/// Converts lattice types if necessary, deleting its input.
|
||
|
template<class OrigWeightType>
|
||
|
CompactLattice* ConvertToCompactLattice(fst::VectorFst<OrigWeightType> *ifst) {
|
||
|
if (!ifst) return NULL;
|
||
|
CompactLattice *ofst = new CompactLattice();
|
||
|
ConvertLattice(*ifst, ofst);
|
||
|
delete ifst;
|
||
|
return ofst;
|
||
|
}
|
||
|
|
||
|
// This overrides the template if there is no type conversion going on
|
||
|
// (for efficiency).
|
||
|
template<>
|
||
|
CompactLattice* ConvertToCompactLattice(CompactLattice *ifst) {
|
||
|
return ifst;
|
||
|
}
|
||
|
|
||
|
/// Converts lattice types if necessary, deleting its input.
|
||
|
template<class OrigWeightType>
|
||
|
Lattice* ConvertToLattice(fst::VectorFst<OrigWeightType> *ifst) {
|
||
|
if (!ifst) return NULL;
|
||
|
Lattice *ofst = new Lattice();
|
||
|
ConvertLattice(*ifst, ofst);
|
||
|
delete ifst;
|
||
|
return ofst;
|
||
|
}
|
||
|
|
||
|
// This overrides the template if there is no type conversion going on
|
||
|
// (for efficiency).
|
||
|
template<>
|
||
|
Lattice* ConvertToLattice(Lattice *ifst) {
|
||
|
return ifst;
|
||
|
}
|
||
|
|
||
|
|
||
|
bool WriteCompactLattice(std::ostream &os, bool binary,
|
||
|
const CompactLattice &t) {
|
||
|
if (binary) {
|
||
|
fst::FstWriteOptions opts;
|
||
|
// Leave all the options default. Normally these lattices wouldn't have any
|
||
|
// osymbols/isymbols so no point directing it not to write them (who knows what
|
||
|
// we'd want to if we had them).
|
||
|
return t.Write(os, opts);
|
||
|
} else {
|
||
|
// Text-mode output. Note: we expect that t.InputSymbols() and
|
||
|
// t.OutputSymbols() would always return NULL. The corresponding input
|
||
|
// routine would not work if the FST actually had symbols attached.
|
||
|
// Write a newline after the key, so the first line of the FST appears
|
||
|
// on its own line.
|
||
|
os << '\n';
|
||
|
bool acceptor = true, write_one = false;
|
||
|
fst::FstPrinter<CompactLatticeArc> printer(t, t.InputSymbols(),
|
||
|
t.OutputSymbols(),
|
||
|
NULL, acceptor, write_one, "\t");
|
||
|
printer.Print(&os, "<unknown>");
|
||
|
if (os.fail())
|
||
|
KALDI_WARN << "Stream failure detected.";
|
||
|
// Write another newline as a terminating character. The read routine will
|
||
|
// detect this [this is a Kaldi mechanism, not somethig in the original
|
||
|
// OpenFst code].
|
||
|
os << '\n';
|
||
|
return os.good();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/// LatticeReader provides (static) functions for reading both Lattice
|
||
|
/// and CompactLattice, in text form.
|
||
|
class LatticeReader {
|
||
|
typedef LatticeArc Arc;
|
||
|
typedef LatticeWeight Weight;
|
||
|
typedef CompactLatticeArc CArc;
|
||
|
typedef CompactLatticeWeight CWeight;
|
||
|
typedef Arc::Label Label;
|
||
|
typedef Arc::StateId StateId;
|
||
|
public:
|
||
|
// everything is static in this class.
|
||
|
|
||
|
/** This function reads from the FST text format; it does not know in advance
|
||
|
whether it's a Lattice or CompactLattice in the stream so it tries to
|
||
|
read both formats until it becomes clear which is the correct one.
|
||
|
*/
|
||
|
static std::pair<Lattice*, CompactLattice*> ReadText(
|
||
|
std::istream &is) {
|
||
|
typedef std::pair<Lattice*, CompactLattice*> PairT;
|
||
|
using std::string;
|
||
|
using std::vector;
|
||
|
Lattice *fst = new Lattice();
|
||
|
CompactLattice *cfst = new CompactLattice();
|
||
|
string line;
|
||
|
size_t nline = 0;
|
||
|
string separator = FLAGS_fst_field_separator + "\r\n";
|
||
|
while (std::getline(is, line)) {
|
||
|
nline++;
|
||
|
vector<string> col;
|
||
|
// on Windows we'll write in text and read in binary mode.
|
||
|
SplitStringToVector(line, separator.c_str(), true, &col);
|
||
|
if (col.size() == 0) break; // Empty line is a signal to stop, in our
|
||
|
// archive format.
|
||
|
if (col.size() > 5) {
|
||
|
KALDI_WARN << "Reading lattice: bad line in FST: " << line;
|
||
|
delete fst;
|
||
|
delete cfst;
|
||
|
return PairT(static_cast<Lattice*>(NULL),
|
||
|
static_cast<CompactLattice*>(NULL));
|
||
|
}
|
||
|
StateId s;
|
||
|
if (!ConvertStringToInteger(col[0], &s)) {
|
||
|
KALDI_WARN << "FstCompiler: bad line in FST: " << line;
|
||
|
delete fst;
|
||
|
delete cfst;
|
||
|
return PairT(static_cast<Lattice*>(NULL),
|
||
|
static_cast<CompactLattice*>(NULL));
|
||
|
}
|
||
|
if (fst)
|
||
|
while (s >= fst->NumStates())
|
||
|
fst->AddState();
|
||
|
if (cfst)
|
||
|
while (s >= cfst->NumStates())
|
||
|
cfst->AddState();
|
||
|
if (nline == 1) {
|
||
|
if (fst) fst->SetStart(s);
|
||
|
if (cfst) cfst->SetStart(s);
|
||
|
}
|
||
|
|
||
|
if (fst) { // we still have fst; try to read that arc.
|
||
|
bool ok = true;
|
||
|
Arc arc;
|
||
|
Weight w;
|
||
|
StateId d = s;
|
||
|
switch (col.size()) {
|
||
|
case 1 :
|
||
|
fst->SetFinal(s, Weight::One());
|
||
|
break;
|
||
|
case 2:
|
||
|
if (!StrToWeight(col[1], true, &w)) ok = false;
|
||
|
else fst->SetFinal(s, w);
|
||
|
break;
|
||
|
case 3: // 3 columns not ok for Lattice format; it's not an acceptor.
|
||
|
ok = false;
|
||
|
break;
|
||
|
case 4:
|
||
|
ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
|
||
|
ConvertStringToInteger(col[2], &arc.ilabel) &&
|
||
|
ConvertStringToInteger(col[3], &arc.olabel);
|
||
|
if (ok) {
|
||
|
d = arc.nextstate;
|
||
|
arc.weight = Weight::One();
|
||
|
fst->AddArc(s, arc);
|
||
|
}
|
||
|
break;
|
||
|
case 5:
|
||
|
ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
|
||
|
ConvertStringToInteger(col[2], &arc.ilabel) &&
|
||
|
ConvertStringToInteger(col[3], &arc.olabel) &&
|
||
|
StrToWeight(col[4], false, &arc.weight);
|
||
|
if (ok) {
|
||
|
d = arc.nextstate;
|
||
|
fst->AddArc(s, arc);
|
||
|
}
|
||
|
break;
|
||
|
default:
|
||
|
ok = false;
|
||
|
}
|
||
|
while (d >= fst->NumStates())
|
||
|
fst->AddState();
|
||
|
if (!ok) {
|
||
|
delete fst;
|
||
|
fst = NULL;
|
||
|
}
|
||
|
}
|
||
|
if (cfst) {
|
||
|
bool ok = true;
|
||
|
CArc arc;
|
||
|
CWeight w;
|
||
|
StateId d = s;
|
||
|
switch (col.size()) {
|
||
|
case 1 :
|
||
|
cfst->SetFinal(s, CWeight::One());
|
||
|
break;
|
||
|
case 2:
|
||
|
if (!StrToCWeight(col[1], true, &w)) ok = false;
|
||
|
else cfst->SetFinal(s, w);
|
||
|
break;
|
||
|
case 3: // compact-lattice is acceptor format: state, next-state, label.
|
||
|
ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
|
||
|
ConvertStringToInteger(col[2], &arc.ilabel);
|
||
|
if (ok) {
|
||
|
d = arc.nextstate;
|
||
|
arc.olabel = arc.ilabel;
|
||
|
arc.weight = CWeight::One();
|
||
|
cfst->AddArc(s, arc);
|
||
|
}
|
||
|
break;
|
||
|
case 4:
|
||
|
ok = ConvertStringToInteger(col[1], &arc.nextstate) &&
|
||
|
ConvertStringToInteger(col[2], &arc.ilabel) &&
|
||
|
StrToCWeight(col[3], false, &arc.weight);
|
||
|
if (ok) {
|
||
|
d = arc.nextstate;
|
||
|
arc.olabel = arc.ilabel;
|
||
|
cfst->AddArc(s, arc);
|
||
|
}
|
||
|
break;
|
||
|
case 5: default:
|
||
|
ok = false;
|
||
|
}
|
||
|
while (d >= cfst->NumStates())
|
||
|
cfst->AddState();
|
||
|
if (!ok) {
|
||
|
delete cfst;
|
||
|
cfst = NULL;
|
||
|
}
|
||
|
}
|
||
|
if (!fst && !cfst) {
|
||
|
KALDI_WARN << "Bad line in lattice text format: " << line;
|
||
|
// read until we get an empty line, so at least we
|
||
|
// have a chance to read the next one (although this might
|
||
|
// be a bit futile since the calling code will get unhappy
|
||
|
// about failing to read this one.
|
||
|
while (std::getline(is, line)) {
|
||
|
SplitStringToVector(line, separator.c_str(), true, &col);
|
||
|
if (col.empty()) break;
|
||
|
}
|
||
|
return PairT(static_cast<Lattice*>(NULL),
|
||
|
static_cast<CompactLattice*>(NULL));
|
||
|
}
|
||
|
}
|
||
|
return PairT(fst, cfst);
|
||
|
}
|
||
|
|
||
|
static bool StrToWeight(const std::string &s, bool allow_zero, Weight *w) {
|
||
|
std::istringstream strm(s);
|
||
|
strm >> *w;
|
||
|
if (!strm || (!allow_zero && *w == Weight::Zero())) {
|
||
|
return false;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
static bool StrToCWeight(const std::string &s, bool allow_zero, CWeight *w) {
|
||
|
std::istringstream strm(s);
|
||
|
strm >> *w;
|
||
|
if (!strm || (!allow_zero && *w == CWeight::Zero())) {
|
||
|
return false;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
};
|
||
|
|
||
|
|
||
|
/// Reads a lattice in text form from is and returns it as a newly allocated
/// CompactLattice, or NULL if LatticeReader::ReadText() produced neither a
/// Lattice nor a CompactLattice.
CompactLattice *ReadCompactLatticeText(std::istream &is) {
  const std::pair<Lattice*, CompactLattice*> parsed =
      LatticeReader::ReadText(is);
  Lattice *as_lattice = parsed.first;
  CompactLattice *as_compact = parsed.second;

  if (as_compact != NULL) {
    // Already in the requested format; the other candidate is not needed.
    delete as_lattice;
    return as_compact;
  }
  if (as_lattice == NULL) {
    return NULL;
  }
  // note: ConvertToCompactLattice frees its input.
  return ConvertToCompactLattice(as_lattice);
}
|
||
|
|
||
|
|
||
|
/// Reads a lattice in text form from is and returns it as a newly allocated
/// Lattice, or NULL if LatticeReader::ReadText() produced neither a Lattice
/// nor a CompactLattice.
Lattice *ReadLatticeText(std::istream &is) {
  const std::pair<Lattice*, CompactLattice*> parsed =
      LatticeReader::ReadText(is);
  Lattice *as_lattice = parsed.first;
  CompactLattice *as_compact = parsed.second;

  if (as_lattice != NULL) {
    // Already in the requested format; the other candidate is not needed.
    delete as_compact;
    return as_lattice;
  }
  if (as_compact == NULL) {
    return NULL;
  }
  // note: ConvertToLattice frees its input.
  return ConvertToLattice(as_compact);
}
|
||
|
|
||
|
bool ReadCompactLattice(std::istream &is, bool binary,
|
||
|
CompactLattice **clat) {
|
||
|
KALDI_ASSERT(*clat == NULL);
|
||
|
if (binary) {
|
||
|
fst::FstHeader hdr;
|
||
|
if (!hdr.Read(is, "<unknown>")) {
|
||
|
KALDI_WARN << "Reading compact lattice: error reading FST header.";
|
||
|
return false;
|
||
|
}
|
||
|
if (hdr.FstType() != "vector") {
|
||
|
KALDI_WARN << "Reading compact lattice: unsupported FST type: "
|
||
|
<< hdr.FstType();
|
||
|
return false;
|
||
|
}
|
||
|
fst::FstReadOptions ropts("<unspecified>",
|
||
|
&hdr);
|
||
|
|
||
|
typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<float>, int32> T1;
|
||
|
typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<double>, int32> T2;
|
||
|
typedef fst::LatticeWeightTpl<float> T3;
|
||
|
typedef fst::LatticeWeightTpl<double> T4;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T1> > F1;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T2> > F2;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T3> > F3;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T4> > F4;
|
||
|
|
||
|
CompactLattice *ans = NULL;
|
||
|
if (hdr.ArcType() == T1::Type()) {
|
||
|
ans = ConvertToCompactLattice(F1::Read(is, ropts));
|
||
|
} else if (hdr.ArcType() == T2::Type()) {
|
||
|
ans = ConvertToCompactLattice(F2::Read(is, ropts));
|
||
|
} else if (hdr.ArcType() == T3::Type()) {
|
||
|
ans = ConvertToCompactLattice(F3::Read(is, ropts));
|
||
|
} else if (hdr.ArcType() == T4::Type()) {
|
||
|
ans = ConvertToCompactLattice(F4::Read(is, ropts));
|
||
|
} else {
|
||
|
KALDI_WARN << "FST with arc type " << hdr.ArcType()
|
||
|
<< " cannot be converted to CompactLattice.\n";
|
||
|
return false;
|
||
|
}
|
||
|
if (ans == NULL) {
|
||
|
KALDI_WARN << "Error reading compact lattice (after reading header).";
|
||
|
return false;
|
||
|
}
|
||
|
*clat = ans;
|
||
|
return true;
|
||
|
} else {
|
||
|
// The next line would normally consume the \r on Windows, plus any
|
||
|
// extra spaces that might have got in there somehow.
|
||
|
while (std::isspace(is.peek()) && is.peek() != '\n') is.get();
|
||
|
if (is.peek() == '\n') is.get(); // consume the newline.
|
||
|
else { // saw spaces but no newline.. this is not expected.
|
||
|
KALDI_WARN << "Reading compact lattice: unexpected sequence of spaces "
|
||
|
<< " at file position " << is.tellg();
|
||
|
return false;
|
||
|
}
|
||
|
*clat = ReadCompactLatticeText(is); // that routine will warn on error.
|
||
|
return (*clat != NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
bool CompactLatticeHolder::Read(std::istream &is) {
|
||
|
Clear(); // in case anything currently stored.
|
||
|
int c = is.peek();
|
||
|
if (c == -1) {
|
||
|
KALDI_WARN << "End of stream detected reading CompactLattice.";
|
||
|
return false;
|
||
|
} else if (isspace(c)) { // The text form of the lattice begins
|
||
|
// with space (normally, '\n'), so this means it's text (the binary form
|
||
|
// cannot begin with space because it starts with the FST Type() which is not
|
||
|
// space).
|
||
|
return ReadCompactLattice(is, false, &t_);
|
||
|
} else if (c != 214) { // 214 is first char of FST magic number,
|
||
|
// on little-endian machines which is all we support (\326 octal)
|
||
|
KALDI_WARN << "Reading compact lattice: does not appear to be an FST "
|
||
|
<< " [non-space but no magic number detected], file pos is "
|
||
|
<< is.tellg();
|
||
|
return false;
|
||
|
} else {
|
||
|
return ReadCompactLattice(is, true, &t_);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) {
|
||
|
if (binary) {
|
||
|
fst::FstWriteOptions opts;
|
||
|
// Leave all the options default. Normally these lattices wouldn't have any
|
||
|
// osymbols/isymbols so no point directing it not to write them (who knows what
|
||
|
// we'd want to do if we had them).
|
||
|
return t.Write(os, opts);
|
||
|
} else {
|
||
|
// Text-mode output. Note: we expect that t.InputSymbols() and
|
||
|
// t.OutputSymbols() would always return NULL. The corresponding input
|
||
|
// routine would not work if the FST actually had symbols attached.
|
||
|
// Write a newline after the key, so the first line of the FST appears
|
||
|
// on its own line.
|
||
|
os << '\n';
|
||
|
bool acceptor = false, write_one = false;
|
||
|
fst::FstPrinter<LatticeArc> printer(t, t.InputSymbols(),
|
||
|
t.OutputSymbols(),
|
||
|
NULL, acceptor, write_one, "\t");
|
||
|
printer.Print(&os, "<unknown>");
|
||
|
if (os.fail())
|
||
|
KALDI_WARN << "Stream failure detected.";
|
||
|
// Write another newline as a terminating character. The read routine will
|
||
|
// detect this [this is a Kaldi mechanism, not somethig in the original
|
||
|
// OpenFst code].
|
||
|
os << '\n';
|
||
|
return os.good();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bool ReadLattice(std::istream &is, bool binary,
|
||
|
Lattice **lat) {
|
||
|
KALDI_ASSERT(*lat == NULL);
|
||
|
if (binary) {
|
||
|
fst::FstHeader hdr;
|
||
|
if (!hdr.Read(is, "<unknown>")) {
|
||
|
KALDI_WARN << "Reading lattice: error reading FST header.";
|
||
|
return false;
|
||
|
}
|
||
|
if (hdr.FstType() != "vector") {
|
||
|
KALDI_WARN << "Reading lattice: unsupported FST type: "
|
||
|
<< hdr.FstType();
|
||
|
return false;
|
||
|
}
|
||
|
fst::FstReadOptions ropts("<unspecified>",
|
||
|
&hdr);
|
||
|
|
||
|
typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<float>, int32> T1;
|
||
|
typedef fst::CompactLatticeWeightTpl<fst::LatticeWeightTpl<double>, int32> T2;
|
||
|
typedef fst::LatticeWeightTpl<float> T3;
|
||
|
typedef fst::LatticeWeightTpl<double> T4;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T1> > F1;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T2> > F2;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T3> > F3;
|
||
|
typedef fst::VectorFst<fst::ArcTpl<T4> > F4;
|
||
|
|
||
|
Lattice *ans = NULL;
|
||
|
if (hdr.ArcType() == T1::Type()) {
|
||
|
ans = ConvertToLattice(F1::Read(is, ropts));
|
||
|
} else if (hdr.ArcType() == T2::Type()) {
|
||
|
ans = ConvertToLattice(F2::Read(is, ropts));
|
||
|
} else if (hdr.ArcType() == T3::Type()) {
|
||
|
ans = ConvertToLattice(F3::Read(is, ropts));
|
||
|
} else if (hdr.ArcType() == T4::Type()) {
|
||
|
ans = ConvertToLattice(F4::Read(is, ropts));
|
||
|
} else {
|
||
|
KALDI_WARN << "FST with arc type " << hdr.ArcType()
|
||
|
<< " cannot be converted to Lattice.\n";
|
||
|
return false;
|
||
|
}
|
||
|
if (ans == NULL) {
|
||
|
KALDI_WARN << "Error reading lattice (after reading header).";
|
||
|
return false;
|
||
|
}
|
||
|
*lat = ans;
|
||
|
return true;
|
||
|
} else {
|
||
|
// The next line would normally consume the \r on Windows, plus any
|
||
|
// extra spaces that might have got in there somehow.
|
||
|
while (std::isspace(is.peek()) && is.peek() != '\n') is.get();
|
||
|
if (is.peek() == '\n') is.get(); // consume the newline.
|
||
|
else { // saw spaces but no newline.. this is not expected.
|
||
|
KALDI_WARN << "Reading compact lattice: unexpected sequence of spaces "
|
||
|
<< " at file position " << is.tellg();
|
||
|
return false;
|
||
|
}
|
||
|
*lat = ReadLatticeText(is); // that routine will warn on error.
|
||
|
return (*lat != NULL);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Since we don't write the binary headers for this type of holder,
|
||
|
we use a different method to work out whether we're in binary mode.
|
||
|
*/
|
||
|
bool LatticeHolder::Read(std::istream &is) {
|
||
|
Clear(); // in case anything currently stored.
|
||
|
int c = is.peek();
|
||
|
if (c == -1) {
|
||
|
KALDI_WARN << "End of stream detected reading Lattice.";
|
||
|
return false;
|
||
|
} else if (isspace(c)) { // The text form of the lattice begins
|
||
|
// with space (normally, '\n'), so this means it's text (the binary form
|
||
|
// cannot begin with space because it starts with the FST Type() which is not
|
||
|
// space).
|
||
|
return ReadLattice(is, false, &t_);
|
||
|
} else if (c != 214) { // 214 is first char of FST magic number,
|
||
|
// on little-endian machines which is all we support (\326 octal)
|
||
|
KALDI_WARN << "Reading compact lattice: does not appear to be an FST "
|
||
|
<< " [non-space but no magic number detected], file pos is "
|
||
|
<< is.tellg();
|
||
|
return false;
|
||
|
} else {
|
||
|
return ReadLattice(is, true, &t_);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
} // end namespace kaldi
|