// bin/arpa2fst.cc // // Copyright 2009-2011 Gilles Boulianne. // // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABILITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include #include "lm/arpa-lm-compiler.h" #include "util/kaldi-io.h" #include "util/parse-options.h" int main(int argc, char *argv[]) { using namespace kaldi; // NOLINT try { const char *usage = "Convert an ARPA format language model into an FST\n" "Usage: arpa2fst [opts] \n" " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table=" "data/lang/words.txt lm/input.arpa G.fst\n\n" "Note: When called without switches, the output G.fst will " "contain\n" "an embedded symbol table. This is compatible with the way a " "previous\n" "version of arpa2fst worked.\n"; ParseOptions po(usage); ArpaParseOptions options; options.Register(&po); // Option flags. std::string bos_symbol = ""; std::string eos_symbol = ""; std::string disambig_symbol; std::string read_syms_filename; std::string write_syms_filename; bool keep_symbols = false; bool ilabel_sort = true; po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol"); po.Register("eos-symbol", &eos_symbol, "End of sentence symbol"); po.Register( "disambig-symbol", &disambig_symbol, "Disambiguator. If provided (e. g. #0), used on input side of " "backoff links, and and are replaced with epsilons"); po.Register("read-symbol-table", &read_syms_filename, "Use existing symbol table"); po.Register("write-symbol-table", &write_syms_filename, "Write generated symbol table to a file"); po.Register( "keep-symbols", &keep_symbols, "Store symbol table with FST. Symbols always saved to FST if " "symbol tables are neither read or written (otherwise symbols " "would be lost entirely)"); po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST"); po.Read(argc, argv); if (po.NumArgs() != 1 && po.NumArgs() != 2) { po.PrintUsage(); exit(1); } std::string arpa_rxfilename = po.GetArg(1), fst_wxfilename = po.GetOptArg(2); int64 disambig_symbol_id = 0; fst::SymbolTable *symbols; if (!read_syms_filename.empty()) { // Use existing symbols. Required symbols must be in the table. kaldi::Input kisym(read_syms_filename); symbols = fst::SymbolTable::ReadText( kisym.Stream(), PrintableWxfilename(read_syms_filename)); if (symbols == NULL) KALDI_ERR << "Could not read symbol table from file " << read_syms_filename; options.oov_handling = ArpaParseOptions::kSkipNGram; if (!disambig_symbol.empty()) { disambig_symbol_id = symbols->Find(disambig_symbol); if (disambig_symbol_id == -1) // fst::kNoSymbol KALDI_ERR << "Symbol table " << read_syms_filename << " has no symbol for " << disambig_symbol; } } else { // Create a new symbol table and populate it from ARPA file. symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename)); options.oov_handling = ArpaParseOptions::kAddToSymbols; symbols->AddSymbol("", 0); if (!disambig_symbol.empty()) { disambig_symbol_id = symbols->AddSymbol(disambig_symbol); } } // Add or use existing BOS and EOS. options.bos_symbol = symbols->AddSymbol(bos_symbol); options.eos_symbol = symbols->AddSymbol(eos_symbol); // If producing new (not reading existing) symbols and not saving them, // need to keep symbols with FST, otherwise they would be lost. if (read_syms_filename.empty() && write_syms_filename.empty()) keep_symbols = true; // Actually compile LM. KALDI_ASSERT(symbols != NULL); ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); { Input ki(arpa_rxfilename); lm_compiler.Read(ki.Stream()); } // Sort the FST in-place if requested by options. if (ilabel_sort) { fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); } // Write symbols if requested. if (!write_syms_filename.empty()) { kaldi::Output kosym(write_syms_filename, false); symbols->WriteText(kosym.Stream()); } // Write LM FST. bool write_binary = true, write_header = false; kaldi::Output kofst(fst_wxfilename, write_binary, write_header); fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); wopts.write_isymbols = wopts.write_osymbols = keep_symbols; lm_compiler.Fst().Write(kofst.Stream(), wopts); delete symbols; } catch (const std::exception &e) { std::cerr << e.what(); return -1; } }