You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
154 lines
5.9 KiB
154 lines
5.9 KiB
// bin/arpa2fst.cc
|
|
//
|
|
// Copyright 2009-2011 Gilles Boulianne.
|
|
//
|
|
// See ../../COPYING for clarification regarding multiple authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
|
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
|
// MERCHANTABILITY OR NON-INFRINGEMENT.
|
|
// See the Apache 2 License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include <string>
|
|
|
|
#include "lm/arpa-lm-compiler.h"
|
|
#include "util/kaldi-io.h"
|
|
#include "util/parse-options.h"
|
|
|
|
int main(int argc, char *argv[]) {
|
|
using namespace kaldi; // NOLINT
|
|
try {
|
|
const char *usage =
|
|
"Convert an ARPA format language model into an FST\n"
|
|
"Usage: arpa2fst [opts] <input-arpa> <output-fst>\n"
|
|
" e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table="
|
|
"data/lang/words.txt lm/input.arpa G.fst\n\n"
|
|
"Note: When called without switches, the output G.fst will "
|
|
"contain\n"
|
|
"an embedded symbol table. This is compatible with the way a "
|
|
"previous\n"
|
|
"version of arpa2fst worked.\n";
|
|
|
|
ParseOptions po(usage);
|
|
|
|
ArpaParseOptions options;
|
|
options.Register(&po);
|
|
|
|
// Option flags.
|
|
std::string bos_symbol = "<s>";
|
|
std::string eos_symbol = "</s>";
|
|
std::string disambig_symbol;
|
|
std::string read_syms_filename;
|
|
std::string write_syms_filename;
|
|
bool keep_symbols = false;
|
|
bool ilabel_sort = true;
|
|
|
|
po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol");
|
|
po.Register("eos-symbol", &eos_symbol, "End of sentence symbol");
|
|
po.Register(
|
|
"disambig-symbol",
|
|
&disambig_symbol,
|
|
"Disambiguator. If provided (e. g. #0), used on input side of "
|
|
"backoff links, and <s> and </s> are replaced with epsilons");
|
|
po.Register("read-symbol-table",
|
|
&read_syms_filename,
|
|
"Use existing symbol table");
|
|
po.Register("write-symbol-table",
|
|
&write_syms_filename,
|
|
"Write generated symbol table to a file");
|
|
po.Register(
|
|
"keep-symbols",
|
|
&keep_symbols,
|
|
"Store symbol table with FST. Symbols always saved to FST if "
|
|
"symbol tables are neither read or written (otherwise symbols "
|
|
"would be lost entirely)");
|
|
po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST");
|
|
|
|
po.Read(argc, argv);
|
|
|
|
if (po.NumArgs() != 1 && po.NumArgs() != 2) {
|
|
po.PrintUsage();
|
|
exit(1);
|
|
}
|
|
std::string arpa_rxfilename = po.GetArg(1),
|
|
fst_wxfilename = po.GetOptArg(2);
|
|
|
|
int64 disambig_symbol_id = 0;
|
|
|
|
fst::SymbolTable *symbols;
|
|
if (!read_syms_filename.empty()) {
|
|
// Use existing symbols. Required symbols must be in the table.
|
|
kaldi::Input kisym(read_syms_filename);
|
|
symbols = fst::SymbolTable::ReadText(
|
|
kisym.Stream(), PrintableWxfilename(read_syms_filename));
|
|
if (symbols == NULL)
|
|
KALDI_ERR << "Could not read symbol table from file "
|
|
<< read_syms_filename;
|
|
|
|
options.oov_handling = ArpaParseOptions::kSkipNGram;
|
|
if (!disambig_symbol.empty()) {
|
|
disambig_symbol_id = symbols->Find(disambig_symbol);
|
|
if (disambig_symbol_id == -1) // fst::kNoSymbol
|
|
KALDI_ERR << "Symbol table " << read_syms_filename
|
|
<< " has no symbol for " << disambig_symbol;
|
|
}
|
|
} else {
|
|
// Create a new symbol table and populate it from ARPA file.
|
|
symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename));
|
|
options.oov_handling = ArpaParseOptions::kAddToSymbols;
|
|
symbols->AddSymbol("<eps>", 0);
|
|
if (!disambig_symbol.empty()) {
|
|
disambig_symbol_id = symbols->AddSymbol(disambig_symbol);
|
|
}
|
|
}
|
|
|
|
// Add or use existing BOS and EOS.
|
|
options.bos_symbol = symbols->AddSymbol(bos_symbol);
|
|
options.eos_symbol = symbols->AddSymbol(eos_symbol);
|
|
|
|
// If producing new (not reading existing) symbols and not saving them,
|
|
// need to keep symbols with FST, otherwise they would be lost.
|
|
if (read_syms_filename.empty() && write_syms_filename.empty())
|
|
keep_symbols = true;
|
|
|
|
// Actually compile LM.
|
|
KALDI_ASSERT(symbols != NULL);
|
|
ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols);
|
|
{
|
|
Input ki(arpa_rxfilename);
|
|
lm_compiler.Read(ki.Stream());
|
|
}
|
|
|
|
// Sort the FST in-place if requested by options.
|
|
if (ilabel_sort) {
|
|
fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare());
|
|
}
|
|
|
|
// Write symbols if requested.
|
|
if (!write_syms_filename.empty()) {
|
|
kaldi::Output kosym(write_syms_filename, false);
|
|
symbols->WriteText(kosym.Stream());
|
|
}
|
|
|
|
// Write LM FST.
|
|
bool write_binary = true, write_header = false;
|
|
kaldi::Output kofst(fst_wxfilename, write_binary, write_header);
|
|
fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename));
|
|
wopts.write_isymbols = wopts.write_osymbols = keep_symbols;
|
|
lm_compiler.Fst().Write(kofst.Stream(), wopts);
|
|
|
|
delete symbols;
|
|
} catch (const std::exception &e) {
|
|
std::cerr << e.what();
|
|
return -1;
|
|
}
|
|
}
|