From 15f434a5d3e20ab9535bc068b7fb60cf2eba655d Mon Sep 17 00:00:00 2001 From: Yang Zhou Date: Mon, 28 Mar 2022 19:57:55 +0800 Subject: [PATCH] add make TLG binary & script --- .../aishell/local/aishell_train_lms.sh | 59 ++++++ speechx/examples/aishell/mkgraph.sh | 31 +++ speechx/examples/aishell/path.sh | 14 ++ .../aishell/tools/fst/add_lex_disambig.pl | 195 ++++++++++++++++++ .../tools/fst/compile_lexicon_token_fst.sh | 86 ++++++++ .../aishell/tools/fst/ctc_token_fst.py | 24 +++ .../tools/fst/ctc_token_fst_compact.py | 21 ++ .../tools/fst/ctc_token_fst_corrected.py | 55 +++++ .../aishell/tools/fst/eps2disambig.pl | 29 +++ .../aishell/tools/fst/make_lexicon_fst.pl | 155 ++++++++++++++ .../examples/aishell/tools/fst/make_tlg.sh | 38 ++++ .../aishell/tools/fst/prepare_dict.py | 64 ++++++ .../examples/aishell/tools/fst/remove_oovs.pl | 43 ++++ .../aishell/tools/fst/rnnt_token_fst.py | 17 ++ speechx/examples/aishell/tools/fst/s2eps.pl | 27 +++ .../examples/aishell/tools/parse_options.sh | 97 +++++++++ .../decoder/offline_wfst_decoder_main.cc | 4 + speechx/speechx/frontend/normalizer.cc | 2 +- speechx/speechx/nnet/decodable.cc | 4 +- speechx/tools/fstbin/fstaddselfloops.cc | 100 +++++++++ speechx/tools/fstbin/fstdeterminizestar.cc | 114 ++++++++++ speechx/tools/fstbin/fstisstochastic.cc | 91 ++++++++ speechx/tools/fstbin/fstminimizeencoded.cc | 74 +++++++ speechx/tools/fstbin/fsttablecompose.cc | 133 ++++++++++++ speechx/tools/install_srilm.sh | 97 +++++++++ speechx/tools/lmbin/CMakeLists.txt | 5 + speechx/tools/lmbin/arpa2fst.cc | 145 +++++++++++++ 27 files changed, 1721 insertions(+), 3 deletions(-) create mode 100755 speechx/examples/aishell/local/aishell_train_lms.sh create mode 100644 speechx/examples/aishell/mkgraph.sh create mode 100644 speechx/examples/aishell/path.sh create mode 100755 speechx/examples/aishell/tools/fst/add_lex_disambig.pl create mode 100755 speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh create mode 100755 speechx/examples/aishell/tools/fst/ctc_token_fst.py create mode 100755 speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py create mode 100755 speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py create mode 100755 speechx/examples/aishell/tools/fst/eps2disambig.pl create mode 100755 speechx/examples/aishell/tools/fst/make_lexicon_fst.pl create mode 100755 speechx/examples/aishell/tools/fst/make_tlg.sh create mode 100755 speechx/examples/aishell/tools/fst/prepare_dict.py create mode 100755 speechx/examples/aishell/tools/fst/remove_oovs.pl create mode 100755 speechx/examples/aishell/tools/fst/rnnt_token_fst.py create mode 100755 speechx/examples/aishell/tools/fst/s2eps.pl create mode 100755 speechx/examples/aishell/tools/parse_options.sh create mode 100644 speechx/tools/fstbin/fstaddselfloops.cc create mode 100644 speechx/tools/fstbin/fstdeterminizestar.cc create mode 100644 speechx/tools/fstbin/fstisstochastic.cc create mode 100644 speechx/tools/fstbin/fstminimizeencoded.cc create mode 100644 speechx/tools/fstbin/fsttablecompose.cc create mode 100755 speechx/tools/install_srilm.sh create mode 100644 speechx/tools/lmbin/CMakeLists.txt create mode 100644 speechx/tools/lmbin/arpa2fst.cc diff --git a/speechx/examples/aishell/local/aishell_train_lms.sh b/speechx/examples/aishell/local/aishell_train_lms.sh new file mode 100755 index 00000000..30ffb797 --- /dev/null +++ b/speechx/examples/aishell/local/aishell_train_lms.sh @@ -0,0 +1,59 @@ +#!/bin/bash + + +# To be run from one directory above this script. +. 
./path.sh + +text=data/local/lm/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# Check SRILM tools +if ! which ngram-count > /dev/null; then + echo "srilm tools are not found, please download it and install it from: " + echo "http://www.speech.sri.com/projects/srilm/download.html" + echo "Then add the tools to your PATH" + exit 1 +fi + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. +# It takes as input the files +# data/local/lm/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist + +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +mkdir -p $dir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/train + +ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa +ngram -lm $dir/lm.arpa -ppl $dir/heldout diff --git a/speechx/examples/aishell/mkgraph.sh b/speechx/examples/aishell/mkgraph.sh new file mode 100644 index 00000000..f66cd4dc --- /dev/null +++ b/speechx/examples/aishell/mkgraph.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +. ./path.sh || exit 1; + +. tools/parse_options.sh || exit 1; + +data=/mnt/dataset/aishell + +# Optionally, you can add LM and test it with runtime. +dir=./ds2_graph +dict=$dir/vocab.txt +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + # 7.1 Prepare dict + unit_file=$dict + mkdir -p $dir/local/dict + cp $unit_file $dir/local/dict/units.txt + tools/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \ + $dir/local/dict/lexicon.txt + # Train lm + lm=$dir/local/lm + mkdir -p $lm + tools/filter_scp.pl data/train/text \ + $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text + local/ds2_aishell_train_lms.sh + # Build decoding TLG + tools/fst/compile_lexicon_token_fst.sh \ + $dir/local/dict $dir/local/tmp $dir/local/lang + tools/fst/make_tlg.sh $dir/local/lm $dir/local/lang $dir/lang_test || exit 1; +fi + + diff --git a/speechx/examples/aishell/path.sh b/speechx/examples/aishell/path.sh new file mode 100644 index 00000000..8ab7ee29 --- /dev/null +++ b/speechx/examples/aishell/path.sh @@ -0,0 +1,14 @@ +# This contains the locations of binarys build required for running the examples. + +SPEECHX_ROOT=$PWD/../.. +SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. 
please ensure that the project build successfully"; } + +export LC_AL=C + +SPEECHX_BIN=$SPEECHX_EXAMPLES/feat +export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN diff --git a/speechx/examples/aishell/tools/fst/add_lex_disambig.pl b/speechx/examples/aishell/tools/fst/add_lex_disambig.pl new file mode 100755 index 00000000..dd8a25de --- /dev/null +++ b/speechx/examples/aishell/tools/fst/add_lex_disambig.pl @@ -0,0 +1,195 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation +# 2013-2016 Johns Hopkins University (author: Daniel Povey) +# 2015 Hainan Xu +# 2015 Guoguo Chen + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Adds disambiguation symbols to a lexicon. +# Outputs still in the normal lexicon format. +# Disambig syms are numbered #1, #2, #3, etc. (#0 +# reserved for symbol in grammar). +# Outputs the number of disambig syms to the standard output. +# With the --pron-probs option, expects the second field +# of each lexicon line to be a pron-prob. +# With the --sil-probs option, expects three additional +# fields after the pron-prob, representing various components +# of the silence probability model. + +$pron_probs = 0; +$sil_probs = 0; +$first_allowed_disambig = 1; + +for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { + if ($ARGV[0] eq "--pron-probs") { + $pron_probs = 1; + shift @ARGV; + } + if ($ARGV[0] eq "--sil-probs") { + $sil_probs = 1; + shift @ARGV; + } + if ($ARGV[0] eq "--first-allowed-disambig") { + $first_allowed_disambig = 0 + $ARGV[1]; + if ($first_allowed_disambig < 1) { + die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; + } + shift @ARGV; + shift @ARGV; + } +} + +if (@ARGV != 2) { + die "Usage: add_lex_disambig.pl [opts] \n" . + "This script adds disambiguation symbols to a lexicon in order to\n" . + "make decoding graphs determinizable; it adds pseudo-phone\n" . + "disambiguation symbols #1, #2 and so on at the ends of phones\n" . + "to ensure that all pronunciations are different, and that none\n" . + "is a prefix of another.\n" . + "It prints to the standard output the number of the largest-numbered" . + "disambiguation symbol that was used.\n" . + "\n" . + "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . + " --sil-probs [should be with --pron-probs option]\n" . + " Expect 3 extra fields after the pron-probs, for aspects of\n" . + " the silence probability model\n" . + " --first-allowed-disambig The number of the first disambiguation symbol\n" . + " that this script is allowed to add. By default this is\n" . + " #1, but you can set this to a larger value using this option.\n" . + "e.g.:\n" . + " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . + " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . 
+ " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; +} + + +$lexfn = shift @ARGV; +$lexoutfn = shift @ARGV; + +open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; + +# (1) Read in the lexicon. +@L = ( ); +while() { + @A = split(" ", $_); + push @L, join(" ", @A); +} + +# (2) Work out the count of each phone-sequence in the +# lexicon. + +foreach $l (@L) { + @A = split(" ", $l); + shift @A; # Remove word. + if ($pron_probs) { + $p = shift @A; + if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } + } + if ($sil_probs) { + $silp = shift @A; + if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } + $correction = shift @A; + if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } + $correction = shift @A; + if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } + } + if (!(@A)) { + die "Bad lexicon line $1, no phone in phone list"; + } + $count{join(" ",@A)}++; +} + +# (3) For each left sub-sequence of each phone-sequence, note down +# that it exists (for identifying prefixes of longer strings). + +foreach $l (@L) { + @A = split(" ", $l); + shift @A; # Remove word. + if ($pron_probs) { shift @A; } # remove pron-prob. + if ($sil_probs) { + shift @A; # Remove silprob + shift @A; # Remove silprob + } + while(@A > 0) { + pop @A; # Remove last phone + $issubseq{join(" ",@A)} = 1; + } +} + +# (4) For each entry in the lexicon: +# if the phone sequence is unique and is not a +# prefix of another word, no diambig symbol. +# Else output #1, or #2, #3, ... if the same phone-seq +# has already been assigned a disambig symbol. + + +open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; + +# max_disambig will always be the highest-numbered disambiguation symbol that +# has been used so far. +$max_disambig = $first_allowed_disambig - 1; + +foreach $l (@L) { + @A = split(" ", $l); + $word = shift @A; + if ($pron_probs) { + $pron_prob = shift @A; + } + if ($sil_probs) { + $sil_word_prob = shift @A; + $word_sil_correction = shift @A; + $prev_nonsil_correction = shift @A + } + $phnseq = join(" ", @A); + if (!defined $issubseq{$phnseq} + && $count{$phnseq} == 1) { + ; # Do nothing. + } else { + if ($phnseq eq "") { # need disambig symbols for the empty string + # that are not use anywhere else. + $max_disambig++; + $reserved_for_the_empty_string{$max_disambig} = 1; + $phnseq = "#$max_disambig"; + } else { + $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; + if (!defined $cur_disambig) { + $cur_disambig = $first_allowed_disambig; + } else { + $cur_disambig++; # Get a number that has not been used yet for + # this phone sequence. + } + while (defined $reserved_for_the_empty_string{$cur_disambig}) { + $cur_disambig++; + } + if ($cur_disambig > $max_disambig) { + $max_disambig = $cur_disambig; + } + $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; + $phnseq = $phnseq . " #" . $cur_disambig; + } + } + if ($pron_probs) { + if ($sil_probs) { + print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; + } else { + print O "$word\t$pron_prob\t$phnseq\n"; + } + } else { + print O "$word\t$phnseq\n"; + } +} + +print $max_disambig . 
"\n"; diff --git a/speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh b/speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh new file mode 100755 index 00000000..fda971fe --- /dev/null +++ b/speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyright 2015 Yajie Miao (Carnegie Mellon University) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the +# phoneme and character-based lexicons. +set -eo pipefail +. tools/parse_options.sh + +if [ $# -ne 3 ]; then + echo "usage: tools/fst/compile_lexicon_token_fst.sh " + echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" + echo " should contain the following files:" + echo "lexicon.txt lexicon_numbers.txt units.txt" + echo "options: " + exit 1; +fi + +srcdir=$1 +tmpdir=$2 +dir=$3 +mkdir -p $dir $tmpdir + +[ -f path.sh ] && . ./path.sh + +cp $srcdir/units.txt $dir + +# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. +# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. +perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; + +# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. +# Without these symbols, determinization will fail. +ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` +ndisambig=$[$ndisambig+1]; + +( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list + +# Get the full list of CTC tokens used in FST. These tokens include , the blank , +# the actual model unit, and the disambiguation symbols. +cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list +(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt + +# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, +# so here just use simple ctc_token_fst +tools/fst/ctc_token_fst.py $dir/tokens.txt | \ + fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; + +# Encode the words with indices. Will be used in lexicon and language model FST compiling. +cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk ' + BEGIN { + print " 0"; + } + { + printf("%s %d\n", $1, NR); + } + END { + printf("#0 %d\n", NR+1); + printf(" %d\n", NR+2); + printf(" %d\n", NR+3); + }' > $dir/words.txt || exit 1; + +# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. 
+token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` +word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` + +tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ + fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; + +echo "Lexicon and token FSTs compiling succeeded" diff --git a/speechx/examples/aishell/tools/fst/ctc_token_fst.py b/speechx/examples/aishell/tools/fst/ctc_token_fst.py new file mode 100755 index 00000000..048734c8 --- /dev/null +++ b/speechx/examples/aishell/tools/fst/ctc_token_fst.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python + +import sys + +print('0 1 ') +print('1 1 ') +print('2 2 ') +print('2 0 ') + +with open(sys.argv[1], 'r') as fin: + node = 3 + for entry in fin: + fields = entry.strip().split(' ') + phone = fields[0] + if phone == '' or phone == '': + continue + elif '#' in phone: # disambiguous phone + print('{} {} {} {}'.format(0, 0, '', phone)) + else: + print('{} {} {} {}'.format(1, node, phone, phone)) + print('{} {} {} {}'.format(node, node, phone, '')) + print('{} {} {} {}'.format(node, 2, '', '')) + node += 1 +print('0') diff --git a/speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py b/speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py new file mode 100755 index 00000000..d3018d8b --- /dev/null +++ b/speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python + +import sys + +print('0 0 ') + +with open(sys.argv[1], 'r', encoding='utf8') as fin: + node = 1 + for entry in fin: + fields = entry.strip().split(' ') + phone = fields[0] + if phone == '' or phone == '': + continue + elif '#' in phone: # disambiguous phone + print('{} {} {} {}'.format(0, 0, '', phone)) + else: + print('{} {} {} {}'.format(0, node, phone, phone)) + print('{} {} {} {}'.format(node, node, phone, '')) + print('{} {} {} {}'.format(node, 0, '', '')) + node += 1 +print('0') diff --git a/speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py b/speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py new file mode 100755 index 00000000..81f7079e --- /dev/null +++ b/speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +import sys + + +def il(n): + return n + 1 + + +def ol(n): + return n + 1 + + +def s(n): + return n + + +if __name__ == "__main__": + with open(sys.argv[1]) as f: + lines = f.readlines() + phone_count = 0 + disambig_count = 0 + for line in lines: + sp = line.split() + phone = sp[0] + if phone == '' or phone == '': + continue + if phone.startswith('#'): + disambig_count += 1 + else: + phone_count += 1 + + # 1. add start state + print('0 0 {} 0'.format(il(0))) + + # 2. 0 -> i, i -> i, i -> 0 + for i in range(1, phone_count + 1): + print('0 {} {} {}'.format(s(i), il(i), ol(i))) + print('{} {} {} 0'.format(s(i), s(i), il(i))) + print('{} 0 {} 0'.format(s(i), il(0))) + + # 3. i -> other phone + for i in range(1, phone_count + 1): + for j in range(1, phone_count + 1): + if i != j: + print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) + + # 4. add disambiguous arcs on every final state + for i in range(0, phone_count + 1): + for j in range(phone_count + 2, phone_count + disambig_count + 2): + print('{} {} {} {}'.format(s(i), s(i), 0, j)) + + # 5. 
every i is final state + for i in range(0, phone_count + 1): + print(s(i)) diff --git a/speechx/examples/aishell/tools/fst/eps2disambig.pl b/speechx/examples/aishell/tools/fst/eps2disambig.pl new file mode 100755 index 00000000..e1d84a6b --- /dev/null +++ b/speechx/examples/aishell/tools/fst/eps2disambig.pl @@ -0,0 +1,29 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation +# 2015 Guoguo Chen + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script replaces epsilon with #0 on the input side only, of the G.fst +# acceptor. + +while(<>){ + if (/\s+#0\s+/) { + print STDERR "$0: ERROR: LM has word #0, " . + "which is reserved as disambiguation symbol\n"; + exit 1; + } + s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; + print; +} diff --git a/speechx/examples/aishell/tools/fst/make_lexicon_fst.pl b/speechx/examples/aishell/tools/fst/make_lexicon_fst.pl new file mode 100755 index 00000000..f97129c0 --- /dev/null +++ b/speechx/examples/aishell/tools/fst/make_lexicon_fst.pl @@ -0,0 +1,155 @@ +#!/usr/bin/env perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2010-2011 Microsoft Corporation +# 2013 Johns Hopkins University (author: Daniel Povey) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). + +$pron_probs = 0; + +if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { + $pron_probs = 1; + shift @ARGV; +} + +if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { + print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; + print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; + print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; + print STDERR " word phone1 phone2 ... phoneN;\n"; + print STDERR "if the --pron-probs option is used, each line is:\n"; + print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; + print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; + print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; + print STDERR "this is your responsibility.\n\n"; + print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; + print STDERR "when creating a lexicon with disambiguation symbols, e.g. 
L_disambig.fst,\n"; + print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; + exit(1); +} + +$lexfn = shift @ARGV; +if (@ARGV == 0) { + $silprob = 0.0; +} elsif (@ARGV == 2) { + ($silprob,$silphone) = @ARGV; +} else { + ($silprob,$silphone,$sildisambig) = @ARGV; +} +if ($silprob != 0.0) { + $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; + $silcost = -log($silprob); + $nosilcost = -log(1.0 - $silprob); +} + + +open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; + + +if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. + $loopstate = 0; + $nextstate = 1; # next unallocated state. + while () { + @A = split(" ", $_); + @A == 0 && die "Empty lexicon line."; + foreach $a (@A) { + if ($a eq "") { + die "Bad lexicon line $_ ( is forbidden)"; + } + } + $w = shift @A; + if (! $pron_probs) { + $pron_cost = 0.0; + } else { + $pron_prob = shift @A; + if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { + die "Bad pronunciation probability in line $_"; + } + $pron_cost = -log($pron_prob); + } + if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } + + $s = $loopstate; + $word_or_eps = $w; + while (@A > 0) { + $p = shift @A; + if (@A > 0) { + $ns = $nextstate++; + } else { + $ns = $loopstate; + } + print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; + $word_or_eps = ""; + $pron_cost_string = ""; # so we only print it on the first arc of the word. + $s = $ns; + } + } + print "$loopstate\t0\n"; # final-cost. +} else { # have silence probs. + $startstate = 0; + $loopstate = 1; + $silstate = 2; # state from where we go to loopstate after emitting silence. + print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. + if (!defined $sildisambig) { + print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. + print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. + $nextstate = 3; + } else { + $disambigstate = 3; + $nextstate = 4; + print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. + print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. + print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. + } + while () { + @A = split(" ", $_); + $w = shift @A; + if (! $pron_probs) { + $pron_cost = 0.0; + } else { + $pron_prob = shift @A; + if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { + die "Bad pronunciation probability in line $_"; + } + $pron_cost = -log($pron_prob); + } + if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } + $s = $loopstate; + $word_or_eps = $w; + while (@A > 0) { + $p = shift @A; + if (@A > 0) { + $ns = $nextstate++; + print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; + $word_or_eps = ""; + $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. + $s = $ns; + } elsif (!defined($silphone) || $p ne $silphone) { + # This is non-deterministic but relatively compact, + # and avoids epsilons. + $local_nosilcost = $nosilcost + $pron_cost; + $local_silcost = $silcost + $pron_cost; + print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; + print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; + } else { + # no point putting opt-sil after silence word. + print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; + } + } + } + print "$loopstate\t0\n"; # final-cost. 
+} diff --git a/speechx/examples/aishell/tools/fst/make_tlg.sh b/speechx/examples/aishell/tools/fst/make_tlg.sh new file mode 100755 index 00000000..98694e55 --- /dev/null +++ b/speechx/examples/aishell/tools/fst/make_tlg.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# + +if [ -f path.sh ]; then . path.sh; fi + +lm_dir=$1 +src_lang=$2 +tgt_lang=$3 + +arpa_lm=${lm_dir}/lm.arpa +[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; + +rm -rf $tgt_lang +cp -r $src_lang $tgt_lang + +# Compose the language model to FST +cat $arpa_lm | \ + grep -v ' ' | \ + grep -v ' ' | \ + grep -v ' ' | \ + grep -v -i '' | \ + grep -v -i '' | \ + arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ + tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ + --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst + + +echo "Checking how stochastic G is (the first of these numbers should be small):" +fstisstochastic $tgt_lang/G.fst + +# Compose the token, lexicon and language-model FST into the final decoding graph +fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ + fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; +fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; + +echo "Composing decoding graph TLG.fst succeeded" +#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/speechx/examples/aishell/tools/fst/prepare_dict.py b/speechx/examples/aishell/tools/fst/prepare_dict.py new file mode 100755 index 00000000..b6b92ea1 --- /dev/null +++ b/speechx/examples/aishell/tools/fst/prepare_dict.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +import sys + +# sys.argv[1]: e2e model unit file(lang_char.txt) +# sys.argv[2]: raw lexicon file +# sys.argv[3]: output lexicon file +# sys.argv[4]: bpemodel + +unit_table = set() +with open(sys.argv[1], 'r', encoding='utf8') as fin: + for line in fin: + unit = line.split()[0] + unit_table.add(unit) + + +def contain_oov(units): + for unit in units: + if unit not in unit_table: + return True + return False + + +bpemode = len(sys.argv) > 4 +if bpemode: + import sentencepiece as spm + sp = spm.SentencePieceProcessor() + sp.Load(sys.argv[4]) +lexicon_table = set() +with open(sys.argv[2], 'r', encoding='utf8') as fin, \ + open(sys.argv[3], 'w', encoding='utf8') as fout: + for line in fin: + word = line.split()[0] + if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel + continue + elif word == '': + continue + else: + # each word only has one pronunciation for e2e system + if word in lexicon_table: + continue + if bpemode: + pieces = sp.EncodeAsPieces(word) + if contain_oov(pieces): + print( + 'Ignoring words {}, which contains oov unit'.format( + ''.join(word).strip('▁')) + ) + continue + chars = ' '.join( + [p if p in unit_table else '' for p in pieces]) + else: + # ignore words with OOV + if contain_oov(word): + print('Ignoring words {}, which contains oov unit'.format(word)) + continue + # Optional, append ▁ in front of english word + # we assume the model unit of our e2e system is char now. 
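+                # e.g. "OK" becomes "▁OK", so that ' '.join below yields "▁ O K".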
+ if word.encode('utf8').isalpha() and '▁' in unit_table: + word = '▁' + word + chars = ' '.join(word) # word is a char list + fout.write('{} {}\n'.format(word, chars)) + lexicon_table.add(word) diff --git a/speechx/examples/aishell/tools/fst/remove_oovs.pl b/speechx/examples/aishell/tools/fst/remove_oovs.pl new file mode 100755 index 00000000..ac914c3b --- /dev/null +++ b/speechx/examples/aishell/tools/fst/remove_oovs.pl @@ -0,0 +1,43 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script removes lines that contain these OOVs on either the +# third or fourth fields of the line. It is intended to remove arcs +# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). + +if ( @ARGV < 1 && @ARGV > 2) { + die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; +} + +$unklist = shift @ARGV; +open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; +while(){ + @A = split(" ", $_); + @A == 1 || die "Bad line in unknown-symbol list: $_"; + $unk{$A[0]} = 1; +} + +$num_removed = 0; +while(<>){ + @A = split(" ", $_); + if(defined $unk{$A[2]} || defined $unk{$A[3]}) { + $num_removed++; + } else { + print; + } +} +print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; + diff --git a/speechx/examples/aishell/tools/fst/rnnt_token_fst.py b/speechx/examples/aishell/tools/fst/rnnt_token_fst.py new file mode 100755 index 00000000..cc6def17 --- /dev/null +++ b/speechx/examples/aishell/tools/fst/rnnt_token_fst.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +import sys + +print('0 0 ') + +with open(sys.argv[1], 'r', encoding='utf8') as fin: + for entry in fin: + fields = entry.strip().split(' ') + phone = fields[0] + if phone == '' or phone == '': + continue + elif '#' in phone: # disambiguous phone + print('{} {} {} {}'.format(0, 0, '', phone)) + else: + print('{} {} {} {}'.format(0, 0, phone, phone)) +print('0') diff --git a/speechx/examples/aishell/tools/fst/s2eps.pl b/speechx/examples/aishell/tools/fst/s2eps.pl new file mode 100755 index 00000000..ffeeb8eb --- /dev/null +++ b/speechx/examples/aishell/tools/fst/s2eps.pl @@ -0,0 +1,27 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script replaces and with (on both input and output sides), +# for the G.fst acceptor. 
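+# (The symbols being replaced are the sentence-boundary markers <s> and </s>;
+# both become <eps> so the grammar acceptor has no explicit sentence-boundary labels.)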
+ +while(<>){ + @A = split(" ", $_); + if ( @A >= 4 ) { + if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } + if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } + } + print join("\t", @A) . "\n"; +} diff --git a/speechx/examples/aishell/tools/parse_options.sh b/speechx/examples/aishell/tools/parse_options.sh new file mode 100755 index 00000000..34476fdb --- /dev/null +++ b/speechx/examples/aishell/tools/parse_options.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); +# Arnab Ghoshal, Karel Vesely + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Parse command-line options. +# To be sourced by another script (as in ". parse_options.sh"). +# Option format is: --option-name arg +# and shell variable "option_name" gets set to value "arg." +# The exception is --help, which takes no arguments, but prints the +# $help_message variable (if defined). + + +### +### The --config file options have lower priority to command line +### options, so we need to import them first... +### + +# Now import all the configs specified by command-line, in left-to-right order +for ((argpos=1; argpos<$#; argpos++)); do + if [ "${!argpos}" == "--config" ]; then + argpos_plus1=$((argpos+1)) + config=${!argpos_plus1} + [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 + . $config # source the config file. + fi +done + + +### +### No we process the command line options +### +while true; do + [ -z "${1:-}" ] && break; # break if there are no arguments + case "$1" in + # If the enclosing script is called with --help option, print the help + # message and exit. Scripts should put help messages in $help_message + --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; + else printf "$help_message\n" 1>&2 ; fi; + exit 0 ;; + --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" + exit 1 ;; + # If the first command-line argument begins with "--" (e.g. --foo-bar), + # then work out the variable name as $name, which will equal "foo_bar". + --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; + # Next we test whether the variable in question is undefned-- if so it's + # an invalid option and we die. Note: $0 evaluates to the name of the + # enclosing script. + # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar + # is undefined. We then have to wrap this test inside "eval" because + # foo_bar is itself inside a variable ($name). + eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; + + oldval="`eval echo \\$$name`"; + # Work out whether we seem to be expecting a Boolean argument. + if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then + was_bool=true; + else + was_bool=false; + fi + + # Set the variable to the right value-- the escaped quotes make it work if + # the option had spaces, like --cmd "queue.pl -sync y" + eval $name=\"$2\"; + + # Check that Boolean-valued arguments are really Boolean. 
+ if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then + echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 + exit 1; + fi + shift 2; + ;; + *) break; + esac +done + + +# Check for an empty argument to the --cmd option, which can easily occur as a +# result of scripting errors. +[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; + + +true; # so this script returns exit code 0. diff --git a/speechx/examples/decoder/offline_wfst_decoder_main.cc b/speechx/examples/decoder/offline_wfst_decoder_main.cc index 7c1f6226..f0b9cc4f 100644 --- a/speechx/examples/decoder/offline_wfst_decoder_main.cc +++ b/speechx/examples/decoder/offline_wfst_decoder_main.cc @@ -27,6 +27,8 @@ DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string(word_symbol_table, "vocab.txt", "word symbol table"); DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); +DEFINE_int32(max_active, 5000, "decoder graph"); using kaldi::BaseFloat; @@ -49,11 +51,13 @@ int main(int argc, char* argv[]) { ppspeech::TLGDecoderOptions opts; opts.word_symbol_table = word_symbol_table; opts.fst_path = graph_path; + opts.opts.max_active = FLAGS_max_active; ppspeech::TLGDecoder decoder(opts); ppspeech::ModelOptions model_opts; model_opts.model_path = model_graph; model_opts.params_path = model_params; + model_opts.cache_shape = "5-1-1024,5-1-1024"; std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data( diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/normalizer.cc index 1adddb40..52412561 100644 --- a/speechx/speechx/frontend/normalizer.cc +++ b/speechx/speechx/frontend/normalizer.cc @@ -107,7 +107,7 @@ void CMVN::Accept(const kaldi::VectorBase& inputs) { } bool CMVN::Read(kaldi::Vector* feats) { - if (base_extractor_->Read(feats) == false) { + if (base_extractor_->Read(feats) == false || feats->Dim() == 0) { return false; } Compute(feats); diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc index 0a5e0a77..b0f82e0c 100644 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -36,9 +36,9 @@ int32 Decodable::NumFramesReady() const { } bool Decodable::IsLastFrame(int32 frame) { - CHECK_LE(frame, frames_ready_); bool flag = EnsureFrameHaveComputed(frame); - return (flag == false) && (frame == frames_ready_ - 1); + //CHECK_LE(frame, frames_ready_); + return (flag == false) && (frame == frames_ready_); } int32 Decodable::NumIndices() const { return 0; } diff --git a/speechx/tools/fstbin/fstaddselfloops.cc b/speechx/tools/fstbin/fstaddselfloops.cc new file mode 100644 index 00000000..145bf006 --- /dev/null +++ b/speechx/tools/fstbin/fstaddselfloops.cc @@ -0,0 +1,100 @@ +// fstbin/fstaddselfloops.cc + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "fst/fstlib.h" +#include "fstext/determinize-star.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" +#include "util/parse-options.h" +#include "util/simple-io-funcs.h" + +/* some test examples: + pushd ~/tmpdir + ( echo 3; echo 4) > in.list + ( echo 5; echo 6) > out.list + ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstaddselfloops in.list out.list + | fstprint ( echo "0 1 0 1"; echo " 0 2 1 0"; echo "1 0"; echo "2 0"; ) | + fstcompile | fstaddselfloops in.list out.list | fstprint +*/ + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; // NOLINT + using namespace fst; // NOLINT + using kaldi::int32; + + const char *usage = + "Adds self-loops to states of an FST to propagate disambiguation " + "symbols through it\n" + "They are added on each final state and each state with non-epsilon " + "output symbols\n" + "on at least one arc out of the state. Useful in conjunction with " + "predeterminize\n" + "\n" + "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst " + "[out.fst] ]\n" + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; + + ParseOptions po(usage); + po.Read(argc, argv); + + if (po.NumArgs() < 2 || po.NumArgs() > 4) { + po.PrintUsage(); + exit(1); + } + + std::string disambig_in_rxfilename = po.GetArg(1), + disambig_out_rxfilename = po.GetArg(2), + fst_in_filename = po.GetOptArg(3), + fst_out_filename = po.GetOptArg(4); + + VectorFst *fst = ReadFstKaldi(fst_in_filename); + + std::vector disambig_in; + if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) + KALDI_ERR + << "fstaddselfloops: Could not read disambiguation symbols from " + << kaldi::PrintableRxfilename(disambig_in_rxfilename); + + std::vector disambig_out; + if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) + KALDI_ERR + << "fstaddselfloops: Could not read disambiguation symbols from " + << kaldi::PrintableRxfilename(disambig_out_rxfilename); + + if (disambig_in.size() != disambig_out.size()) + KALDI_ERR + << "fstaddselfloops: mismatch in size of disambiguation symbols"; + + AddSelfLoops(fst, disambig_in, disambig_out); + + WriteFstKaldi(*fst, fst_out_filename); + + delete fst; + + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } + return 0; +} diff --git a/speechx/tools/fstbin/fstdeterminizestar.cc b/speechx/tools/fstbin/fstdeterminizestar.cc new file mode 100644 index 00000000..e8181430 --- /dev/null +++ b/speechx/tools/fstbin/fstdeterminizestar.cc @@ -0,0 +1,114 @@ +// fstbin/fstdeterminizestar.cc + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "fst/fstlib.h" +#include "fstext/determinize-star.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" +#include "util/parse-options.h" +#if !defined(_MSC_VER) && !defined(__APPLE__) +#include // Comment this line and the call to signal below if +// it causes compilation problems. It is only to enable a debugging procedure +// when determinization does not terminate. We are disabling this code if +// compiling on Windows because signal.h is not available there, and on +// MacOS due to a problem with in the initial release of Sierra. +#endif + +/* some test examples: + ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint + ( echo "0 0 1 0"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint + ( echo "0 0 1 0"; echo "0 1 1 0"; echo "0 0" ) | fstcompile | + fstdeterminizestar | fstprint # this last one fails [correctly]: ( echo "0 0 0 + 1"; echo "0 0" ) | fstcompile | fstdeterminizestar | fstprint + + cd ~/tmpdir + while true; do + fstrand > 1.fst + fstpredeterminize out.lst 1.fst | fstdeterminizestar | fstrmsymbols out.lst + > 2.fst fstequivalent --random=true 1.fst 2.fst || echo "Test failed" echo -n + "." done + + Test of debugging [with non-determinizable input]: + ( echo " 0 0 1 0 1.0"; echo "0 1 1 0"; echo "1 1 1 0 0"; echo "0 2 2 0"; echo + "2"; echo "1" ) | fstcompile | fstdeterminizestar kill -SIGUSR1 [the process-id + of fstdeterminizestar] # prints out a bunch of debugging output showing the + mess it got itself into. +*/ + +bool debug_location = false; +void signal_handler(int) { debug_location = true; } + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; // NOLINT + using namespace fst; // NOLINT + using kaldi::int32; + + const char *usage = + "Removes epsilons and determinizes in one step\n" + "\n" + "Usage: fstdeterminizestar [in.fst [out.fst] ]\n" + "\n" + "See also: fstdeterminizelog, lattice-determinize\n"; + + float delta = kDelta; + int max_states = -1; + bool use_log = false; + ParseOptions po(usage); + po.Register("use-log", &use_log, "Determinize in log semiring."); + po.Register("delta", &delta, + "Delta value used to determine equivalence of weights."); + po.Register( + "max-states", &max_states, + "Maximum number of states in determinized FST before it will abort."); + po.Read(argc, argv); + + if (po.NumArgs() > 2) { + po.PrintUsage(); + exit(1); + } + + std::string fst_in_str = po.GetOptArg(1), fst_out_str = po.GetOptArg(2); + + // This enables us to get traceback info from determinization that is + // not seeming to terminate. +#if !defined(_MSC_VER) && !defined(__APPLE__) + signal(SIGUSR1, signal_handler); +#endif + // Normal case: just files. + VectorFst *fst = ReadFstKaldi(fst_in_str); + + ArcSort(fst, ILabelCompare()); // improves speed. + if (use_log) { + DeterminizeStarInLog(fst, delta, &debug_location, max_states); + } else { + VectorFst det_fst; + DeterminizeStar(*fst, &det_fst, delta, &debug_location, max_states); + *fst = det_fst; // will do shallow copy and then det_fst goes + // out of scope anyway. 
+ } + WriteFstKaldi(*fst, fst_out_str); + delete fst; + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/speechx/tools/fstbin/fstisstochastic.cc b/speechx/tools/fstbin/fstisstochastic.cc new file mode 100644 index 00000000..468ed0da --- /dev/null +++ b/speechx/tools/fstbin/fstisstochastic.cc @@ -0,0 +1,91 @@ +// fstbin/fstisstochastic.cc + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "fst/fstlib.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" +#include "util/kaldi-io.h" +#include "util/parse-options.h" + +// e.g. of test: +// echo " 0 0" | fstcompile | fstisstochastic +// should return 0 and print "0 0" [meaning, min and +// max weight are one = exp(0)] +// echo " 0 1" | fstcompile | fstisstochastic +// should return 1, not stochastic, and print 1 1 +// (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | +// fstcompile | fstisstochastic should return 0, stochastic; it prints "0 +// -1.78e-07" for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo +// "1 0" ) | fstcompile | fstisstochastic --test-in-log=false should return 1, +// not stochastic in tropical; it prints "0 0.693147" for me (echo "0 0 0 0 0 "; +// echo "0 1 0 0 0 "; echo "1 0" ) | fstcompile | fstisstochastic +// --test-in-log=false should return 0, stochastic in tropical; it prints "0 0" +// for me (echo "0 0 0 0 0.693147 "; echo "0 1 0 0 0.693147 "; echo "1 0" ) | +// fstcompile | fstisstochastic --test-in-log=false --delta=1 returns 0 even +// though not stochastic because we gave it an absurdly large delta. 
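+// In this patch, make_tlg.sh runs `fstisstochastic G.fst` after compiling the
+// grammar, as a sanity check that the LM weights are (approximately) normalized.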
+ +int main(int argc, char *argv[]) { + try { + using namespace kaldi; // NOLINT + using namespace fst; // NOLINT + using kaldi::int32; + + const char *usage = + "Checks whether an FST is stochastic and exits with success if so.\n" + "Prints out maximum error (in log units).\n" + "\n" + "Usage: fstisstochastic [ in.fst ]\n"; + + float delta = 0.01; + bool test_in_log = true; + + ParseOptions po(usage); + po.Register("delta", &delta, "Maximum error to accept."); + po.Register("test-in-log", &test_in_log, + "Test stochasticity in log semiring."); + po.Read(argc, argv); + + if (po.NumArgs() > 1) { + po.PrintUsage(); + exit(1); + } + + std::string fst_in_filename = po.GetOptArg(1); + + Fst *fst = ReadFstKaldiGeneric(fst_in_filename); + + bool ans; + StdArc::Weight min, max; + if (test_in_log) + ans = IsStochasticFstInLog(*fst, delta, &min, &max); + else + ans = IsStochasticFst(*fst, delta, &min, &max); + + std::cout << min.Value() << " " << max.Value() << '\n'; + delete fst; + if (ans) + return 0; // success; + else + return 1; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/speechx/tools/fstbin/fstminimizeencoded.cc b/speechx/tools/fstbin/fstminimizeencoded.cc new file mode 100644 index 00000000..ae9ca6d7 --- /dev/null +++ b/speechx/tools/fstbin/fstminimizeencoded.cc @@ -0,0 +1,74 @@ +// fstbin/fstminimizeencoded.cc + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "fst/fstlib.h" +#include "fstext/determinize-star.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" +#include "util/kaldi-io.h" +#include "util/parse-options.h" +#include "util/text-utils.h" + +/* some test examples: + ( echo "0 0 0 0"; echo "0 0" ) | fstcompile | fstminimizeencoded | fstprint + ( echo "0 1 0 0"; echo " 0 2 0 0"; echo "1 0"; echo "2 0"; ) | fstcompile | + fstminimizeencoded | fstprint +*/ + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; // NOLINT + using namespace fst; // NOLINT + using kaldi::int32; + + const char *usage = + "Minimizes FST after encoding [similar to fstminimize, but no " + "weight-pushing]\n" + "\n" + "Usage: fstminimizeencoded [in.fst [out.fst] ]\n"; + + float delta = kDelta; + ParseOptions po(usage); + po.Register("delta", &delta, + "Delta likelihood used for quantization of weights"); + po.Read(argc, argv); + + if (po.NumArgs() > 2) { + po.PrintUsage(); + exit(1); + } + + std::string fst_in_filename = po.GetOptArg(1), + fst_out_filename = po.GetOptArg(2); + + VectorFst *fst = ReadFstKaldi(fst_in_filename); + + MinimizeEncoded(fst, delta); + + WriteFstKaldi(*fst, fst_out_filename); + + delete fst; + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } + return 0; +} diff --git a/speechx/tools/fstbin/fsttablecompose.cc b/speechx/tools/fstbin/fsttablecompose.cc new file mode 100644 index 00000000..bdd476da --- /dev/null +++ b/speechx/tools/fstbin/fsttablecompose.cc @@ -0,0 +1,133 @@ +// fstbin/fsttablecompose.cc + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "fst/fstlib.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" +#include "fstext/table-matcher.h" +#include "util/parse-options.h" + +/* + cd ~/tmpdir + while true; do + fstrand | fstarcsort --sort_type=olabel > 1.fst; fstrand | fstarcsort + > 2.fst fstcompose 1.fst 2.fst > 3a.fst fsttablecompose 1.fst 2.fst > 3b.fst + fstequivalent --random=true 3a.fst 3b.fst || echo "Test failed" + echo -n "." + done + +*/ + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; // NOLINT + using namespace fst; // NOLINT + using kaldi::int32; + /* + fsttablecompose should always give equivalent results to compose, + but it is more efficient for certain kinds of inputs. + In particular, it is useful when, say, the left FST has states + that typically either have epsilon olabels, or + one transition out for each of the possible symbols (as the + olabel). The same with the input symbols of the right-hand FST + is possible. 
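+      In this patch, make_tlg.sh uses it as
+        fsttablecompose L.fst G.fst   (L.fst arc-sorted by olabel, G.fst by ilabel)
+      and again as fsttablecompose T.fst LG.fst when building TLG.fst.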
+ */ + + const char *usage = + "Composition algorithm [between two FSTs of standard type, in " + "tropical\n" + "semiring] that is more efficient for certain cases-- in particular,\n" + "where one of the FSTs (the left one, if --match-side=left) has large\n" + "out-degree\n" + "\n" + "Usage: fsttablecompose (fst1-rxfilename|fst1-rspecifier) " + "(fst2-rxfilename|fst2-rspecifier) [(out-rxfilename|out-rspecifier)]\n"; + + ParseOptions po(usage); + + TableComposeOptions opts; + std::string match_side = "left"; + std::string compose_filter = "sequence"; + + po.Register("connect", &opts.connect, "If true, trim FST before output."); + po.Register("match-side", &match_side, + "Side of composition to do table " + "match, one of: \"left\" or \"right\"."); + po.Register("compose-filter", &compose_filter, + "Composition filter to use, " + "one of: \"alt_sequence\", \"auto\", \"match\", \"sequence\""); + + po.Read(argc, argv); + + if (match_side == "left") { + opts.table_match_type = MATCH_OUTPUT; + } else if (match_side == "right") { + opts.table_match_type = MATCH_INPUT; + } else { + KALDI_ERR << "Invalid match-side option: " << match_side; + } + + if (compose_filter == "alt_sequence") { + opts.filter_type = ALT_SEQUENCE_FILTER; + } else if (compose_filter == "auto") { + opts.filter_type = AUTO_FILTER; + } else if (compose_filter == "match") { + opts.filter_type = MATCH_FILTER; + } else if (compose_filter == "sequence") { + opts.filter_type = SEQUENCE_FILTER; + } else { + KALDI_ERR << "Invalid compose-filter option: " << compose_filter; + } + + if (po.NumArgs() < 2 || po.NumArgs() > 3) { + po.PrintUsage(); + exit(1); + } + + std::string fst1_in_str = po.GetArg(1), fst2_in_str = po.GetArg(2), + fst_out_str = po.GetOptArg(3); + + VectorFst *fst1 = ReadFstKaldi(fst1_in_str); + + VectorFst *fst2 = ReadFstKaldi(fst2_in_str); + + // Checks if is olabel sorted and is ilabel sorted. + if (fst1->Properties(fst::kOLabelSorted, true) == 0) { + KALDI_WARN << "The first FST is not olabel sorted."; + } + if (fst2->Properties(fst::kILabelSorted, true) == 0) { + KALDI_WARN << "The second FST is not ilabel sorted."; + } + + VectorFst composed_fst; + + TableCompose(*fst1, *fst2, &composed_fst, opts); + + delete fst1; + delete fst2; + + WriteFstKaldi(composed_fst, fst_out_str); + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/speechx/tools/install_srilm.sh b/speechx/tools/install_srilm.sh new file mode 100755 index 00000000..813109db --- /dev/null +++ b/speechx/tools/install_srilm.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +current_path=`pwd` +current_dir=`basename "$current_path"` + +if [ "tools" != "$current_dir" ]; then + echo "You should run this script in tools/ directory!!" + exit 1 +fi + +if [ ! -d liblbfgs-1.10 ]; then + echo Installing libLBFGS library to support MaxEnt LMs + bash extras/install_liblbfgs.sh || exit 1 +fi + +! command -v gawk > /dev/null && \ + echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; + +if [ $# -ne 3 ]; then + echo "SRILM download requires some information about you" + echo + echo "Usage: $0 " + exit 1 +fi + +srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" +post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" + +if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then + echo 'There was a problem downloading the file.' + echo 'Check you internet connection and try again.' 
+  exit 1
+fi
+
+mkdir -p srilm
+cd srilm
+
+if [ -f ../srilm.tgz ]; then
+  tar -xvzf ../srilm.tgz  # Old SRILM format
+elif [ -f ../srilm.tar.gz ]; then
+  tar -xvzf ../srilm.tar.gz  # Changed format type from tgz to tar.gz
+fi
+
+major=`gawk -F. '{ print $1 }' RELEASE`
+minor=`gawk -F. '{ print $2 }' RELEASE`
+micro=`gawk -F. '{ print $3 }' RELEASE`
+
+if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then
+  echo "Detected version 1.7.1 or earlier. Applying patch."
+  patch -p0 < ../extras/srilm.patch
+fi
+
+# Set the SRILM variable in the top-level Makefile to this directory.
+cp Makefile tmpf
+
+cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
+  > Makefile || exit 1
+rm tmpf
+
+mtype=`sbin/machine-type`
+
+echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
+grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
+  sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
+  >> common/Makefile.machine.$mtype
+
+grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
+  sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
+  >> common/Makefile.machine.$mtype
+
+make || exit
+
+cd ..
+(
+  [ ! -z "${SRILM}" ] && \
+    echo >&2 "SRILM variable is already defined. Undefining..." && \
+    unset SRILM
+
+  [ -f ./env.sh ] && . ./env.sh
+
+  [ ! -z "${SRILM}" ] && \
+    echo >&2 "SRILM config is already in env.sh" && exit
+
+  wd=`pwd`
+  wd=`readlink -f $wd || pwd`
+
+  echo "export SRILM=$wd/srilm"
+  dirs="\${PATH}"
+  for directory in $(cd srilm && find bin -type d ) ; do
+    dirs="$dirs:\${SRILM}/$directory"
+  done
+  echo "export PATH=$dirs"
+) >> env.sh
+
+echo >&2 "Installation of SRILM finished successfully"
+echo >&2 "Please source tools/env.sh in your path.sh to enable it"
diff --git a/speechx/tools/lmbin/CMakeLists.txt b/speechx/tools/lmbin/CMakeLists.txt
new file mode 100644
index 00000000..277e2077
--- /dev/null
+++ b/speechx/tools/lmbin/CMakeLists.txt
@@ -0,0 +1,5 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_executable(arpa2fst ${CMAKE_CURRENT_SOURCE_DIR}/arpa2fst.cc)
+target_include_directories(arpa2fst PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(arpa2fst )
diff --git a/speechx/tools/lmbin/arpa2fst.cc b/speechx/tools/lmbin/arpa2fst.cc
new file mode 100644
index 00000000..881a45c5
--- /dev/null
+++ b/speechx/tools/lmbin/arpa2fst.cc
@@ -0,0 +1,145 @@
+// bin/arpa2fst.cc
+//
+// Copyright 2009-2011 Gilles Boulianne.
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABILITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "lm/arpa-lm-compiler.h"
+#include "util/kaldi-io.h"
+#include "util/parse-options.h"
+
+int main(int argc, char *argv[]) {
+    using namespace kaldi;  // NOLINT
+    try {
+        const char *usage =
+            "Convert an ARPA format language model into an FST\n"
+            "Usage: arpa2fst [opts] <input-arpa> <output-fst>\n"
+            " e.g.: arpa2fst --disambig-symbol=#0 --read-symbol-table="
+            "data/lang/words.txt lm/input.arpa G.fst\n\n"
+            "Note: When called without switches, the output G.fst will contain\n"
+            "an embedded symbol table. This is compatible with the way a previous\n"
+            "version of arpa2fst worked.\n";
+
+        ParseOptions po(usage);
+
+        ArpaParseOptions options;
+        options.Register(&po);
+
+        // Option flags.
+        std::string bos_symbol = "<s>";
+        std::string eos_symbol = "</s>";
+        std::string disambig_symbol;
+        std::string read_syms_filename;
+        std::string write_syms_filename;
+        bool keep_symbols = false;
+        bool ilabel_sort = true;
+
+        po.Register("bos-symbol", &bos_symbol, "Beginning of sentence symbol");
+        po.Register("eos-symbol", &eos_symbol, "End of sentence symbol");
+        po.Register("disambig-symbol", &disambig_symbol,
+                    "Disambiguator. If provided (e.g. #0), used on input side of "
+                    "backoff links, and <s> and </s> are replaced with epsilons");
+        po.Register("read-symbol-table", &read_syms_filename,
+                    "Use existing symbol table");
+        po.Register("write-symbol-table", &write_syms_filename,
+                    "Write generated symbol table to a file");
+        po.Register("keep-symbols", &keep_symbols,
+                    "Store symbol table with FST. Symbols always saved to FST if "
+                    "symbol tables are neither read nor written (otherwise symbols "
+                    "would be lost entirely)");
+        po.Register("ilabel-sort", &ilabel_sort, "Ilabel-sort the output FST");
+
+        po.Read(argc, argv);
+
+        if (po.NumArgs() != 1 && po.NumArgs() != 2) {
+            po.PrintUsage();
+            exit(1);
+        }
+        std::string arpa_rxfilename = po.GetArg(1),
+                    fst_wxfilename = po.GetOptArg(2);
+
+        int64 disambig_symbol_id = 0;
+
+        fst::SymbolTable *symbols;
+        if (!read_syms_filename.empty()) {
+            // Use existing symbols. Required symbols must be in the table.
+            kaldi::Input kisym(read_syms_filename);
+            symbols = fst::SymbolTable::ReadText(
+                kisym.Stream(), PrintableWxfilename(read_syms_filename));
+            if (symbols == NULL)
+                KALDI_ERR << "Could not read symbol table from file "
+                          << read_syms_filename;
+
+            options.oov_handling = ArpaParseOptions::kSkipNGram;
+            if (!disambig_symbol.empty()) {
+                disambig_symbol_id = symbols->Find(disambig_symbol);
+                if (disambig_symbol_id == -1)  // fst::kNoSymbol
+                    KALDI_ERR << "Symbol table " << read_syms_filename
+                              << " has no symbol for " << disambig_symbol;
+            }
+        } else {
+            // Create a new symbol table and populate it from ARPA file.
+            symbols = new fst::SymbolTable(PrintableWxfilename(fst_wxfilename));
+            options.oov_handling = ArpaParseOptions::kAddToSymbols;
+            symbols->AddSymbol("<eps>", 0);
+            if (!disambig_symbol.empty()) {
+                disambig_symbol_id = symbols->AddSymbol(disambig_symbol);
+            }
+        }
+
+        // Add or use existing BOS and EOS.
+        options.bos_symbol = symbols->AddSymbol(bos_symbol);
+        options.eos_symbol = symbols->AddSymbol(eos_symbol);
+
+        // If producing new (not reading existing) symbols and not saving them,
+        // need to keep symbols with FST, otherwise they would be lost.
+        if (read_syms_filename.empty() && write_syms_filename.empty())
+            keep_symbols = true;
+
+        // Actually compile LM.
+ KALDI_ASSERT(symbols != NULL); + ArpaLmCompiler lm_compiler(options, disambig_symbol_id, symbols); + { + Input ki(arpa_rxfilename); + lm_compiler.Read(ki.Stream()); + } + + // Sort the FST in-place if requested by options. + if (ilabel_sort) { + fst::ArcSort(lm_compiler.MutableFst(), fst::StdILabelCompare()); + } + + // Write symbols if requested. + if (!write_syms_filename.empty()) { + kaldi::Output kosym(write_syms_filename, false); + symbols->WriteText(kosym.Stream()); + } + + // Write LM FST. + bool write_binary = true, write_header = false; + kaldi::Output kofst(fst_wxfilename, write_binary, write_header); + fst::FstWriteOptions wopts(PrintableWxfilename(fst_wxfilename)); + wopts.write_isymbols = wopts.write_osymbols = keep_symbols; + lm_compiler.Fst().Write(kofst.Stream(), wopts); + + delete symbols; + } catch (const std::exception &e) { + std::cerr << e.what(); + return -1; + } +}
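For reference, a rough sketch of how the binaries added by this patch are
typically chained to build the TLG decoding graph. File names such as
lm.arpa, words.txt, L.fst and T.fst are placeholders, and fstarcsort comes
from stock OpenFst; the actual recipe used here is
speechx/examples/aishell/tools/fst/make_tlg.sh from this patch.

  # Compile the ARPA LM into G.fst, putting #0 on the input side of backoff arcs.
  arpa2fst --read-symbol-table=words.txt --disambig-symbol=#0 lm.arpa G.fst

  # Compose lexicon and grammar, then determinize and minimize the result.
  fsttablecompose L.fst G.fst | fstdeterminizestar --use-log=true | \
    fstminimizeencoded | fstarcsort --sort_type=ilabel > LG.fst

  # Compose the CTC token FST on top to obtain the final graph.
  fsttablecompose T.fst LG.fst > TLG.fst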