diff --git a/speechx/examples/aishell/tools/fst/add_lex_disambig.pl b/speechx/examples/aishell/tools/fst/add_lex_disambig.pl deleted file mode 100755 index dd8a25de..00000000 --- a/speechx/examples/aishell/tools/fst/add_lex_disambig.pl +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2013-2016 Johns Hopkins University (author: Daniel Povey) -# 2015 Hainan Xu -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Adds disambiguation symbols to a lexicon. -# Outputs still in the normal lexicon format. -# Disambig syms are numbered #1, #2, #3, etc. (#0 -# reserved for symbol in grammar). -# Outputs the number of disambig syms to the standard output. -# With the --pron-probs option, expects the second field -# of each lexicon line to be a pron-prob. -# With the --sil-probs option, expects three additional -# fields after the pron-prob, representing various components -# of the silence probability model. - -$pron_probs = 0; -$sil_probs = 0; -$first_allowed_disambig = 1; - -for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { - if ($ARGV[0] eq "--pron-probs") { - $pron_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--sil-probs") { - $sil_probs = 1; - shift @ARGV; - } - if ($ARGV[0] eq "--first-allowed-disambig") { - $first_allowed_disambig = 0 + $ARGV[1]; - if ($first_allowed_disambig < 1) { - die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; - } - shift @ARGV; - shift @ARGV; - } -} - -if (@ARGV != 2) { - die "Usage: add_lex_disambig.pl [opts] \n" . - "This script adds disambiguation symbols to a lexicon in order to\n" . - "make decoding graphs determinizable; it adds pseudo-phone\n" . - "disambiguation symbols #1, #2 and so on at the ends of phones\n" . - "to ensure that all pronunciations are different, and that none\n" . - "is a prefix of another.\n" . - "It prints to the standard output the number of the largest-numbered" . - "disambiguation symbol that was used.\n" . - "\n" . - "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . - " --sil-probs [should be with --pron-probs option]\n" . - " Expect 3 extra fields after the pron-probs, for aspects of\n" . - " the silence probability model\n" . - " --first-allowed-disambig The number of the first disambiguation symbol\n" . - " that this script is allowed to add. By default this is\n" . - " #1, but you can set this to a larger value using this option.\n" . - "e.g.:\n" . - " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . - " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; -} - - -$lexfn = shift @ARGV; -$lexoutfn = shift @ARGV; - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - -# (1) Read in the lexicon. -@L = ( ); -while() { - @A = split(" ", $_); - push @L, join(" ", @A); -} - -# (2) Work out the count of each phone-sequence in the -# lexicon. - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { - $p = shift @A; - if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } - } - if ($sil_probs) { - $silp = shift @A; - if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - $correction = shift @A; - if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } - } - if (!(@A)) { - die "Bad lexicon line $1, no phone in phone list"; - } - $count{join(" ",@A)}++; -} - -# (3) For each left sub-sequence of each phone-sequence, note down -# that it exists (for identifying prefixes of longer strings). - -foreach $l (@L) { - @A = split(" ", $l); - shift @A; # Remove word. - if ($pron_probs) { shift @A; } # remove pron-prob. - if ($sil_probs) { - shift @A; # Remove silprob - shift @A; # Remove silprob - } - while(@A > 0) { - pop @A; # Remove last phone - $issubseq{join(" ",@A)} = 1; - } -} - -# (4) For each entry in the lexicon: -# if the phone sequence is unique and is not a -# prefix of another word, no diambig symbol. -# Else output #1, or #2, #3, ... if the same phone-seq -# has already been assigned a disambig symbol. - - -open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; - -# max_disambig will always be the highest-numbered disambiguation symbol that -# has been used so far. -$max_disambig = $first_allowed_disambig - 1; - -foreach $l (@L) { - @A = split(" ", $l); - $word = shift @A; - if ($pron_probs) { - $pron_prob = shift @A; - } - if ($sil_probs) { - $sil_word_prob = shift @A; - $word_sil_correction = shift @A; - $prev_nonsil_correction = shift @A - } - $phnseq = join(" ", @A); - if (!defined $issubseq{$phnseq} - && $count{$phnseq} == 1) { - ; # Do nothing. - } else { - if ($phnseq eq "") { # need disambig symbols for the empty string - # that are not use anywhere else. - $max_disambig++; - $reserved_for_the_empty_string{$max_disambig} = 1; - $phnseq = "#$max_disambig"; - } else { - $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; - if (!defined $cur_disambig) { - $cur_disambig = $first_allowed_disambig; - } else { - $cur_disambig++; # Get a number that has not been used yet for - # this phone sequence. - } - while (defined $reserved_for_the_empty_string{$cur_disambig}) { - $cur_disambig++; - } - if ($cur_disambig > $max_disambig) { - $max_disambig = $cur_disambig; - } - $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; - $phnseq = $phnseq . " #" . $cur_disambig; - } - } - if ($pron_probs) { - if ($sil_probs) { - print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; - } else { - print O "$word\t$pron_prob\t$phnseq\n"; - } - } else { - print O "$word\t$phnseq\n"; - } -} - -print $max_disambig . "\n"; diff --git a/speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh b/speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh deleted file mode 100755 index fda971fe..00000000 --- a/speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/bash -# Copyright 2015 Yajie Miao (Carnegie Mellon University) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the -# phoneme and character-based lexicons. -set -eo pipefail -. tools/parse_options.sh - -if [ $# -ne 3 ]; then - echo "usage: tools/fst/compile_lexicon_token_fst.sh " - echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" - echo " should contain the following files:" - echo "lexicon.txt lexicon_numbers.txt units.txt" - echo "options: " - exit 1; -fi - -srcdir=$1 -tmpdir=$2 -dir=$3 -mkdir -p $dir $tmpdir - -[ -f path.sh ] && . ./path.sh - -cp $srcdir/units.txt $dir - -# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. -# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. -perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; - -# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. -# Without these symbols, determinization will fail. -ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` -ndisambig=$[$ndisambig+1]; - -( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list - -# Get the full list of CTC tokens used in FST. These tokens include , the blank , -# the actual model unit, and the disambiguation symbols. -cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list -(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt - -# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, -# so here just use simple ctc_token_fst -tools/fst/ctc_token_fst.py $dir/tokens.txt | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; - -# Encode the words with indices. Will be used in lexicon and language model FST compiling. -cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk ' - BEGIN { - print " 0"; - } - { - printf("%s %d\n", $1, NR); - } - END { - printf("#0 %d\n", NR+1); - printf(" %d\n", NR+2); - printf(" %d\n", NR+3); - }' > $dir/words.txt || exit 1; - -# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. -token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` - -tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ - fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; - -echo "Lexicon and token FSTs compiling succeeded" diff --git a/speechx/examples/aishell/tools/fst/ctc_token_fst.py b/speechx/examples/aishell/tools/fst/ctc_token_fst.py deleted file mode 100755 index 048734c8..00000000 --- a/speechx/examples/aishell/tools/fst/ctc_token_fst.py +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 1 ') -print('1 1 ') -print('2 2 ') -print('2 0 ') - -with open(sys.argv[1], 'r') as fin: - node = 3 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(1, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 2, '', '')) - node += 1 -print('0') diff --git a/speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py b/speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py deleted file mode 100755 index d3018d8b..00000000 --- a/speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - node = 1 - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, node, phone, phone)) - print('{} {} {} {}'.format(node, node, phone, '')) - print('{} {} {} {}'.format(node, 0, '', '')) - node += 1 -print('0') diff --git a/speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py b/speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py deleted file mode 100755 index 81f7079e..00000000 --- a/speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python - -import sys - - -def il(n): - return n + 1 - - -def ol(n): - return n + 1 - - -def s(n): - return n - - -if __name__ == "__main__": - with open(sys.argv[1]) as f: - lines = f.readlines() - phone_count = 0 - disambig_count = 0 - for line in lines: - sp = line.split() - phone = sp[0] - if phone == '' or phone == '': - continue - if phone.startswith('#'): - disambig_count += 1 - else: - phone_count += 1 - - # 1. add start state - print('0 0 {} 0'.format(il(0))) - - # 2. 0 -> i, i -> i, i -> 0 - for i in range(1, phone_count + 1): - print('0 {} {} {}'.format(s(i), il(i), ol(i))) - print('{} {} {} 0'.format(s(i), s(i), il(i))) - print('{} 0 {} 0'.format(s(i), il(0))) - - # 3. i -> other phone - for i in range(1, phone_count + 1): - for j in range(1, phone_count + 1): - if i != j: - print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) - - # 4. add disambiguous arcs on every final state - for i in range(0, phone_count + 1): - for j in range(phone_count + 2, phone_count + disambig_count + 2): - print('{} {} {} {}'.format(s(i), s(i), 0, j)) - - # 5. every i is final state - for i in range(0, phone_count + 1): - print(s(i)) diff --git a/speechx/examples/aishell/tools/fst/eps2disambig.pl b/speechx/examples/aishell/tools/fst/eps2disambig.pl deleted file mode 100755 index e1d84a6b..00000000 --- a/speechx/examples/aishell/tools/fst/eps2disambig.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation -# 2015 Guoguo Chen - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces epsilon with #0 on the input side only, of the G.fst -# acceptor. - -while(<>){ - if (/\s+#0\s+/) { - print STDERR "$0: ERROR: LM has word #0, " . - "which is reserved as disambiguation symbol\n"; - exit 1; - } - s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; - print; -} diff --git a/speechx/examples/aishell/tools/fst/make_lexicon_fst.pl b/speechx/examples/aishell/tools/fst/make_lexicon_fst.pl deleted file mode 100755 index f97129c0..00000000 --- a/speechx/examples/aishell/tools/fst/make_lexicon_fst.pl +++ /dev/null @@ -1,155 +0,0 @@ -#!/usr/bin/env perl -use warnings; #sed replacement for -w perl parameter -# Copyright 2010-2011 Microsoft Corporation -# 2013 Johns Hopkins University (author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). - -$pron_probs = 0; - -if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { - $pron_probs = 1; - shift @ARGV; -} - -if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { - print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; - print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; - print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; - print STDERR " word phone1 phone2 ... phoneN;\n"; - print STDERR "if the --pron-probs option is used, each line is:\n"; - print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n"; - print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; - print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; - print STDERR "this is your responsibility.\n\n"; - print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; - print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; - print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; - exit(1); -} - -$lexfn = shift @ARGV; -if (@ARGV == 0) { - $silprob = 0.0; -} elsif (@ARGV == 2) { - ($silprob,$silphone) = @ARGV; -} else { - ($silprob,$silphone,$sildisambig) = @ARGV; -} -if ($silprob != 0.0) { - $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; - $silcost = -log($silprob); - $nosilcost = -log(1.0 - $silprob); -} - - -open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; - - -if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. - $loopstate = 0; - $nextstate = 1; # next unallocated state. - while () { - @A = split(" ", $_); - @A == 0 && die "Empty lexicon line."; - foreach $a (@A) { - if ($a eq "") { - die "Bad lexicon line $_ ( is forbidden)"; - } - } - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - } else { - $ns = $loopstate; - } - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; # so we only print it on the first arc of the word. - $s = $ns; - } - } - print "$loopstate\t0\n"; # final-cost. -} else { # have silence probs. - $startstate = 0; - $loopstate = 1; - $silstate = 2; # state from where we go to loopstate after emitting silence. - print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. - if (!defined $sildisambig) { - print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. - $nextstate = 3; - } else { - $disambigstate = 3; - $nextstate = 4; - print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. - print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. - print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. - } - while () { - @A = split(" ", $_); - $w = shift @A; - if (! $pron_probs) { - $pron_cost = 0.0; - } else { - $pron_prob = shift @A; - if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { - die "Bad pronunciation probability in line $_"; - } - $pron_cost = -log($pron_prob); - } - if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } - $s = $loopstate; - $word_or_eps = $w; - while (@A > 0) { - $p = shift @A; - if (@A > 0) { - $ns = $nextstate++; - print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; - $word_or_eps = ""; - $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. - $s = $ns; - } elsif (!defined($silphone) || $p ne $silphone) { - # This is non-deterministic but relatively compact, - # and avoids epsilons. - $local_nosilcost = $nosilcost + $pron_cost; - $local_silcost = $silcost + $pron_cost; - print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; - print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; - } else { - # no point putting opt-sil after silence word. - print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; - } - } - } - print "$loopstate\t0\n"; # final-cost. -} diff --git a/speechx/examples/aishell/tools/fst/make_tlg.sh b/speechx/examples/aishell/tools/fst/make_tlg.sh deleted file mode 100755 index 98694e55..00000000 --- a/speechx/examples/aishell/tools/fst/make_tlg.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# - -if [ -f path.sh ]; then . path.sh; fi - -lm_dir=$1 -src_lang=$2 -tgt_lang=$3 - -arpa_lm=${lm_dir}/lm.arpa -[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; - -rm -rf $tgt_lang -cp -r $src_lang $tgt_lang - -# Compose the language model to FST -cat $arpa_lm | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v ' ' | \ - grep -v -i '' | \ - grep -v -i '' | \ - arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \ - tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \ - --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \ - fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst - - -echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic $tgt_lang/G.fst - -# Compose the token, lexicon and language-model FST into the final decoding graph -fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \ - fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1; -fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1; - -echo "Composing decoding graph TLG.fst succeeded" -#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST diff --git a/speechx/examples/aishell/tools/fst/prepare_dict.py b/speechx/examples/aishell/tools/fst/prepare_dict.py deleted file mode 100755 index b6b92ea1..00000000 --- a/speechx/examples/aishell/tools/fst/prepare_dict.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 - -import sys - -# sys.argv[1]: e2e model unit file(lang_char.txt) -# sys.argv[2]: raw lexicon file -# sys.argv[3]: output lexicon file -# sys.argv[4]: bpemodel - -unit_table = set() -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for line in fin: - unit = line.split()[0] - unit_table.add(unit) - - -def contain_oov(units): - for unit in units: - if unit not in unit_table: - return True - return False - - -bpemode = len(sys.argv) > 4 -if bpemode: - import sentencepiece as spm - sp = spm.SentencePieceProcessor() - sp.Load(sys.argv[4]) -lexicon_table = set() -with open(sys.argv[2], 'r', encoding='utf8') as fin, \ - open(sys.argv[3], 'w', encoding='utf8') as fout: - for line in fin: - word = line.split()[0] - if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel - continue - elif word == '': - continue - else: - # each word only has one pronunciation for e2e system - if word in lexicon_table: - continue - if bpemode: - pieces = sp.EncodeAsPieces(word) - if contain_oov(pieces): - print( - 'Ignoring words {}, which contains oov unit'.format( - ''.join(word).strip('▁')) - ) - continue - chars = ' '.join( - [p if p in unit_table else '' for p in pieces]) - else: - # ignore words with OOV - if contain_oov(word): - print('Ignoring words {}, which contains oov unit'.format(word)) - continue - # Optional, append ▁ in front of english word - # we assume the model unit of our e2e system is char now. - if word.encode('utf8').isalpha() and '▁' in unit_table: - word = '▁' + word - chars = ' '.join(word) # word is a char list - fout.write('{} {}\n'.format(word, chars)) - lexicon_table.add(word) diff --git a/speechx/examples/aishell/tools/fst/remove_oovs.pl b/speechx/examples/aishell/tools/fst/remove_oovs.pl deleted file mode 100755 index ac914c3b..00000000 --- a/speechx/examples/aishell/tools/fst/remove_oovs.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script removes lines that contain these OOVs on either the -# third or fourth fields of the line. It is intended to remove arcs -# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). - -if ( @ARGV < 1 && @ARGV > 2) { - die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; -} - -$unklist = shift @ARGV; -open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; -while(){ - @A = split(" ", $_); - @A == 1 || die "Bad line in unknown-symbol list: $_"; - $unk{$A[0]} = 1; -} - -$num_removed = 0; -while(<>){ - @A = split(" ", $_); - if(defined $unk{$A[2]} || defined $unk{$A[3]}) { - $num_removed++; - } else { - print; - } -} -print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; - diff --git a/speechx/examples/aishell/tools/fst/rnnt_token_fst.py b/speechx/examples/aishell/tools/fst/rnnt_token_fst.py deleted file mode 100755 index cc6def17..00000000 --- a/speechx/examples/aishell/tools/fst/rnnt_token_fst.py +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env python - -import sys - -print('0 0 ') - -with open(sys.argv[1], 'r', encoding='utf8') as fin: - for entry in fin: - fields = entry.strip().split(' ') - phone = fields[0] - if phone == '' or phone == '': - continue - elif '#' in phone: # disambiguous phone - print('{} {} {} {}'.format(0, 0, '', phone)) - else: - print('{} {} {} {}'.format(0, 0, phone, phone)) -print('0') diff --git a/speechx/examples/aishell/tools/fst/s2eps.pl b/speechx/examples/aishell/tools/fst/s2eps.pl deleted file mode 100755 index ffeeb8eb..00000000 --- a/speechx/examples/aishell/tools/fst/s2eps.pl +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env perl -# Copyright 2010-2011 Microsoft Corporation - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -# This script replaces and with (on both input and output sides), -# for the G.fst acceptor. - -while(<>){ - @A = split(" ", $_); - if ( @A >= 4 ) { - if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } - if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } - } - print join("\t", @A) . "\n"; -} diff --git a/speechx/examples/aishell/tools/parse_options.sh b/speechx/examples/aishell/tools/parse_options.sh deleted file mode 100755 index 34476fdb..00000000 --- a/speechx/examples/aishell/tools/parse_options.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); -# Arnab Ghoshal, Karel Vesely - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - - -# Parse command-line options. -# To be sourced by another script (as in ". parse_options.sh"). -# Option format is: --option-name arg -# and shell variable "option_name" gets set to value "arg." -# The exception is --help, which takes no arguments, but prints the -# $help_message variable (if defined). - - -### -### The --config file options have lower priority to command line -### options, so we need to import them first... -### - -# Now import all the configs specified by command-line, in left-to-right order -for ((argpos=1; argpos<$#; argpos++)); do - if [ "${!argpos}" == "--config" ]; then - argpos_plus1=$((argpos+1)) - config=${!argpos_plus1} - [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 - . $config # source the config file. - fi -done - - -### -### No we process the command line options -### -while true; do - [ -z "${1:-}" ] && break; # break if there are no arguments - case "$1" in - # If the enclosing script is called with --help option, print the help - # message and exit. Scripts should put help messages in $help_message - --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; - else printf "$help_message\n" 1>&2 ; fi; - exit 0 ;; - --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" - exit 1 ;; - # If the first command-line argument begins with "--" (e.g. --foo-bar), - # then work out the variable name as $name, which will equal "foo_bar". - --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; - # Next we test whether the variable in question is undefned-- if so it's - # an invalid option and we die. Note: $0 evaluates to the name of the - # enclosing script. - # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar - # is undefined. We then have to wrap this test inside "eval" because - # foo_bar is itself inside a variable ($name). - eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; - - oldval="`eval echo \\$$name`"; - # Work out whether we seem to be expecting a Boolean argument. - if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then - was_bool=true; - else - was_bool=false; - fi - - # Set the variable to the right value-- the escaped quotes make it work if - # the option had spaces, like --cmd "queue.pl -sync y" - eval $name=\"$2\"; - - # Check that Boolean-valued arguments are really Boolean. - if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then - echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 - exit 1; - fi - shift 2; - ;; - *) break; - esac -done - - -# Check for an empty argument to the --cmd option, which can easily occur as a -# result of scripting errors. -[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; - - -true; # so this script returns exit code 0.