add TLG utils

4 years ago · f5369abdbe
parent 8ad915a907
commit f5369abdbe
13 changed files with 885 additions and 5 deletions
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@ -38,7 +38,43 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
+#     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# fi
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # 7.1 Prepare dict
     # units.txt is a copy of the model vocabulary; lexicon.txt maps every
     # word of the raw lexicon to the model units that spell it.
     unit_file=data/vocab.txt
     mkdir -p data/local/dict
     cp $unit_file data/local/dict/units.txt
     # prepare_dict.py only declares named, required options, so the three
     # paths have to be passed by flag rather than positionally.
     utils/fst/prepare_dict.py \
         --unit_file $unit_file \
         --in_lexicon ${data}/resource_aishell/lexicon.txt \
         --out_lexicon data/local/dict/lexicon.txt
     # 7.2 Train lm
     lm=data/local/lm
     mkdir -p $lm
     utils/filter_scp.pl data/train/text \
          $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
     local/aishell_train_lms.sh
     # 7.3 Build decoding TLG
     utils/fst/compile_lexicon_token_fst.sh \
         data/local/dict data/local/tmp data/local/lang
     utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
    #  # 7.4 Decoding with runtime
    #  # reverse_weight only works for u2++ model and only left to right decoder is used when it is set to 0.0.
    #  dir=exp/conformer
    #  reverse_weight=0.0
    #  chunk_size=-1
    #  ./tools/decode.sh --nj 16 \
    #      --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
    #      --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
    #      --reverse_weight $reverse_weight --chunk_size $chunk_size \
    #      --fst_path data/lang_test/TLG.fst \
    #      data/test/wav.scp data/test/text $dir/final.zip \
    #      data/lang_test/words.txt $dir/lm_with_runtime
    #  # See $dir/lm_with_runtime for wer
 fi
--- a/examples/dataset/aishell/.gitignore
+++ b/examples/dataset/aishell/.gitignore
@ -1,4 +1,5 @@
 data_aishell*
 *.meta
 manifest.*
-*.tgz
+*.tgz
 resource_aishell
--- a/utils/fst/add_lex_disambig.pl
+++ b/utils/fst/add_lex_disambig.pl
@ -0,0 +1,195 @@
 #!/usr/bin/env perl
 #  Copyright 2010-2011  Microsoft Corporation
 #            2013-2016  Johns Hopkins University (author: Daniel Povey)
 #                 2015  Hainan Xu
 #                 2015  Guoguo Chen
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # Adds disambiguation symbols to a lexicon.
 # Outputs still in the normal lexicon format.
 # Disambig syms are numbered #1, #2, #3, etc. (#0
 # reserved for symbol in grammar).
 # Outputs the number of disambig syms to the standard output.
 # With the --pron-probs option, expects the second field
 # of each lexicon line to be a pron-prob.
 # With the --sil-probs option, expects three additional
 # fields after the pron-prob, representing various components
 # of the silence probability model.
 # ---- Command-line handling ------------------------------------------------
 # $pron_probs              1 if the 2nd lexicon field is a pron-prob.
 # $sil_probs               1 if three silence-model fields follow the pron-prob.
 # $first_allowed_disambig  lowest disambiguation number this script may emit.
 $pron_probs = 0;
 $sil_probs = 0;
 $first_allowed_disambig = 1;
 # There are three distinct options, so three passes are enough to consume
 # them whatever order they were given in.  Each test re-checks that an
 # argument is still left, because an earlier test in the same pass may have
 # shifted the last one away.
 for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if (@ARGV > 0 && $ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if (@ARGV > 0 && $ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if (@ARGV > 0 && $ARGV[0] eq "--first-allowed-disambig") {
    # "0 +" forces numeric context; a missing or non-numeric value becomes 0
    # and is rejected by the range check below.
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
 }
 # Exactly two positional arguments must remain: input and output lexicon.
 if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
    "This script adds disambiguation symbols to a lexicon in order to\n" .
    "make decoding graphs determinizable; it adds pseudo-phone\n" .
    "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
    "to ensure that all pronunciations are different, and that none\n" .
    "is a prefix of another.\n" .
    "It prints to the standard output the number of the largest-numbered\n" .
    "disambiguation symbol that was used.\n" .
    "\n" .
    "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
    "           --sil-probs        [should be with --pron-probs option]\n" .
    "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
    "                              the silence probability model\n" .
    "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
    "                              that this script is allowed to add.  By default this is\n" .
    "                              #1, but you can set this to a larger value using this option.\n" .
    "e.g.:\n" .
    " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
    " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
 }
 $lexfn = shift @ARGV;
 $lexoutfn = shift @ARGV;
 # Three-argument open: the file name is taken literally, so a name that
 # starts with a mode character or has surrounding whitespace cannot change
 # how the file is opened.
 open(L, "<", $lexfn) || die "Error opening lexicon $lexfn";
 # (1)  Read in the lexicon.
 # Every line is re-joined with single spaces so that the later passes can
 # split it again without caring about the original whitespace.
 @L = ( );
 while(<L>) {
     @A = split(" ", $_);
     push @L, join(" ", @A);
 }
 close(L);
 # (2) Work out the count of each phone-sequence in the
 # lexicon.  The result is %count, keyed by the space-joined phone sequence.
 # Optional leading fields (pron-prob, silence-model fields) are validated
 # and stripped first so that only the phones contribute to the key.
 foreach $l (@L) {
    @A = split(" ", $l);
    shift @A; # Remove word.
    if ($pron_probs) {
      $p = shift @A;
      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
    }
    if ($sil_probs) {
      # Three fields: a silence probability followed by two correction terms.
      $silp = shift @A;
      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
      $correction = shift @A;
      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    }
    if (!(@A)) {
      # Report the offending line itself ($l); $1 would be a regex capture
      # that is never set in this script.
      die "Bad lexicon line $l, no phone in phone list";
    }
    $count{join(" ",@A)}++;
 }
 # (3) For each left sub-sequence of each phone-sequence, note down
 # that it exists (for identifying prefixes of longer strings).
 # The result is %issubseq, keyed by the space-joined proper prefix (the empty
 # prefix "" included).
 foreach $l (@L) {
    @A = split(" ", $l);
    shift @A; # Remove word.
    if ($pron_probs) { shift @A; } # remove pron-prob.
    if ($sil_probs) {
      # The silence model contributes THREE fields (see the validation pass
      # and the output pass); all of them must go, otherwise the last one
      # would be treated as the first phone and no prefix would ever match.
      shift @A; # Remove silprob
      shift @A; # Remove first correction term
      shift @A; # Remove second correction term
    }
    while(@A > 0) {
        pop @A;  # Remove last phone
        $issubseq{join(" ",@A)} = 1;
    }
 }
 # (4) Write the output lexicon.
 #  An entry whose phone sequence occurs exactly once and is not a proper
 #  prefix of another entry is copied unchanged.  Every other entry gets a
 #  disambiguation symbol appended: the first free number for that phone
 #  sequence, counting up from $first_allowed_disambig.  An entry with an
 #  empty phone sequence gets a symbol of its own that no other entry may use.
 open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
 # Highest disambiguation number handed out so far; printed at the very end.
 $max_disambig = $first_allowed_disambig - 1;
 foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  $pron_prob = shift @A if ($pron_probs);
  if ($sil_probs) {
    # Take the three silence-model fields off the front in one go.
    ($sil_word_prob, $word_sil_correction, $prev_nonsil_correction) = splice(@A, 0, 3);
  }
  $phnseq = join(" ", @A);
  if (defined $issubseq{$phnseq} || $count{$phnseq} != 1) {
    if ($phnseq ne "") {
      # Next number for this phone sequence: one past the last one it used,
      # or the first allowed number if it has not used any yet.
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      $cur_disambig = defined $cur_disambig ? $cur_disambig + 1 : $first_allowed_disambig;
      # Step over numbers that belong to empty-pronunciation entries.
      $cur_disambig++ while (defined $reserved_for_the_empty_string{$cur_disambig});
      $max_disambig = $cur_disambig if ($cur_disambig > $max_disambig);
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq .= " #" . $cur_disambig;
    } else {
      # Empty pronunciation: allocate a brand-new number and reserve it.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    }
  }
  # Output columns are tab-separated; the silence fields are only written
  # when pron-probs are present as well.
  @fields = ($word);
  if ($pron_probs) {
    push @fields, $pron_prob;
    push @fields, $sil_word_prob, $word_sil_correction, $prev_nonsil_correction if ($sil_probs);
  }
  print O join("\t", @fields, $phnseq) . "\n";
 }
 print $max_disambig . "\n";
--- a/utils/fst/compile_lexicon_token_fst.sh
+++ b/utils/fst/compile_lexicon_token_fst.sh
@ -0,0 +1,88 @@
 #!/bin/bash
 # Copyright 2015       Yajie Miao    (Carnegie Mellon University)
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
 # phoneme and character-based lexicons.
 # Abort on the first failing command, including failures inside a pipeline.
 set -eo pipefail
 . utils/parse_options.sh
 if [ $# -ne 3 ]; then
  echo "usage: utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt lexicon_numbers.txt units.txt"
  echo "options: "
  exit 1;
 fi
 # srcdir: dictionary directory (read: lexicon.txt, units.txt)
 # tmpdir: scratch directory for intermediate lexicon/symbol lists
 # dir:    output directory (written: units.txt, tokens.txt, words.txt, T.fst, L.fst)
 srcdir=$1
 tmpdir=$2
 dir=$3
 mkdir -p $dir $tmpdir
 [ -f path.sh ] && . ./path.sh
 cp $srcdir/units.txt $dir
 # Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
 # But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
 perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;
 # Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
 # Without these symbols, determinization will fail.
 # default first disambiguation is #1
 ndisambig=`utils/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
 # Reserve one more symbol beyond those used by the lexicon; it is passed to
 # make_lexicon_fst.pl below as the silence disambiguation symbol.
 # (#0 is listed as well; it is reserved for the grammar.)
 ndisambig=$((ndisambig + 1))
 ( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
 # Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blank>,
 # the actual model unit, and the disambiguation symbols.
 cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
 (echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
 # ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
 # so here just use simple ctc_token_fst
 # The token table is a required named option of ctc_token_fst.py.
 utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
 # Encode the words with indices. Will be used in lexicon and language model FST compiling.
 cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;
 # Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
 # Integer ids of #0 on the token side and on the word side, used to add the
 # matching self-loops to L.fst.
 token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
 word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
 utils/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
  --keep_isymbols=false --keep_osymbols=false |   \
  fstaddselfloops  "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
 echo "Lexicon and Token FSTs compiling succeeded"
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
@ -0,0 +1,51 @@
 #!/usr/bin/env python3
 import argparse
 import sys
 def main(args):
    """Print the CTC token transducer (T) in OpenFst text format to stdout.

    Args:
        args: parsed command-line namespace; only ``args.token_file`` is
            read.  Each line of that file starts with a token symbol; any
            further space-separated fields on the line are ignored.

    State 0 is both the start and the only final state.  State 1 absorbs
    leading blanks, state 2 absorbs trailing blanks, and every ordinary
    token gets a state of its own between the two.
    """
    # <eps> entry
    print('0 1 <eps> <eps>')
    # skip beginning and ending <blank>
    print('1 1 <blank> <eps>')
    print('2 2 <blank> <eps>')
    # <eps> exit
    print('2 0 <eps> <eps>')
    # linking `token` between node 1 and node 2
    with open(args.token_file, 'r', encoding='utf8') as fin:
        node = 3
        for entry in fin:
            fields = entry.strip().split(' ')
            phone = fields[0]
            if phone == '<eps>' or phone == '<blank>':
                continue
            elif '#' in phone:
                # disambiguation symbol: emitted from state 0 without
                # consuming any input
                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
            else:
                # eating `token`
                print('{} {} {} {}'.format(1, node, phone, phone))
                # remove repeating `token`
                print('{} {} {} {}'.format(node, node, phone, '<eps>'))
                # leaving `token`
                print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
            # The counter advances for disambiguation symbols too, so state
            # numbers follow the line position in the token file.
            node += 1
    # Final node
    print('0')
 if __name__ == '__main__':
    # Command-line entry point: parse --token_file and print the FST text.
    parser = argparse.ArgumentParser(
        description='FST: CTC Token FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
    # stdout carries the FST text that callers pipe into fstcompile, so the
    # argument dump has to go to stderr.
    print(args, file=sys.stderr)
    main(args)
--- a/utils/fst/ctc_token_fst_corrected.py
+++ b/utils/fst/ctc_token_fst_corrected.py
@ -0,0 +1,80 @@
 #!/usr/bin/env python3
 import argparse
 def il(n):
    """Return the FST input label for token index ``n``.

    The label is the index shifted up by one (presumably because label 0
    is reserved for <eps> -- confirm against the token table).
    """
    shifted = n + 1
    return shifted
 def ol(n):
    """Return the FST output label for token index ``n``.

    Uses the same shift-by-one mapping as the input labels.
    """
    shifted = n + 1
    return shifted
 def s(n):
    """Return the FST state number for token index ``n`` (identity map)."""
    state = n
    return state
 def main(args):
    """Print the unfolded CTC token FST as numeric OpenFst text to stdout.

    Args:
        args: parsed command-line namespace; only ``args.token_file`` is
            read.  The first whitespace-separated field of each line is the
            token symbol.

    State 0 stands for <blank>; states 1..phone_count stand for the
    ordinary tokens.  Every state is final.
    """
    with open(args.token_file) as token_fh:
        entries = token_fh.readlines()

    # Count ordinary tokens and disambiguation symbols; <eps> and <blank>
    # belong to neither group.
    phone_count = 0
    disambig_count = 0
    for entry in entries:
        symbol = entry.strip().split()[0]
        if symbol in ('<eps>', '<blank>'):
            continue
        if symbol.startswith('#'):
            disambig_count += 1
        else:
            phone_count += 1

    phone_states = range(1, phone_count + 1)
    all_states = range(0, phone_count + 1)
    first_disambig = phone_count + 2
    disambig_labels = range(first_disambig, first_disambig + disambig_count)

    # 1. start state: a <blank> self-loop (token index 0) that emits nothing
    print(f'0 0 {il(0)} 0')

    # 2. per token: enter from state 0, absorb repeats, return on <blank>
    for i in phone_states:
        # eating `token`
        print(f'0 {s(i)} {il(i)} {ol(i)}')
        # remove repeating `token`
        print(f'{s(i)} {s(i)} {il(i)} 0')
        # skip ending <blank> `token`
        print(f'{s(i)} 0 {il(0)} 0')

    # 3. direct arcs between every ordered pair of distinct tokens
    for i in phone_states:
        for j in phone_states:
            if i == j:
                continue
            print(f'{s(i)} {s(j)} {il(j)} {ol(j)}')

    # 4. disambiguation self-loops (input label 0) on every state
    for i in all_states:
        for j in disambig_labels:
            print(f'{s(i)} {s(i)} 0 {j}')

    # 5. every state, blank or token, is final
    for i in all_states:
        print(s(i))
 if __name__ == "__main__":
    # Command-line entry point: parse --token_file and print the FST text.
    import sys

    parser = argparse.ArgumentParser(
        description='FST: CTC Token unfold FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
    # main() writes the FST text to stdout; keep the argument dump out of
    # that stream so the output can be fed to an FST compiler unchanged.
    print(args, file=sys.stderr)
    main(args)
--- a/utils/fst/eps2disambig.pl
+++ b/utils/fst/eps2disambig.pl
@ -0,0 +1,29 @@
 #!/usr/bin/env perl
 # Copyright 2010-2011 Microsoft Corporation
 #                2015 Guoguo Chen
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # This script replaces epsilon with #0 on the input side only, of the G.fst
 # acceptor.
 # Filter: read printed FST arcs from the files named on the command line (or
 # standard input) and write them back to standard output.
 while (my $arc = <>) {
  # "#0" surrounded by whitespace means the LM itself uses the symbol we are
  # about to introduce; refuse to continue.
  if ($arc =~ /\s+#0\s+/) {
    print STDERR "$0: ERROR: LM has word #0, " .
                 "which is reserved as disambiguation symbol\n";
    exit 1;
  }
  # Only the first <eps> after the two leading state numbers (the input
  # label) is rewritten; the output side is left alone.
  $arc =~ s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
  print $arc;
 }
--- a/utils/fst/make_lexicon_fst.pl
+++ b/utils/fst/make_lexicon_fst.pl
@ -0,0 +1,154 @@
 #!/usr/bin/env perl
 use warnings; #sed replacement for -w perl parameter
 # Copyright 2010-2011  Microsoft Corporation
 #                2013  Johns Hopkins University (author: Daniel Povey)
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
 # ---- Command-line handling ------------------------------------------------
 # --pron-probs, if present, must be the first argument.
 $pron_probs = 0;
 if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
  $pron_probs = 1;
  shift @ARGV;
 }
 # Accepted forms: <lexicon>, <lexicon> <silprob> <silphone>, or
 # <lexicon> <silprob> <silphone> <sil_disambig_sym>.
 if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
  print STDERR "  word phone1 phone2 ... phoneN;\n";
  print STDERR "if the --pron-probs option is used, each line is:\n";
  print STDERR "  word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
  print STDERR "this is your responsibility.\n\n";
  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
  exit(1);
 }
 $lexfn = shift @ARGV;
 if (@ARGV == 0) {
  $silprob = 0.0;
 } elsif (@ARGV == 2) {
  ($silprob,$silphone) = @ARGV;
 } else {
  ($silprob,$silphone,$sildisambig) = @ARGV;
 }
 # Costs are negative log probabilities.  They are only needed (and only
 # well defined) when optional silence is enabled.
 if ($silprob != 0.0) {
  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
  $silcost = -log($silprob);
  $nosilcost = -log(1.0 - $silprob);
 }
 # Three-argument open: the file name is taken literally, so a name that
 # starts with a mode character or has surrounding whitespace cannot change
 # how the file is opened.
 open(L, "<", $lexfn) || die "Error opening lexicon $lexfn";
 # Emit the lexicon FST in text form on standard output, one arc per line:
 #   <from-state> <to-state> <input-phone> <output-word-or-eps> [<cost>]
 # followed by one final-state line.  Each word's pronunciation becomes a
 # path that leaves the loop state, outputs the word on its first arc and
 # <eps> on the remaining arcs, and returns to the loop state.
 if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
  $loopstate = 0;
  $nextstate = 1;               # next unallocated state.
  while (<L>) {
    @A = split(" ", $_);
    @A == 0 && die "Empty lexicon line.";
    # <eps> may appear neither as the word nor as a phone.
    foreach $a (@A) {
      if ($a eq "<eps>") {
        die "Bad lexicon line $_ (<eps> is forbidden)";
      }
    }
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    # A zero cost is left off the arc line altogether.
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      # Intermediate phones get a fresh state; the last phone goes back to
      # the loop state.
      if (@A > 0) {
        $ns = $nextstate++;
      } else {
        $ns = $loopstate;
      }
      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
      $word_or_eps = "<eps>";
      $pron_cost_string = ""; # so we only print it on the first arc of the word.
      $s = $ns;
    }
  }
  print "$loopstate\t0\n";      # final-cost.
 } else {                        # have silence probs.
  $startstate = 0;
  $loopstate = 1;
  $silstate = 2;   # state from where we go to loopstate after emitting silence.
  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
  if (!defined $sildisambig) {
    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$loopstate\t$silphone\t<eps>\n";             # no cost.
    $nextstate = 3;
  } else {
    # With a silence disambiguation symbol, silence passes through an extra
    # state that emits the symbol before reaching the loop state.
    $disambigstate = 3;
    $nextstate = 4;
    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
  }
  # NOTE(review): unlike the branch above, this loop does not reject empty
  # lines or <eps> entries.
  while (<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        # Not the last phone: plain arc to a fresh state.
        $ns = $nextstate++;
        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
        $word_or_eps = "<eps>";
        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
        $s = $ns;
      } elsif (!defined($silphone) || $p ne $silphone) {
        # Last phone of a non-silence word: two arcs, one straight back to
        # the loop state and one to the silence state.  For a one-phone word
        # $pron_cost is still unspent here and is folded into both costs.
        # This is non-deterministic but relatively compact,
        # and avoids epsilons.
        $local_nosilcost = $nosilcost + $pron_cost;
        $local_silcost = $silcost + $pron_cost;
        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
      } else {
        # no point putting opt-sil after silence word.
        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
      }
    }
  }
  print "$loopstate\t0\n";      # final-cost.
 }
--- a/utils/fst/make_tlg.sh
+++ b/utils/fst/make_tlg.sh
@ -0,0 +1,49 @@
 #!/bin/bash
 # Build the grammar FST (G.fst) from an ARPA language model and compose it
 # with the lexicon FST (L.fst) and token FST (T.fst) into TLG.fst.
 #
 # usage: utils/fst/make_tlg.sh <lm-dir> <src-lang-dir> <tgt-lang-dir>
 #   <lm-dir>        must contain lm.arpa
 #   <src-lang-dir>  is copied to <tgt-lang-dir>; words.txt, L.fst and T.fst
 #                   are then read from the copy
 #   <tgt-lang-dir>  is removed and recreated; G.fst, LG.fst and TLG.fst are
 #                   written there

 # Without pipefail only the last command of a pipeline decides its status,
 # so a failing arpa2fst or fstcompile stage would pass unnoticed.
 set -o pipefail
 if [ -f path.sh ]; then . path.sh; fi
 if [ $# -ne 3 ]; then
   echo "usage: utils/fst/make_tlg.sh <lm-dir> <src-lang-dir> <tgt-lang-dir>"
   echo "e.g.: utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test"
   exit 1;
 fi
 lm_dir=$1
 src_lang=$2
 tgt_lang=$3
 arpa_lm=${lm_dir}/lm.arpa
 [ ! -f "$arpa_lm" ] && { echo "No such file $arpa_lm"; exit 1;}
 [ ! -d "$src_lang" ] && { echo "No such directory $src_lang"; exit 1;}
 # The target is wiped before the copy, so it must not be the source itself.
 [ "$src_lang" = "$tgt_lang" ] && { echo "<src-lang-dir> and <tgt-lang-dir> must differ"; exit 1;}
 rm -rf "$tgt_lang"
 cp -r "$src_lang" "$tgt_lang" || exit 1;
 # Compose the language model to FST
 # grep -i (--ignore-case): ignore upper/lower case differences.
 # grep -v (--invert-match): keep only the lines that do NOT match.
 # arpa2fst: remove the embedded symbols from the FST
 # arpa2fst: make sure there are no out-of-vocabulary words in the language model
 # arpa2fst: remove "illegal" sequences of the start and end-ofsentence symbols
 # eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
 # s2eps.pl: replaces <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
 # G.fst, the disambiguation symbol #0 only appears on the input side
 # eps2disambig.pl and s2eps.pl are possibly only needed for the following `fstrmepsilon`.
 cat "$arpa_lm" | \
   grep -v '<s> <s>' | \
   grep -v '</s> <s>' | \
   grep -v '</s> </s>' | \
   grep -v -i '<unk>' | \
   grep -v -i '<spoken_noise>' | \
   arpa2fst --read-symbol-table="$tgt_lang/words.txt" --keep-symbols=true - | fstprint | \
   utils/fst/eps2disambig.pl | utils/fst/s2eps.pl | fstcompile --isymbols="$tgt_lang/words.txt" \
     --osymbols="$tgt_lang/words.txt"  --keep_isymbols=false --keep_osymbols=false | \
    fstrmepsilon | fstarcsort --sort_type=ilabel > "$tgt_lang/G.fst" || exit 1;
 echo  "Checking how stochastic G is (the first of these numbers should be small):"
 # Informational only: the exit status is deliberately not checked.
 fstisstochastic "$tgt_lang/G.fst"
 # Compose the token, lexicon and language-model FST into the final decoding graph
 # minimization: the same as minimization algorithm that applies to weighted acceptors;
 #               the only change relevant here is that it avoids pushing weights,
 #               hence preserving stochasticity
 fsttablecompose "$tgt_lang/L.fst" "$tgt_lang/G.fst" | fstdeterminizestar --use-log=true | \
    fstminimizeencoded | fstarcsort --sort_type=ilabel > "$tgt_lang/LG.fst" || exit 1;
 fsttablecompose "$tgt_lang/T.fst" "$tgt_lang/LG.fst" > "$tgt_lang/TLG.fst" || exit 1;
 echo "Composing decoding graph TLG.fst succeeded"
 #rm -r $tgt_lang/LG.fst   # We don't need to keep this intermediate FST
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@ -0,0 +1,90 @@
 #!/usr/bin/env python3
 import argparse
 import sys
 def contain_oov(units, unit_table):
    """Return True if any element of ``units`` is missing from ``unit_table``.

    Args:
        units: iterable of model units (the characters of a word, or its
            sentencepiece pieces).
        unit_table: container of the units known to the model.

    The table is an explicit parameter because it is built inside ``main``;
    reading it as a global raised NameError on the first lookup.
    """
    return any(unit not in unit_table for unit in units)


 def main(args):
    """Convert a raw lexicon into a ``word unit0 ... unitN`` lexicon.

    Args:
        args: parsed command-line namespace with ``unit_file``,
            ``in_lexicon``, ``out_lexicon`` and ``bpemodel`` (None selects
            character mode, otherwise the path of a sentencepiece model).

    Only the first pronunciation of each word is kept.  Words containing a
    unit that is not in the unit file are reported on stdout and skipped.
    """
    # load `unit` or `vocab` file
    unit_table = set()
    with open(args.unit_file, 'r', encoding='utf8') as fin:
        for line in fin:
            unit = line.strip()
            unit_table.add(unit)
    # load spm model
    bpemode = args.bpemodel
    if bpemode:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpemodel)
    # source words already written; used to filter polyphones
    lexicon_table = set()
    with open(args.in_lexicon, 'r', encoding='utf8') as fin, \
            open(args.out_lexicon, 'w', encoding='utf8') as fout:
        for line in fin:
            fields = line.split()
            if not fields:
                # blank line: nothing to convert
                continue
            word = fields[0]
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
                continue
            if word == '<SPOKEN_NOISE>':
                continue
            # each word only has one pronunciation for e2e system
            if word in lexicon_table:
                continue
            # Remember the word as it appears in the source lexicon; the
            # written form may get a '▁' prefix below, and keying the filter
            # on that form would let later duplicates through.
            source_word = word
            if bpemode:
                pieces = sp.EncodeAsPieces(word)
                if contain_oov(pieces, unit_table):
                    print('Ignoring words {}, which contains oov unit'.
                          format(word.strip('▁')))
                    continue
                # every piece is known to be in the unit table at this point
                chars = ' '.join(pieces)
            else:
                # ignore words with OOV
                if contain_oov(word, unit_table):
                    print('Ignoring words {}, which contains oov unit'.
                          format(word))
                    continue
                # Optional, append ▁ in front of english word
                # we assume the model unit of our e2e system is char now.
                if word.encode('utf8').isalpha() and '▁' in unit_table:
                    word = '▁' + word
                chars = ' '.join(word)  # word is a char list
            fout.write('{} {}\n'.format(word, chars))
            lexicon_table.add(source_word)
 if __name__ == '__main__':
    # Command-line entry point: three required paths plus an optional
    # sentencepiece model, handed to main() as one namespace.
    parser = argparse.ArgumentParser(
        description='FST: preprae e2e(char/spm) dict')
    required_options = (
        ('--unit_file',
         'e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices'),
        ('--in_lexicon', 'raw lexicon file. line: word ph0 ... phn'),
        ('--out_lexicon', 'output lexicon file. line: word char0 ... charn'),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument('--bpemodel', default=None, help='bpemodel')
    args = parser.parse_args()
    print(args)
    main(args)
--- a/utils/fst/remove_oovs.pl
+++ b/utils/fst/remove_oovs.pl
@ -0,0 +1,42 @@
 #!/usr/bin/env perl
 # Copyright 2010-2011 Microsoft Corporation
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # This script removes lines that contain these OOVs on either the
 # third or fourth fields  of the line.  It is intended to remove arcs
 # with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
 # Usage: remove_oovs.pl unk_list.txt [ printed-fst ]
 # One or two arguments are accepted.  The two bounds must be joined with
 # "||": joined with "&&" the test can never be true and a missing argument
 # goes undetected.
 if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
 }
 $unklist = shift @ARGV;
 # Three-argument open: the file name is taken literally.
 open(S, "<", $unklist) || die "Failed opening unknown-symbol list $unklist\n";
 # %unk: set of symbols to remove, exactly one symbol per line.
 while(<S>){
    @A = split(" ", $_);
    @A == 1 || die "Bad line in unknown-symbol list: $_";
    $unk{$A[0]} = 1;
 }
 close(S);
 $num_removed = 0;
 # Copy the printed FST (remaining argument, or standard input) to standard
 # output, dropping every line whose 3rd or 4th field is a listed symbol.
 while(<>){
    @A = split(" ", $_);
    # Lines with fewer than four fields (e.g. final-state lines) have no
    # label in that position and are always kept.
    if ((defined $A[2] && defined $unk{$A[2]}) ||
        (defined $A[3] && defined $unk{$A[3]})) {
        $num_removed++;
    } else {
        print;
    }
 }
 print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
--- a/utils/fst/rnnt_token_fst.py
+++ b/utils/fst/rnnt_token_fst.py
@ -0,0 +1,38 @@
 #!/usr/bin/env python3
 import argparse
 def main(args):
    """Print a single-state RNN-T token FST in OpenFst text format to stdout.

    Args:
        args: parsed command-line namespace; only ``args.token_file`` is
            read.  The text before the first space on each line is the
            token symbol.

    Every arc is a self-loop on state 0, which is also the final state.
    """
    # <blank> is consumed without producing output
    print('0 0 <blank> <eps>')
    with open(args.token_file, 'r') as token_fh:
        for raw_line in token_fh:
            symbol = raw_line.strip().split(' ')[0]
            if symbol in ('<eps>', '<blank>'):
                continue
            if '#' in symbol:
                # disambiguation symbol: emitted without consuming input
                print(f'0 0 <eps> {symbol}')
            else:
                # ordinary token: consumed and emitted unchanged
                print(f'0 0 {symbol} {symbol}')
    # final state
    print('0')
 if __name__ == "__main__":
    # Command-line entry point: parse --token_file and print the FST text.
    import sys

    parser = argparse.ArgumentParser(
        description='FST: RNN-T Token FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
    # main() writes the FST text to stdout; keep the argument dump out of
    # that stream so the output can be fed to an FST compiler unchanged.
    print(args, file=sys.stderr)
    main(args)
--- a/utils/fst/s2eps.pl
+++ b/utils/fst/s2eps.pl
@ -0,0 +1,27 @@
 #!/usr/bin/env perl
 # Copyright 2010-2011 Microsoft Corporation
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #  http://www.apache.org/licenses/LICENSE-2.0
 #
 # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
 # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
 # MERCHANTABLITY OR NON-INFRINGEMENT.
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 # This script replaces <s> and </s> with <eps> (on both input and output sides),
 # for the G.fst acceptor.
 # Filter: read printed FST lines from the files named on the command line (or
 # standard input).  On lines with at least four fields, a sentence-boundary
 # symbol in the 3rd or 4th field is replaced by <eps>.  Every line is written
 # back with its fields joined by tabs.
 my %is_boundary = ("<s>" => 1, "</s>" => 1);
 while (my $line = <>) {
    my @fields = split(" ", $line);
    if (scalar(@fields) >= 4) {
        for my $idx (2, 3) {
            $fields[$idx] = "<eps>" if $is_boundary{$fields[$idx]};
        }
    }
    print join("\t", @fields) . "\n";
 }