Merge pull request #729 from PaddlePaddle/fst

TLG graph
4 years ago · fd8a4ec179
parent ab5411ec16 3e5f587537
commit fd8a4ec179
37 changed files with 1372 additions and 8 deletions
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
@ -17,7 +17,6 @@ import os
 import socket
 import sys

-import auto_log
 from paddle import inference

 FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
@ -156,6 +155,7 @@ class Autolog:
                 batch_size,
                 model_name="DeepSpeech",
                 model_precision="fp32"):
+        import auto_log
        pid = os.getpid()
        gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
        infer_config = inference.Config()
--- a/examples/aishell/s1/local/aishell_train_lms.sh
+++ b/examples/aishell/s1/local/aishell_train_lms.sh
@ -0,0 +1,58 @@
+#!/bin/bash
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/lm/text
+lexicon=data/local/dict/lexicon.txt
+
+# Abort early if a required input file is missing.  The test has to use the
+# loop variable "$f": with an unset, unquoted variable `[ ! -f ]` is always
+# false and the check would never fire.
+for f in "$text" "$lexicon"; do
+  [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
+done
+
+# Check SRILM tools
+if ! which ngram-count > /dev/null; then
+    echo "srilm tools are not found, please download it and install it from: "
+    echo "http://www.speech.sri.com/projects/srilm/download.html"
+    echo "Then add the tools to your PATH"
+    exit 1
+fi
+
+# This script takes no arguments.  It assumes you have already run
+# aishell_data_prep.sh.
+# It takes as input the files
+# data/local/lm/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+   sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add  one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
+
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+    # kaldi_lm results
+mkdir -p $dir
+# Drop the utterance id (field 1) and split the text: the first $heldout_sent
+# sentences are held out for the perplexity check, the rest is LM training data.
+# `printf "%s"` keeps words containing '%' intact, and tail starts one line
+# after the held-out block so no held-out sentence leaks into the training set.
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf "%s", $n; if(n<NF) printf " "; else print ""; }}' | \
+  head -n $heldout_sent > $dir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf "%s", $n; if(n<NF) printf " "; else print ""; }}' | \
+  tail -n +$((heldout_sent + 1)) > $dir/train
+
+ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
+  -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
+ngram -lm $dir/lm.arpa -ppl $dir/heldout
--- a/examples/aishell/s1/local/tlg.sh
+++ b/examples/aishell/s1/local/tlg.sh
@ -0,0 +1,52 @@
+#!/bin/bash
+
+set -eo pipefail
+
+stage=-1
+stop_stage=100
+corpus=aishell
+lmtype=srilm
+
+source utils/parse_options.sh
+
+data=${MAIN_ROOT}/examples/dataset/${corpus}
+lexicon=$data/resource_aishell/lexicon.txt
+text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # 7.1 Prepare dict
+    unit_file=data/vocab.txt
+    mkdir -p data/local/dict
+    cp $unit_file data/local/dict/units.txt
+    utils/fst/prepare_dict.py \
+        --unit_file $unit_file \
+        --in_lexicon ${lexicon} \
+        --out_lexicon data/local/dict/lexicon.txt
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # 7.2 Train lm
+    # All LM inputs and outputs of this stage live under data/local/lm.
+    lm=data/local/lm
+    mkdir -p data/train
+    mkdir -p $lm
+    # Convert the training manifest into files under data/train
+    # (presumably including data/train/text, which is read below -- TODO confirm).
+    utils/manifest_key_value.py \
+        --manifest_path data/manifest.train \
+        --output_path data/train
+    # Keep only the transcript lines whose first field also appears as a
+    # first field in data/train/text.
+    utils/filter_scp.pl data/train/text \
+        $text > $lm/text
+    # Train the LM either with the SRILM recipe or with utils/ngram_train.sh
+    # (order 3, output $lm/lm.arpa).
+    if [ $lmtype == 'srilm' ];then
+        local/aishell_train_lms.sh
+    else
+        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 
+    # 7.3 Build decoding TLG
+    utils/fst/compile_lexicon_token_fst.sh \
+        data/local/dict data/local/tmp data/local/lang
+    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
+fi
+
+echo "Aishell build TLG done."
+exit 0
--- a/examples/aishell/s1/path.sh
+++ b/examples/aishell/s1/path.sh
@ -4,11 +4,25 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C

 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8 
+export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

-
+# model exp
 MODEL=u2
 export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
+
+
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
+
+# Kaldi
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
+. $KALDI_ROOT/tools/config/common_path.sh || true
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@ -42,3 +42,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # export ckpt avg_n
    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
+
+ # Optionally, you can add LM and test it with runtime.
+ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # train lm and build TLG
+    ./local/tlg.sh --corpus aishell --lmtype srilm 
+ fi
--- a/examples/aishell/s1/utils
+++ b/examples/aishell/s1/utils
@ -0,0 +1 @@
+../../../utils
--- a/examples/dataset/aishell/.gitignore
+++ b/examples/dataset/aishell/.gitignore
@ -1,4 +1,5 @@
 data_aishell*
 *.meta
 manifest.*
-*.tgz
+*.tgz
+resource_aishell
--- a/tools/install/README.md
+++ b/tools/install/README.md
--- a/tools/install/install_gcc.sh
+++ b/tools/install/install_gcc.sh
--- a/tools/install/install_kaldi.sh
+++ b/tools/install/install_kaldi.sh
--- a/tools/extras/install_kenlm.sh
+++ b/tools/extras/install_kenlm.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+
+apt install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
+
+apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50  && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50
+
+test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
+
+rm -rf kenlm/build && mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4 && make install
--- a/tools/extras/install_liblbfgs.sh
+++ b/tools/extras/install_liblbfgs.sh
@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+
+VER=1.10
+
+WGET=${WGET:-wget}
+
+if [ ! -f liblbfgs-$VER.tar.gz ]; then
+  if [ -d "$DOWNLOAD_DIR" ]; then
+    cp -p "$DOWNLOAD_DIR/liblbfgs-$VER.tar.gz" . || exit 1
+  else
+    $WGET https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz || exit 1
+  fi
+fi
+
+tar -xzf liblbfgs-$VER.tar.gz
+cd liblbfgs-$VER
+./configure --prefix=`pwd`
+make
+# due to the liblbfgs project directory structure, we have to use -i
+# but the errors are completely harmless
+make -i install
+cd ..
+
+# Append the libLBFGS environment settings to env.sh, unless env.sh already
+# defines LIBLBFGS.  Everything echoed inside the subshell goes to env.sh;
+# diagnostics go to stderr.
+(
+  # Forget any inherited value so that only env.sh itself is consulted below.
+  [ ! -z "${LIBLBFGS}" ] && \
+    echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \
+    unset LIBLBFGS
+
+  [ -f ./env.sh ] && . ./env.sh
+
+  # Still set after sourcing env.sh: the config is already there, write nothing.
+  [ ! -z "${LIBLBFGS}" ] && \
+    echo >&2 "libLBFGS config is already in env.sh" && exit
+
+  wd=`pwd`
+  wd=`readlink -f $wd || pwd`
+
+  # Use $VER so the exported path always matches the directory unpacked above.
+  echo "export LIBLBFGS=$wd/liblbfgs-$VER"
+  echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs
+) >> env.sh
+
--- a/tools/install/install_mfa.sh
+++ b/tools/install/install_mfa.sh
--- a/tools/install/install_miniconda.sh
+++ b/tools/install/install_miniconda.sh
--- a/tools/install/install_mkl.sh
+++ b/tools/install/install_mkl.sh
--- a/tools/install/install_ngram.sh
+++ b/tools/install/install_ngram.sh
--- a/tools/install/install_openblas.sh
+++ b/tools/install/install_openblas.sh
--- a/tools/install/install_openfst.sh
+++ b/tools/install/install_openfst.sh
--- a/tools/install/install_pynini.sh
+++ b/tools/install/install_pynini.sh
--- a/tools/extras/install_srilm.sh
+++ b/tools/extras/install_srilm.sh
@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+
+current_path=`pwd`
+current_dir=`basename "$current_path"`
+
+if [ "tools" != "$current_dir" ]; then
+    echo "You should run this script in tools/ directory!!"
+    exit 1
+fi
+
+if [ ! -d liblbfgs-1.10 ]; then
+    echo Installing libLBFGS library to support MaxEnt LMs
+    bash extras/install_liblbfgs.sh || exit 1
+fi
+
+# http://www.speech.sri.com/projects/srilm/download.html
+if [ ! -f srilm.tgz ] && [ ! -f srilm.tar.gz ]; then  # Changed format type from tgz to tar.gz as the srilm v1.7.3 downloads as tar.gz
+  echo This script cannot install SRILM in a completely automatic
+  echo way because you need to put your address in a download form.
+  echo Please download SRILM from http://www.speech.sri.com/projects/srilm/download.html
+  echo put it in ./srilm.tar.gz , then run this script.
+  echo Note: You may have to rename the downloaded file to remove version name from filename eg: mv srilm-1.7.3.tar.gz srilm.tar.gz
+  exit 1
+fi
+
+! which gawk 2>/dev/null && \
+   echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;
+
+mkdir -p srilm
+cd srilm
+
+
+if [ -f ../srilm.tgz ]; then
+    tar -xvzf ../srilm.tgz # Old SRILM format
+elif [  -f ../srilm.tar.gz ]; then
+    tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz
+fi
+
+# Split the version found in RELEASE (e.g. 1.7.1) into its components.
+major=`awk -F. '{ print $1 }' RELEASE`
+minor=`awk -F. '{ print $2 }' RELEASE`
+micro=`awk -F. '{ print $3 }' RELEASE`
+
+# Compare (major, minor, micro) against 1.7.1 component by component, so that
+# e.g. 1.6.9 counts as earlier than 1.7.1.  A missing micro is treated as 0.
+needs_patch=false
+if [ "$major" -lt 1 ]; then
+  needs_patch=true
+elif [ "$major" -eq 1 ]; then
+  if [ "$minor" -lt 7 ]; then
+    needs_patch=true
+  elif [ "$minor" -eq 7 ] && [ "${micro:-0}" -le 1 ]; then
+    needs_patch=true
+  fi
+fi
+
+if $needs_patch; then
+  echo "Detected version 1.7.1 or earlier. Applying patch."
+  patch -p0 < ../extras/srilm.patch
+fi
+
+# set the SRILM variable in the top-level Makefile to this directory.
+cp Makefile tmpf
+
+cat tmpf | awk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
+  > Makefile || exit 1
+rm tmpf
+
+mtype=`sbin/machine-type`
+
+echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
+grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
+    sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
+    >> common/Makefile.machine.$mtype
+
+grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
+    sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
+    >> common/Makefile.machine.$mtype
+
+make || exit
+
+cd ..
+(
+  [ ! -z "${SRILM}" ] && \
+    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
+    unset SRILM
+
+  [ -f ./env.sh ] && . ./env.sh
+
+  [ ! -z "${SRILM}" ] && \
+    echo >&2 "SRILM config is already in env.sh" && exit
+
+  wd=`pwd`
+  wd=`readlink -f $wd || pwd`
+
+  echo "export SRILM=$wd/srilm"
+  dirs="\${PATH}"
+  for directory in $(cd srilm && find bin -type d ) ; do
+    dirs="$dirs:\${SRILM}/$directory"
+  done
+  echo "export PATH=$dirs"
+) >> env.sh
+
+echo >&2 "Installation of SRILM finished successfully"
+echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
--- a/tools/extras/srilm.patch
+++ b/tools/extras/srilm.patch
@ -0,0 +1,17 @@
+--- dstruct/src/Trie.orig	2016-11-08 19:53:40.524000000 +0000
+++ dstruct/src/Trie.cc	2016-11-08 19:53:59.088000000 +0000
+@@ -200,11 +200,14 @@
+ 	if (removedData == 0) {
+ 	    Trie<KeyT,DataT> node;
+ 	    if (sub.remove(keys[0], &node)) {
+#if !defined(__GNUC__) || !(__GNUC__ >= 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4) 
+ 		/*
+ 		 * XXX: Call subtrie destructor explicitly since we're not
+ 		 * passing the removed node to the caller.
+		 * !!! Triggers bug with gcc >= 4.9 optimization !!!
+ 		 */
+ 	        node.~Trie();
+#endif
+ 		return true;
+ 	    } else {
+ 		return false;
--- a/utils/init.py
+++ b/utils/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/utils/filter_scp.pl
+++ b/utils/filter_scp.pl
@ -0,0 +1,87 @@
+#!/usr/bin/env perl
+# Copyright 2010-2012 Microsoft Corporation
+#                     Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This script takes a list of utterance-ids or any file whose first field
+# of each line is an utterance-id, and filters an scp
+# file (or any file whose "n-th" field is an utterance id), printing
+# out only those lines whose "n-th" field is in id_list. The index of
+# the "n-th" field is 1, by default, but can be changed by using
+# the -f <n> switch
+
+$exclude = 0;
+$field = 1;
+$shifted = 0;
+
+do {
+  $shifted=0;
+  if ($ARGV[0] eq "--exclude") {
+    $exclude = 1;
+    shift @ARGV;
+    $shifted=1;
+  }
+  if ($ARGV[0] eq "-f") {
+    $field = $ARGV[1];
+    shift @ARGV; shift @ARGV;
+    $shifted=1
+  }
+} while ($shifted);
+
+if(@ARGV < 1 || @ARGV > 2) {
+  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
+      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
+      "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
+      "only the lines that were *not* in id_list.\n" .
+      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
+      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
+      "-f option, add 1 to the argument.\n" .
+      "See also: utils/filter_scp.pl .\n";
+}
+
+
+# Load the id-list: the first whitespace-separated field of every line becomes
+# a key of %seen; any further fields on the line are ignored.
+$idlist = shift @ARGV;
+open(F, "<$idlist") || die "Could not open id-list file $idlist";
+while(<F>) {
+  @A = split;
+  # A line with no fields at all (empty / whitespace only) is an error.
+  @A>=1 || die "Invalid id-list file line $_";
+  $seen{$A[0]} = 1;
+}
+
+# Filter the remaining input (the optional in.scp argument, otherwise stdin):
+# without --exclude print the lines whose key is in %seen, with --exclude
+# print the lines whose key is not in %seen.
+if ($field == 1) { # Treat this as special case, since it is common.
+  while(<>) {
+    # Capture the first run of non-whitespace characters without splitting
+    # the whole line.
+    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
+    # $1 is what we filter on.
+    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
+      print $_;
+    }
+  }
+} else {
+  while(<>) {
+    @A = split;
+    @A > 0 || die "Invalid scp file line $_";
+    # The line must have at least $field fields ($field is 1-based).
+    @A >= $field || die "Invalid scp file line $_";
+    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
+      print $_;
+    }
+  }
+}
+
+# tests:
+# the following should print "foo 1"
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
+# the following should print "bar 2".
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
--- a/utils/fst/add_lex_disambig.pl
+++ b/utils/fst/add_lex_disambig.pl
@ -0,0 +1,195 @@
+#!/usr/bin/env perl
+#  Copyright 2010-2011  Microsoft Corporation
+#            2013-2016  Johns Hopkins University (author: Daniel Povey)
+#                 2015  Hainan Xu
+#                 2015  Guoguo Chen
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds disambiguation symbols to a lexicon.
+# Outputs still in the normal lexicon format.
+# Disambig syms are numbered #1, #2, #3, etc. (#0
+# reserved for symbol in grammar).
+# Outputs the number of disambig syms to the standard output.
+# With the --pron-probs option, expects the second field
+# of each lexicon line to be a pron-prob.
+# With the --sil-probs option, expects three additional
+# fields after the pron-prob, representing various components
+# of the silence probability model.
+
+$pron_probs = 0;
+$sil_probs = 0;
+$first_allowed_disambig = 1;
+
+for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
+  if ($ARGV[0] eq "--pron-probs") {
+    $pron_probs = 1;
+    shift @ARGV;
+  }
+  if ($ARGV[0] eq "--sil-probs") {
+    $sil_probs = 1;
+    shift @ARGV;
+  }
+  if ($ARGV[0] eq "--first-allowed-disambig") {
+    $first_allowed_disambig = 0 + $ARGV[1];
+    if ($first_allowed_disambig < 1) {
+      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
+    }
+    shift @ARGV;
+    shift @ARGV;
+  }
+}
+
+if (@ARGV != 2) {
+  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
+    "This script adds disambiguation symbols to a lexicon in order to\n" .
+    "make decoding graphs determinizable; it adds pseudo-phone\n" .
+    "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
+    "to ensure that all pronunciations are different, and that none\n" .
+    "is a prefix of another.\n" .
+    "It prints to the standard output the number of the largest-numbered" .
+    "disambiguation symbol that was used.\n" .
+    "\n" .
+    "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field\n" .
+    "           --sil-probs        [should be with --pron-probs option]\n" .
+    "                              Expect 3 extra fields after the pron-probs, for aspects of\n" .
+    "                              the silence probability model\n" .
+    "           --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
+    "                              that this script is allowed to add.  By default this is\n" .
+    "                              #1, but you can set this to a larger value using this option.\n" .
+    "e.g.:\n" .
+    " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
+    " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
+    " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
+}
+
+
+$lexfn = shift @ARGV;
+$lexoutfn = shift @ARGV;
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+# (1)  Read in the lexicon.
+@L = ( );
+while(<L>) {
+    @A = split(" ", $_);
+    push @L, join(" ", @A);
+}
+
+# (2) Work out the count of each phone-sequence in the
+# lexicon.
+
+# Validate every lexicon line and count how many entries share each phone
+# sequence (%count is keyed by the space-joined phone sequence).
+foreach $l (@L) {
+    @A = split(" ", $l);
+    shift @A; # Remove word.
+    if ($pron_probs) {
+      $p = shift @A;
+      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
+    }
+    if ($sil_probs) {
+      # Three silence-model fields follow the pron-prob: one probability in
+      # (0, 1] and two strictly positive correction terms.
+      $silp = shift @A;
+      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
+      $correction = shift @A;
+      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
+      $correction = shift @A;
+      if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
+    }
+    if (!(@A)) {
+      # Report the offending line itself ($l), not the regex capture $1.
+      die "Bad lexicon line $l, no phone in phone list";
+    }
+    $count{join(" ",@A)}++;
+}
+
+# (3) For each left sub-sequence of each phone-sequence, note down
+# that it exists (for identifying prefixes of longer strings).
+
+# Record every proper prefix (including the empty one) of every phone
+# sequence in %issubseq, so that sequences which are a prefix of another
+# entry can be detected later.
+foreach $l (@L) {
+    @A = split(" ", $l);
+    shift @A; # Remove word.
+    if ($pron_probs) { shift @A; } # remove pron-prob.
+    if ($sil_probs) {
+      # With --sil-probs there are three extra fields after the pron-prob;
+      # all of them must be removed or they end up inside the prefix keys.
+      shift @A; # Remove silprob
+      shift @A; # Remove first correction term
+      shift @A; # Remove second correction term
+    }
+    while(@A > 0) {
+        pop @A;  # Remove last phone
+        $issubseq{join(" ",@A)} = 1;
+    }
+}
+
+# (4) For each entry in the lexicon:
+#  if the phone sequence is unique and is not a
+#  prefix of another word, no disambig symbol.
+#  Else output #1, or #2, #3, ... if the same phone-seq
+#  has already been assigned a disambig symbol.
+
+
+open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
+
+# max_disambig will always be the highest-numbered disambiguation symbol that
+# has been used so far.
+$max_disambig = $first_allowed_disambig - 1;
+
+# Write the output lexicon: each entry is copied, and a disambiguation symbol
+# is appended to its phone sequence when that sequence is shared with another
+# entry or is a prefix of another entry's sequence.
+foreach $l (@L) {
+  @A = split(" ", $l);
+  $word = shift @A;
+  if ($pron_probs) {
+    $pron_prob = shift @A;
+  }
+  if ($sil_probs) {
+    $sil_word_prob = shift @A;
+    $word_sil_correction = shift @A;
+    $prev_nonsil_correction = shift @A
+  }
+  # What is left in @A is the phone sequence.
+  $phnseq = join(" ", @A);
+  if (!defined $issubseq{$phnseq}
+      && $count{$phnseq} == 1) {
+    ;                           # Do nothing.
+  } else {
+    if ($phnseq eq "") {        # need disambig symbols for the empty string
+      # that are not used anywhere else.
+      $max_disambig++;
+      $reserved_for_the_empty_string{$max_disambig} = 1;
+      $phnseq = "#$max_disambig";
+    } else {
+      # Symbols are numbered per phone sequence: start at the first allowed
+      # number, or continue after the last one used for this sequence.
+      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
+      if (!defined $cur_disambig) {
+        $cur_disambig = $first_allowed_disambig;
+      } else {
+        $cur_disambig++;           # Get a number that has not been used yet for
+                                   # this phone sequence.
+      }
+      # Skip numbers that were handed out to empty phone sequences above.
+      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
+        $cur_disambig++;
+      }
+      if ($cur_disambig > $max_disambig) {
+        $max_disambig = $cur_disambig;
+      }
+      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
+      $phnseq = $phnseq . " #" . $cur_disambig;
+    }
+  }
+  # Fields are tab-separated.  Note that the silprob fields are only written
+  # when --pron-probs is given as well.
+  if ($pron_probs) {
+    if ($sil_probs) {
+      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
+    } else {
+      print O "$word\t$pron_prob\t$phnseq\n";
+    }
+  } else {
+    print O "$word\t$phnseq\n";
+  }
+}
+
+print $max_disambig . "\n";
--- a/utils/fst/compile_lexicon_token_fst.sh
+++ b/utils/fst/compile_lexicon_token_fst.sh
@ -0,0 +1,88 @@
+#!/bin/bash
+# Copyright 2015       Yajie Miao    (Carnegie Mellon University)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
+# phoneme and character-based lexicons.
+set -eo pipefail
+. utils/parse_options.sh
+
+if [ $# -ne 3 ]; then
+  echo "usage: utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
+  echo "e.g.: utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
+  echo "<dict-src-dir> should contain the following files:"
+  echo "lexicon.txt lexicon_numbers.txt units.txt"
+  echo "options: "
+  exit 1;
+fi
+
+srcdir=$1
+tmpdir=$2
+dir=$3
+mkdir -p $dir $tmpdir
+
+[ -f path.sh ] && . ./path.sh
+
+cp $srcdir/units.txt $dir
+
+# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
+# But utils/fst/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
+perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;
+
+# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
+# Without these symbols, determinization will fail.
+# default first disambiguation is #1
+ndisambig=`utils/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
+# add #0 (#0 reserved for symbol in grammar).
+ndisambig=$[$ndisambig+1];
+
+( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
+
+# Get the full list of CTC tokens used in the FST. These tokens include <eps>, the blank <blank>,
+# the actual model units, and the disambiguation symbols.
+cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
+(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
+
+# ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
+# so here just use simple ctc_token_fst
+utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
+  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
+  fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
+
+# Encode the words with indices. Will be used in lexicon and language model FST compiling.
+# `sort -u` makes sure a word with several lexicon entries gets exactly one id.
+# <eps> is 0, the words follow from 1, then #0, <s> and </s> take the next three ids.
+cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort -u | awk '
+  BEGIN {
+    print "<eps> 0";
+  }
+  {
+    printf("%s %d\n", $1, NR);
+  }
+  END {
+    printf("#0 %d\n", NR+1);
+    printf("<s> %d\n", NR+2);
+    printf("</s> %d\n", NR+3);
+  }' > $dir/words.txt || exit 1;
+
+# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
+token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
+word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
+
+utils/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
+  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
+  --keep_isymbols=false --keep_osymbols=false |   \
+  fstaddselfloops  "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
+  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
+
+echo "Lexicon and Token FSTs compiling succeeded"
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import argparse
+
+
+def main(args):
+    """Print the CTC token transducer to stdout as OpenFst text-format arcs.
+
+    State 0 is both the start state and the only final state.  State 1
+    carries a ``<blank>`` self-loop for blanks before a token and state 2
+    one for blanks after it; every ordinary token gets its own state
+    between them.  A token whose name contains ``#`` is treated as a
+    disambiguation symbol and becomes an ``<eps>:token`` self-loop on
+    state 0.
+
+    Args:
+        args: Parsed command-line namespace; only ``args.token_file`` is
+            read.  The first whitespace-separated field of each line of
+            that file is the token name; blank lines are ignored.
+    """
+    # <eps> entry
+    print('0 1 <eps> <eps>')
+    # skip beginning and ending <blank>
+    print('1 1 <blank> <eps>')
+    print('2 2 <blank> <eps>')
+    # <eps> exit
+    print('2 0 <eps> <eps>')
+
+    # linking `token` between node 1 and node 2
+    # Decode explicitly as UTF-8 so non-ASCII tokens do not depend on the
+    # locale the script happens to run under.
+    with open(args.token_file, 'r', encoding='utf-8') as fin:
+        node = 3
+        for entry in fin:
+            fields = entry.split()
+            if not fields:
+                # Blank line: there is no token, so emit nothing.
+                continue
+            phone = fields[0]
+            if phone in ('<eps>', '<blank>'):
+                continue
+            if '#' in phone:
+                # disambiguation symbol
+                # `token` may be followed by a disambiguation symbol
+                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
+            else:
+                # eating `token`
+                print('{} {} {} {}'.format(1, node, phone, phone))
+                # remove repeating `token`
+                print('{} {} {} {}'.format(node, node, phone, '<eps>'))
+                # leaving `token`
+                print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
+            # The state id advances for disambiguation symbols too, which
+            # keeps the numbering of the token states unchanged.
+            node += 1
+    # Final node
+    print('0')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='FST: CTC Token FST transducer')
+    parser.add_argument(
+        '--token_file',
+        required=True,
+        help='e2e model token file. line: token(char/phone/spm/disambigous)')
+
+    args = parser.parse_args()
+
+    main(args)
--- a/utils/fst/ctc_token_fst_corrected.py
+++ b/utils/fst/ctc_token_fst_corrected.py
@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+import argparse
+
+
+def il(n):
+    """Return the FST input label for index ``n``, i.e. ``n`` shifted up by one."""
+    label = 1 + n
+    return label
+
+
+def ol(n):
+    """Return the FST output label for index ``n``, i.e. ``n`` shifted up by one."""
+    label = 1 + n
+    return label
+
+
+def s(n):
+    """Return the FST state id for index ``n`` (the index itself, unchanged)."""
+    state = n
+    return state
+
+
+def main(args):
+    """Print the unfolded CTC token transducer to stdout as numeric FST arcs.
+
+    The token file is only used to count the ordinary tokens and the
+    disambiguation symbols (names starting with ``#``); ``<eps>`` and
+    ``<blank>`` lines are not counted.  State 0 is the start state and
+    state ``i`` (1..phone_count) is the state of the i-th ordinary token.
+    Every state is final.
+
+    Args:
+        args: Parsed command-line namespace; only ``args.token_file`` is
+            read.  The first whitespace-separated field of each line is the
+            token name; blank lines are ignored.
+    """
+    # Decode explicitly as UTF-8 so non-ASCII tokens do not depend on the
+    # locale the script happens to run under.
+    with open(args.token_file, encoding='utf-8') as f:
+        lines = f.readlines()
+    # token count w/o <blank> <eps>
+    phone_count = 0
+    disambig_count = 0
+    for line in lines:
+        sp = line.split()
+        if not sp:
+            # Blank line: no token to count (sp[0] would raise IndexError).
+            continue
+        phone = sp[0]
+        if phone == '<eps>' or phone == '<blank>':
+            continue
+        if phone.startswith('#'):
+            disambig_count += 1
+        else:
+            phone_count += 1
+
+    # 1. add start state
+    # first token is <blank>:0
+    print('0 0 {} 0'.format(il(0)))
+
+    # 2. 0 -> i, i -> i, i -> 0
+    # non-blank token start from 1
+    for i in range(1, phone_count + 1):
+        # eating `token`
+        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
+        # remove repeating `token`
+        print('{} {} {} 0'.format(s(i), s(i), il(i)))
+        # skip ending <blank> `token`
+        print('{} 0 {} 0'.format(s(i), il(0)))
+
+    # 3. i -> other phone
+    # non-blank token to other non-blank token
+    for i in range(1, phone_count + 1):
+        for j in range(1, phone_count + 1):
+            if i != j:
+                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))
+
+    # 4. add disambiguation arcs on every final state
+    # blank and non-blank token may be followed by a disambiguation `token`;
+    # the output labels used are phone_count + 2 .. phone_count + disambig_count + 1
+    for i in range(0, phone_count + 1):
+        for j in range(phone_count + 2, phone_count + disambig_count + 2):
+            print('{} {} {} {}'.format(s(i), s(i), 0, j))
+
+    # 5. every i is final state
+    # blank and non-blank `token` are final state
+    for i in range(0, phone_count + 1):
+        print(s(i))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='FST: CTC Token unfold FST transducer')
+    parser.add_argument(
+        '--token_file',
+        required=True,
+        help='e2e model token file. line: token(char/phone/spm/disambigous)')
+    args = parser.parse_args()
+
+    main(args)
--- a/utils/fst/eps2disambig.pl
+++ b/utils/fst/eps2disambig.pl
@ -0,0 +1,29 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+#                2015 Guoguo Chen
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces epsilon with #0 on the input side only, of the G.fst
+# acceptor.
+
+# Process the text-format FST line by line (files given as arguments,
+# otherwise stdin) and print each line, possibly rewritten, to stdout.
+while(<>){
+  # Refuse input that already contains #0 as a whitespace-delimited symbol.
+  if (/\s+#0\s+/) {
+    print STDERR "$0: ERROR: LM has word #0, " .
+                 "which is reserved as disambiguation symbol\n";
+    exit 1;
+  }
+  # On lines starting with two integers, replace an <eps> in the third field
+  # by #0; later fields and all other lines are left as they are.
+  s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
+  print;
+}
--- a/utils/fst/make_lexicon_fst.pl
+++ b/utils/fst/make_lexicon_fst.pl
@ -0,0 +1,154 @@
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2010-2011  Microsoft Corporation
+#                2013  Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
+
+# Command-line handling.  $pron_probs records whether each lexicon line
+# carries a pronunciation probability right after the word.
+$pron_probs = 0;
+
+# The optional flag must be the first argument; consume it when present.
+if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
+  $pron_probs = 1;
+  shift @ARGV;
+}
+
+# Accepted positional forms: lexicon only (1), lexicon + silprob + silphone (3),
+# or the same plus a silence disambiguation symbol (4).  Anything else prints
+# the usage text to stderr and exits with status 1.
+if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
+  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
+  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
+  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
+  print STDERR "  word phone1 phone2 ... phoneN;\n";
+  print STDERR "if the --pron-probs option is used, each line is:\n";
+  print STDERR "  word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
+  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
+  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
+  print STDERR "this is your responsibility.\n\n";
+  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
+  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
+  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
+  exit(1);
+}
+
+# First positional argument is the lexicon file; what remains selects the
+# silence configuration.
+$lexfn = shift @ARGV;
+if (@ARGV == 0) {
+  # No silence arguments: optional silence is disabled.
+  $silprob = 0.0;
+} elsif (@ARGV == 2) {
+  ($silprob,$silphone) = @ARGV;
+} else {
+  ($silprob,$silphone,$sildisambig) = @ARGV;
+}
+# Convert the silence probability into negative-log costs: one for taking
+# the silence arc, one for skipping it.
+if ($silprob != 0.0) {
+  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
+  $silcost = -log($silprob);
+  $nosilcost = -log(1.0 - $silprob);
+}
+
+
+# The lexicon is read through the bareword handle L by the generation code.
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+
+# Emit the lexicon FST in text form on stdout.  Arc lines are
+# "src<TAB>dst<TAB>phone<TAB>word-or-<eps>[<TAB>cost]"; the last line printed
+# is the final state with cost 0.  Two layouts exist, selected by $silprob.
+if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
+  $loopstate = 0;
+  $nextstate = 1;               # next unallocated state.
+  while (<L>) {
+    @A = split(" ", $_);
+    @A == 0 && die "Empty lexicon line.";
+    # <eps> may not appear anywhere on a lexicon line (word or phone).
+    foreach $a (@A) {
+      if ($a eq "<eps>") {
+        die "Bad lexicon line $_ (<eps> is forbidden)";
+      }
+    }
+    $w = shift @A;
+    if (! $pron_probs) {
+      $pron_cost = 0.0;
+    } else {
+      # With --pron-probs the second field is a probability in (0, 1].
+      $pron_prob = shift @A;
+      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
+        die "Bad pronunciation probability in line $_";
+      }
+      $pron_cost = -log($pron_prob);
+    }
+    # A zero cost is omitted from the arc line altogether.
+    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
+
+    # Chain one arc per phone, starting at the loop state; the last phone's
+    # arc returns to the loop state, earlier ones go to freshly numbered states.
+    $s = $loopstate;
+    $word_or_eps = $w;
+    while (@A > 0) {
+      $p = shift @A;
+      if (@A > 0) {
+        $ns = $nextstate++;
+      } else {
+        $ns = $loopstate;
+      }
+      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
+      # The word label is emitted on the first arc only; later arcs output <eps>.
+      $word_or_eps = "<eps>";
+      $pron_cost_string = ""; # so we only print it on the first arc of the word.
+      $s = $ns;
+    }
+  }
+  print "$loopstate\t0\n";      # final-cost.
+} else {                        # have silence probs.
+  # Fixed states: 0 = start, 1 = loop (also final), 2 = state reached after a
+  # word when silence is chosen, 3 = disambiguation state (only allocated when
+  # a silence disambiguation symbol was given).
+  $startstate = 0;
+  $loopstate = 1;
+  $silstate = 2;   # state from where we go to loopstate after emitting silence.
+  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
+  if (!defined $sildisambig) {
+    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+    print "$silstate\t$loopstate\t$silphone\t<eps>\n";             # no cost.
+    $nextstate = 3;
+  } else {
+    # Silence arcs are routed through $disambigstate so that the
+    # disambiguation symbol is consumed right after the silence phone.
+    $disambigstate = 3;
+    $nextstate = 4;
+    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
+    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
+  }
+  # NOTE(review): unlike the no-silence branch, this loop does not reject
+  # empty lines or <eps> entries.
+  while (<L>) {
+    @A = split(" ", $_);
+    $w = shift @A;
+    if (! $pron_probs) {
+      $pron_cost = 0.0;
+    } else {
+      $pron_prob = shift @A;
+      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
+        die "Bad pronunciation probability in line $_";
+      }
+      $pron_cost = -log($pron_prob);
+    }
+    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
+    $s = $loopstate;
+    $word_or_eps = $w;
+    while (@A > 0) {
+      $p = shift @A;
+      if (@A > 0) {
+        # Non-final phone: plain arc to a newly allocated state.
+        $ns = $nextstate++;
+        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
+        $word_or_eps = "<eps>";
+        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
+        $s = $ns;
+      } elsif (!defined($silphone) || $p ne $silphone) {
+        # Final phone that is not the silence phone: emit two arcs, one
+        # straight back to the loop state and one to the silence state.
+        # For a single-phone word $pron_cost is still unspent here, so it is
+        # added to both arc costs.
+        # This is non-deterministic but relatively compact,
+        # and avoids epsilons.
+        $local_nosilcost = $nosilcost + $pron_cost;
+        $local_silcost = $silcost + $pron_cost;
+        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
+        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
+      } else {
+        # no point putting opt-sil after silence word.
+        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
+      }
+    }
+  }
+  print "$loopstate\t0\n";      # final-cost.
+}
--- a/utils/fst/make_tlg.sh
+++ b/utils/fst/make_tlg.sh
@ -0,0 +1,49 @@
+#!/bin/bash
+
+if [ -f path.sh ]; then . path.sh; fi
+
+# Build G.fst from an ARPA language model and compose it with the existing
+# L.fst and T.fst into the TLG decoding graph.
+#   $1 lm_dir:   directory containing lm.arpa
+#   $2 src_lang: lang directory providing words.txt, L.fst and T.fst
+#   $3 tgt_lang: output directory; it is removed and recreated as a copy of
+#                src_lang before G.fst, LG.fst and TLG.fst are written into it
+# Without all three arguments the paths below would silently resolve
+# relative to "/", so refuse to run.
+if [ $# -ne 3 ]; then
+    echo "Usage: $0 <lm_dir> <src_lang> <tgt_lang>" >&2
+    exit 1
+fi
+
+lm_dir=$1
+src_lang=$2
+tgt_lang=$3
+
+arpa_lm=${lm_dir}/lm.arpa
+[ ! -f "$arpa_lm" ] && { echo "No such file $arpa_lm"; exit 1;}
+
+rm -rf "$tgt_lang"
+cp -r "$src_lang" "$tgt_lang" || exit 1;
+
+# Compose the language model to FST
+# grep -i (--ignore-case): ignore upper/lower case distinctions.
+# grep -v (--invert-match): select the lines that do NOT match.
+# The grep filters drop "illegal" sequences of the start and end-of-sentence
+# symbols as well as <unk> / <spoken_noise> entries before arpa2fst runs.
+# arpa2fst: converts the ARPA LM to an FST using the words.txt symbol table.
+# eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
+# s2eps.pl: replaces <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
+# G.fst, the disambiguation symbol #0 only appears on the input side
+# eps2disambig.pl and s2eps.pl are presumably needed for the following `fstrmepsilon`.
+cat "$arpa_lm" | \
+   grep -v '<s> <s>' | \
+   grep -v '</s> <s>' | \
+   grep -v '</s> </s>' | \
+   grep -v -i '<unk>' | \
+   grep -v -i '<spoken_noise>' | \
+   arpa2fst --read-symbol-table="$tgt_lang/words.txt" --keep-symbols=true - | fstprint | \
+   utils/fst/eps2disambig.pl | utils/fst/s2eps.pl | fstcompile --isymbols="$tgt_lang/words.txt" \
+     --osymbols="$tgt_lang/words.txt"  --keep_isymbols=false --keep_osymbols=false | \
+    fstrmepsilon | fstarcsort --sort_type=ilabel > "$tgt_lang/G.fst" || exit 1;
+
+
+echo  "Checking how stochastic G is (the first of these numbers should be small):"
+# Informational only: the exit status is deliberately not checked.
+fstisstochastic "$tgt_lang/G.fst"
+
+# Compose the token, lexicon and language-model FST into the final decoding graph
+# minimization: the same as minimization algorithm that applies to weighted acceptors;
+#               the only change relevant here is that it avoids pushing weights,
+#               hence preserving stochasticity
+fsttablecompose "$tgt_lang/L.fst" "$tgt_lang/G.fst" | fstdeterminizestar --use-log=true | \
+    fstminimizeencoded | fstarcsort --sort_type=ilabel > "$tgt_lang/LG.fst" || exit 1;
+fsttablecompose "$tgt_lang/T.fst" "$tgt_lang/LG.fst" > "$tgt_lang/TLG.fst" || exit 1;
+
+echo "Composing decoding graph TLG.fst succeeded"
+# NOTE: LG.fst is an intermediate FST; it is intentionally kept in $tgt_lang.
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+import argparse
+
+
+def main(args):
+    """Convert a raw lexicon into a `word unit0 ... unitN` lexicon.
+
+    Reads the model units from `args.unit_file` (one per line), then walks
+    `args.in_lexicon` and writes one line per kept word to
+    `args.out_lexicon`. Only the first field of each input line (the word) is
+    used; its original pronunciation is discarded. When `args.bpemodel` is
+    set the word is split with that sentencepiece model, otherwise it is
+    split into characters. Words containing a unit that is not in the unit
+    table are skipped and reported on stdout.
+
+    Args:
+        args: Namespace with `unit_file`, `in_lexicon`, `out_lexicon` and
+            `bpemodel` (path or None) attributes.
+    """
+    # load `unit` or `vocab` file
+    # utf-8 is requested explicitly: the files hold `▁` and non-ASCII units.
+    unit_table = set()
+    with open(args.unit_file, 'r', encoding='utf-8') as fin:
+        for line in fin:
+            unit_table.add(line.strip())
+
+    def contain_oov(units):
+        """Return True if any element of `units` is not a known model unit."""
+        return any(unit not in unit_table for unit in units)
+
+    # load spm model
+    bpemode = args.bpemodel
+    if bpemode:
+        import sentencepiece as spm
+        sp = spm.SentencePieceProcessor()
+        # The model path comes from the parsed arguments (was `sys.bpemodel`,
+        # a NameError since `sys` is not imported and has no such attribute).
+        sp.Load(args.bpemodel)
+
+    # used to filter polyphone
+    lexicon_table = set()
+    with open(args.in_lexicon, 'r', encoding='utf-8') as fin, \
+            open(args.out_lexicon, 'w', encoding='utf-8') as fout:
+        for line in fin:
+            fields = line.split()
+            if not fields:
+                # blank line: nothing to convert
+                continue
+            raw_word = fields[0]
+            word = raw_word
+            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
+                continue
+            if word == '<SPOKEN_NOISE>':
+                continue
+
+            # each word only has one pronunciation for e2e system
+            if word in lexicon_table:
+                continue
+
+            if bpemode:
+                pieces = sp.EncodeAsPieces(word)
+                if contain_oov(pieces):
+                    print('Ignoring words {}, which contains oov unit'.
+                          format(''.join(word).strip('▁')))
+                    continue
+
+                # every piece is a known unit at this point
+                chars = ' '.join(pieces)
+            else:
+                # ignore words with OOV
+                if contain_oov(word):
+                    print('Ignoring words {}, which contains oov unit'.
+                          format(word))
+                    continue
+
+                # Optional, append ▁ in front of english word
+                # we assume the model unit of our e2e system is char now.
+                if word.encode('utf8').isalpha() and '▁' in unit_table:
+                    word = '▁' + word
+                chars = ' '.join(word)  # word is a char list
+
+            fout.write('{} {}\n'.format(word, chars))
+            # Remember the word as read too: the duplicate check above sees
+            # the raw word, so storing only the `▁`-prefixed form would let
+            # repeated English entries through.
+            lexicon_table.add(raw_word)
+            lexicon_table.add(word)
+
+
+if __name__ == '__main__':
+    # CLI: three mandatory file paths plus an optional sentencepiece model.
+    cli = argparse.ArgumentParser(
+        description='FST: preprae e2e(char/spm) dict')
+    cli.add_argument(
+        '--unit_file',
+        help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices',
+        required=True)
+    cli.add_argument(
+        '--in_lexicon',
+        help='raw lexicon file. line: word ph0 ... phn',
+        required=True)
+    cli.add_argument(
+        '--out_lexicon',
+        help='output lexicon file. line: word char0 ... charn',
+        required=True)
+    cli.add_argument('--bpemodel', help='bpemodel', default=None)
+
+    args = cli.parse_args()
+    # Echo the parsed options before doing any work.
+    print(args)
+
+    main(args)
--- a/utils/fst/remove_oovs.pl
+++ b/utils/fst/remove_oovs.pl
@ -0,0 +1,42 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script removes lines that contain these OOVs on either the
+# third or fourth fields  of the line.  It is intended to remove arcs
+# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
+
+# Exactly one or two arguments are accepted: the OOV list and, optionally,
+# the printed FST (read from stdin when omitted).  The original test used
+# "&&", which can never be true, so the usage message was unreachable.
+if (  @ARGV < 1 || @ARGV > 2) {
+    die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
+}
+
+# Load the unknown-symbol list: one symbol per line, stored as hash keys.
+$unklist = shift @ARGV;
+open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
+while(<S>){
+    @A = split(" ", $_);
+    @A == 1 || die "Bad line in unknown-symbol list: $_";
+    $unk{$A[0]} = 1;
+}
+
+# Copy the printed FST through, dropping every line whose third or fourth
+# field (input / output label) is one of the listed symbols.
+$num_removed = 0;
+while(<>){
+    @A = split(" ", $_);
+    if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
+        $num_removed++;
+    } else {
+        print;
+    }
+}
+print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
--- a/utils/fst/rnnt_token_fst.py
+++ b/utils/fst/rnnt_token_fst.py
@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+import argparse
+
+
+def main(args):
+    """Print a one-state RNN-T token FST in text form on stdout.
+
+    Every arc is a self-loop on state 0, which is also the final state.
+    Tokens are the first space-separated field of each line in
+    `args.token_file`.
+    """
+    arc_template = '{} {} {} {}'
+
+    # <blank> is consumed without producing any output symbol
+    print('0 0 <blank> <eps>')
+
+    with open(args.token_file, 'r') as token_fin:
+        for raw_line in token_fin:
+            token = raw_line.strip().split(' ')[0]
+            if token in ('<eps>', '<blank>'):
+                # already handled above / not a real token
+                continue
+            # tokens containing `#` are emitted without consuming any input;
+            # every other token is consumed and emitted as itself
+            in_label = '<eps>' if '#' in token else token
+            print(arc_template.format(0, 0, in_label, token))
+
+    # final state
+    print('0')
+
+
+if __name__ == "__main__":
+    # Command-line entry point: the token file is the only (mandatory) option.
+    cli = argparse.ArgumentParser(
+        description='FST: RNN-T Token FST transducer')
+    cli.add_argument(
+        '--token_file',
+        help='e2e model token file. line: token(char/phone/spm/disambigous)',
+        required=True)
+    args = cli.parse_args()
+
+    main(args)
--- a/utils/fst/s2eps.pl
+++ b/utils/fst/s2eps.pl
@ -0,0 +1,27 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces <s> and </s> with <eps> (on both input and output sides),
+# for the G.fst acceptor.
+
+# Rewrite a printed FST line by line; fields are re-joined with tabs.
+while (my $line = <>) {
+    my @fields = split(" ", $line);
+    # Only arc lines (at least four fields) carry labels in columns 3 and 4.
+    if (@fields >= 4) {
+        foreach my $col (2, 3) {
+            if ($fields[$col] eq "<s>" || $fields[$col] eq "</s>") {
+                $fields[$col] = "<eps>";
+            }
+        }
+    }
+    print join("\t", @fields) . "\n";
+}
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+"""Manifest file to key-value files."""
+import argparse
+import functools
+from pathlib import Path
+
+from utils.utility import add_arguments
+from utils.utility import print_arguments
+from utils.utility import read_manifest
+
+
+def main(args):
+    """Dump a manifest into `wav.scp`, `duration` and `text` key-value files.
+
+    Each manifest entry contributes one `<utt> <value>` line per file; the
+    files are written under `args.output_path`, which is created if missing.
+
+    Args:
+        args: Namespace with `manifest_path` and `output_path` attributes.
+    """
+    print_arguments(args, globals())
+
+    count = 0
+
+    outdir = Path(args.output_path)
+    # Opening the files below fails when the directory does not exist yet.
+    outdir.mkdir(parents=True, exist_ok=True)
+    wav_scp = outdir / 'wav.scp'
+    dur_scp = outdir / 'duration'
+    text_scp = outdir / 'text'
+
+    manifest_jsons = read_manifest(args.manifest_path)
+
+    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
+            'w') as ftxt:
+        for line_json in manifest_jsons:
+            utt = line_json['utt']
+            feat = line_json['feat']
+            text = line_json['text']
+            # the first entry of `feat_shape` is what gets written as duration
+            dur = line_json['feat_shape'][0]
+
+            # only entries whose feature path ends in .wav are listed in wav.scp
+            if Path(feat).suffix == '.wav':
+                fwav.write(f"{utt} {feat}\n")
+            fdur.write(f"{utt} {dur}\n")
+            ftxt.write(f"{utt} {text}\n")
+
+            count += 1
+
+    print(f"Examples number: {count}")
+
+
+if __name__ == '__main__':
+    # Both options are registered through `add_arguments`, i.e. as `--name`
+    # flags with the given type, default and help text.
+    parser = argparse.ArgumentParser(description=__doc__)
+    add_arg = functools.partial(add_arguments, argparser=parser)
+    # yapf: disable
+    add_arg('manifest_path',    str,
+            'data/librispeech/manifest.train',
+            "Filepath of manifest to convert to key-value files.")
+    add_arg('output_path',    str,
+            'data/train',
+            "dir path to dump wav.scp/duration/text files.")
+    # yapf: enable
+    args = parser.parse_args()
+
+    main(args)
--- a/utils/ngram_train.sh
+++ b/utils/ngram_train.sh
@ -22,7 +22,7 @@ lmbin=${2}.klm.bin

 # https://kheafield.com/code/kenlm/estimation/
 echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }

 # https://kheafield.com/code/kenlm/
 echo "build binary lm."
--- a/utils/utility.py
+++ b/utils/utility.py
@ -11,19 +11,95 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import hashlib
+import json
 import os
 import tarfile
 import zipfile
 from typing import Text

-from paddle.dataset.common import md5file
-
 __all__ = [
    "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
 ]


+def read_manifest(manifest_path):
+    """Load and parse manifest file.
+
+    Args:
+        manifest_path (str): Manifest file to load; every line is parsed as
+            one JSON document.
+
+    Raises:
+        IOError: If a line of the manifest cannot be parsed.
+
+    Returns:
+        list: The parsed value of every line, in file order.
+    """
+
+    manifest = []
+    # `with` closes the handle even when a line fails to parse.
+    with open(manifest_path, 'r') as fin:
+        for json_line in fin:
+            try:
+                json_data = json.loads(json_line)
+            except Exception as e:
+                raise IOError("Error reading manifest: %s" % str(e)) from e
+            # Collect the parsed entry; without this the result is always empty.
+            manifest.append(json_data)
+    return manifest
+
+
+def print_arguments(args, info=None):
+    """Print every parsed argument as a `name: value` line, sorted by name.
+
+    The listing is framed by a header and a footer line; the header carries
+    the base name of the calling script when `info` is supplied.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="Jonh", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    :param info: Optional mapping with a `__file__` entry (e.g. `globals()`).
+    :type info: dict
+    """
+    script_path = info["__file__"] if info else ""
+    script_name = os.path.basename(script_path)
+    print(f"----------- {script_name} Configuration Arguments -----------")
+    options = vars(args)
+    for name in sorted(options):
+        print(name + ": " + str(options[name]))
+    print("-----------------------------------------------------------")
+
+
+def _strtobool(value):
+    """Map a truthy/falsy string to 1 or 0, like `distutils.util.strtobool`."""
+    lowered = value.lower()
+    if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
+        return 1
+    if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
+        return 0
+    raise ValueError("invalid truth value %r" % (lowered, ))
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Registers `--<argname>` on `argparser`. `bool` arguments are parsed from
+    strings such as `true`/`false`/`yes`/`no`/`1`/`0` (yielding 1 or 0);
+    any other `type` is passed to argparse unchanged.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "Jonh", "User name.", parser)
+        args = parser.parse_args()
+
+    :param argname: Option name without the leading `--`.
+    :param type: Callable converting the command-line string to a value.
+    :param default: Value used when the option is absent.
+    :param help: Help text; the default value is appended to it.
+    :param argparser: Parser to register the option on.
+    :param kwargs: Extra keyword arguments forwarded to `add_argument`.
+    """
+    # `distutils` is neither imported by this module nor available on
+    # Python >= 3.12, so booleans are parsed by a local equivalent.
+    type = _strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+def md5file(fname):
+    """Return the hexadecimal MD5 digest of the file at `fname`.
+
+    The file is read in 4096-byte chunks, so it never has to fit in memory.
+    """
+    hash_md5 = hashlib.md5()
+    # `with` guarantees the handle is closed even if a read fails.
+    with open(fname, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+
 def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename."""
    directory, filename = os.path.split(path)