TLG build pass

pull/729/head
Hui Zhang 3 years ago
parent f5369abdbe
commit 104743cccc

@@ -17,7 +17,6 @@ import os
import socket
import sys
-import auto_log
from paddle import inference

FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
@@ -156,6 +155,7 @@ class Autolog:
                 batch_size,
                 model_name="DeepSpeech",
                 model_precision="fp32"):
+        import auto_log
        pid = os.getpid()
        gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
        infer_config = inference.Config()

@@ -0,0 +1,58 @@
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
    [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
    echo "SRILM tools not found; please download and install them from:"
    echo "http://www.speech.sri.com/projects/srilm/download.html"
    echo "Then add the tools to your PATH."
    exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
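# Map words that are not in the lexicon to <SPOKEN_NOISE>, so the LM training text contains no OOVs.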
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
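# The LM vocabulary: every word observed above plus the sentence-boundary symbols.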
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
heldout_sent=10000 # Don't change this if you want results comparable
# with kaldi_lm results.
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
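A quick sanity check of the artifacts this script leaves in data/local/lm (a sketch using only the files and tools referenced above; not part of the committed recipe):

    dir=data/local/lm
    wc -l $dir/wordlist                        # vocabulary size handed to ngram-count
    grep -c '<SPOKEN_NOISE>' $dir/text.no_oov  # lines where an OOV word was mapped
    ngram -lm $dir/lm.arpa -ppl $dir/heldout   # held-out perplexity, as in the last line above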

@@ -0,0 +1,52 @@
#!/bin/bash
set -eo pipefail
stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm
source utils/parse_options.sh
data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp $unit_file data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file $unit_file \
        --in_lexicon ${lexicon} \
        --out_lexicon data/local/dict/lexicon.txt
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p $lm
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        $text > $lm/text
    if [ $lmtype == 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
    fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "Aishell build TLG done."
exit 0
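Since stage, stop_stage, corpus and lmtype are all defined before utils/parse_options.sh is sourced, they should be settable as flags; a typical standalone invocation (assumed, not shown in the commit) would be:

    bash local/tlg.sh --stage 0 --stop_stage 2 --corpus aishell --lmtype srilm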

@@ -4,11 +4,24 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
+export LIBLBFGS=/workspace/zhanghui/asr/wenet-210713/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=/workspace/zhanghui/asr/wenet-210713/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
+# Kaldi
+export KALDI_ROOT=/workspace/zhanghui/asr/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
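After sourcing this path.sh, the SRILM and OpenFST binaries used by the scripts in this commit should resolve; a minimal check (tool names taken from the scripts above and below, not part of the commit):

    . ./path.sh
    for tool in ngram-count ngram fstcompile fstarcsort; do
        which $tool > /dev/null || echo "$tool not found on PATH"
    done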

@@ -38,43 +38,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi

# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-    # 7.1 Prepare dict
-    unit_file=data/vocab.txt
-    mkdir -p data/local/dict
-    cp $unit_file data/local/dict/units.txt
-    utils/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \
-        data/local/dict/lexicon.txt
-    # 7.2 Train lm
-    lm=data/local/lm
-    mkdir -p $lm
-    utils/filter_scp.pl data/train/text \
-        $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
-    local/aishell_train_lms.sh
-    # 7.3 Build decoding TLG
-    utils/fst/compile_lexicon_token_fst.sh \
-        data/local/dict data/local/tmp data/local/lang
-    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
-    # # 7.4 Decoding with runtime
-    # # reverse_weight only works for u2++ model and only left to right decoder is used when it is set to 0.0.
-    # dir=exp/conformer
-    # reverse_weight=0.0
-    # chunk_size=-1
-    # ./tools/decode.sh --nj 16 \
-    #     --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
-    #     --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
-    #     --reverse_weight $reverse_weight --chunk_size $chunk_size \
-    #     --fst_path data/lang_test/TLG.fst \
-    #     data/test/wav.scp data/test/text $dir/final.zip \
-    #     data/lang_test/words.txt $dir/lm_with_runtime
-    # # See $dir/lm_with_runtime for wer
+    # train lm and build TLG
+    ./local/tlg.sh --corpus aishell --lmtype srilm
fi
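Assuming run.sh parses stage flags the same way local/tlg.sh does (the option-parsing code is outside this hunk), the new TLG stage can be run on its own with:

    bash run.sh --stage 6 --stop_stage 6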

@@ -0,0 +1 @@
../../../utils

@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,87 @@
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;
do {
    $shifted=0;
    if ($ARGV[0] eq "--exclude") {
        $exclude = 1;
        shift @ARGV;
        $shifted=1;
    }
    if ($ARGV[0] eq "-f") {
        $field = $ARGV[1];
        shift @ARGV; shift @ARGV;
        $shifted=1;
    }
} while ($shifted);
if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
        "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
        "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
        "only the lines that were *not* in id_list.\n" .
        "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
        "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
        "-f option, add 1 to the argument.\n" .
        "See also: utils/filter_scp.pl .\n";
}
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while (<F>) {
    @A = split;
    @A >= 1 || die "Invalid id-list file line $_";
    $seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
    while (<>) {
        $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
        # $1 is what we filter on.
        if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
            print $_;
        }
    }
} else {
    while (<>) {
        @A = split;
        @A > 0 || die "Invalid scp file line $_";
        @A >= $field || die "Invalid scp file line $_";
        if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
            print $_;
        }
    }
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)

@@ -57,7 +57,7 @@ cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
# ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
# so here just use simple ctc_token_fst
-utils/fst/ctc_token_fst.py $dir/tokens.txt | \
+utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
    fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
    fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import sys
def main(args): def main(args):
@ -14,7 +13,7 @@ def main(args):
print('2 0 <eps> <eps>') print('2 0 <eps> <eps>')
# linking `token` between node 1 and node 2 # linking `token` between node 1 and node 2
with open(sys.token_file, 'r') as fin: with open(args.token_file, 'r') as fin:
node = 3 node = 3
for entry in fin: for entry in fin:
fields = entry.strip().split(' ') fields = entry.strip().split(' ')
@ -46,6 +45,5 @@ if __name__ == '__main__':
help='e2e model token file. line: token(char/phone/spm/disambigous)') help='e2e model token file. line: token(char/phone/spm/disambigous)')
args = parser.parse_args() args = parser.parse_args()
print(args)
main(args) main(args)

@@ -73,8 +73,6 @@ if __name__ == "__main__":
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
-    print(args)
    main(args)

@@ -1,13 +1,5 @@
#!/usr/bin/env python3
import argparse
-import sys
-
-def contain_oov(units):
-    for unit in units:
-        if unit not in unit_table:
-            return True
-    return False

def main(args):
@@ -18,6 +10,12 @@ def main(args):
        unit = line.strip()
        unit_table.add(unit)

+    def contain_oov(units):
+        for unit in units:
+            if unit not in unit_table:
+                return True
+        return False
+
    # load spm model
    bpemode = args.bpemodel
    if bpemode:
@@ -27,8 +25,8 @@ def main(args):
    # used to filter polyphone
    lexicon_table = set()
-    with open(sys.in_lexicon, 'r') as fin, \
-            open(sys.out_lexicon, 'w') as fout:
+    with open(args.in_lexicon, 'r') as fin, \
+            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel

@@ -31,8 +31,6 @@ if __name__ == "__main__":
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
-    print(args)
    main(args)

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Manifest file to key-value files."""
import argparse
import functools
from pathlib import Path
from utils.utility import add_arguments
from utils.utility import print_arguments
from utils.utility import read_manifest
def main(args):
    print_arguments(args, globals())

    count = 0

    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # .wav
            text = line_json['text']
            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            feat_dim = feat_shape[1]
            if 'token' in line_json:
                tokens = line_json['token']
                tokenids = line_json['token_id']
                token_shape = line_json['token_shape']
                token_len = token_shape[0]
                vocab_dim = token_shape[1]

            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of the manifest to convert.")
    add_arg('output_path', str,
            'data/train',
            "Directory path to dump the wav.scp/duration/text files.")
    # yapf: enable
    args = parser.parse_args()
    main(args)
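This mirrors the call in local/tlg.sh above; run standalone it would look like:

    python3 utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    # writes data/train/wav.scp, data/train/duration and data/train/text,
    # one "utt value" pair per line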

@@ -22,7 +22,7 @@ lmbin=${2}.klm.bin
# https://kheafield.com/code/kenlm/estimation/
echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }
# https://kheafield.com/code/kenlm/
echo "build binary lm."

@@ -11,19 +11,95 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import distutils.util
+import hashlib
+import json
import os
import tarfile
import zipfile
from typing import Text

-from paddle.dataset.common import md5file

__all__ = [
    "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
]
+def read_manifest(manifest_path):
+    """Load and parse manifest file.
+
+    Args:
+        manifest_path ([type]): Manifest file to load and parse.
+
+    Raises:
+        IOError: If failed to parse the manifest.
+
+    Returns:
+        List[dict]: Manifest parsing results.
+    """
+    manifest = []
+    for json_line in open(manifest_path, 'r'):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        manifest.append(json_data)  # keep every parsed entry
+    return manifest
+
+
+def print_arguments(args, info=None):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    filename = ""
+    if info:
+        filename = info["__file__"]
+    filename = os.path.basename(filename)
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()

def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename."""
    directory, filename = os.path.split(path)
