TLG build pass

pull/729/head
Hui Zhang 3 years ago
parent f5369abdbe
commit 104743cccc

@@ -17,7 +17,6 @@ import os
import socket
import sys
import auto_log
from paddle import inference
FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
@@ -156,6 +155,7 @@ class Autolog:
                 batch_size,
                 model_name="DeepSpeech",
                 model_precision="fp32"):
        import auto_log

        pid = os.getpid()
        gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
        infer_config = inference.Config()

@@ -0,0 +1,58 @@
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
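# Illustration (not part of the recipe): given a transcript line such as
#   BAC009S0002W0122 今天 天气 foobar
# every field missing from the lexicon, including the utterance id itself, is
# rewritten to <SPOKEN_NOISE>; the id column is discarded by the later awk
# commands, which start from field 2.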
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one count
# for each word in the lexicon (but not silence, as we don't want it
# in the LM; we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
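# unigram.counts now holds "<count> <word>" lines sorted by descending count;
# thanks to the add-one pass above, every lexicon word has a count of at least 1.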
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
heldout_sent=10000 # Don't change this if you want results to be comparable
                   # with kaldi_lm results.
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
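# ngram-count below trains an interpolated, modified Kneser-Ney trigram
# (-order 3 -kndiscount -interpolate), restricting counts to $dir/wordlist
# (-limit-vocab -vocab) and mapping OOVs to <UNK> (-unk -map-unk);
# `ngram -ppl` then reports perplexity on the held-out set.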
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout

@@ -0,0 +1,52 @@
#!/bin/bash
set -eo pipefail
stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm
source utils/parse_options.sh
data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp $unit_file data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file $unit_file \
        --in_lexicon ${lexicon} \
        --out_lexicon data/local/dict/lexicon.txt
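    # Note (based on utils/fst/prepare_dict.py below): only lexicon entries
    # whose units all appear in units.txt are kept; words with OOV units are
    # assumed to be skipped.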
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p $lm
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        $text > $lm/text
    if [ $lmtype == 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
    fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # 7.3 Build decoding TLG
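    # TLG.fst is the composition of the token FST (T), the lexicon FST (L)
    # and the n-gram grammar FST (G), as in WeNet/EESEN-style CTC decoding.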
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "Aishell build TLG done."
exit 0

@@ -4,11 +4,24 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export LIBLBFGS=/workspace/zhanghui/asr/wenet-210713/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=/workspace/zhanghui/asr/wenet-210713/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
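# Sanity check (illustrative): after sourcing this file, `which ngram-count`
# should resolve under ${SRILM}/bin (or ${SRILM}/bin/i686-m64).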
# Kaldi
export KALDI_ROOT=/workspace/zhanghui/asr/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
. $KALDI_ROOT/tools/config/common_path.sh

@@ -38,43 +38,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi
# Optionally, you can add an LM and test it with the runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp $unit_file data/local/dict/units.txt
    utils/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \
        data/local/dict/lexicon.txt
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p $lm
    utils/filter_scp.pl data/train/text \
        $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
    local/aishell_train_lms.sh
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
    # # 7.4 Decoding with runtime
    # # reverse_weight only works for u2++ models; when it is set to 0.0, only
    # # the left-to-right decoder is used.
    # dir=exp/conformer
    # reverse_weight=0.0
    # chunk_size=-1
    # ./tools/decode.sh --nj 16 \
    #     --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
    #     --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
    #     --reverse_weight $reverse_weight --chunk_size $chunk_size \
    #     --fst_path data/lang_test/TLG.fst \
    #     data/test/wav.scp data/test/text $dir/final.zip \
    #     data/lang_test/words.txt $dir/lm_with_runtime
    # # See $dir/lm_with_runtime for wer
    # train lm and build TLG
    ./local/tlg.sh --corpus aishell --lmtype srilm
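    # To use the kenlm-based trainer instead, pass any --lmtype other than
    # 'srilm' (local/tlg.sh then falls back to utils/ngram_train.sh), e.g.:
    # ./local/tlg.sh --corpus aishell --lmtype kenlm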
fi

@@ -0,0 +1 @@
../../../utils

@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,87 @@
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;

do {
  $shifted = 0;
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
    $shifted = 1;
  }
  if ($ARGV[0] eq "-f") {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
    $shifted = 1;
  }
} while ($shifted);

if (@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
      "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
      "only the lines that were *not* in id_list.\n" .
      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
      "-f option, add 1 to the argument.\n" .
      "See also: utils/filter_scp.pl .\n";
}

$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while (<F>) {
  @A = split;
  @A >= 1 || die "Invalid id-list file line $_";
  $seen{$A[0]} = 1;
}

if ($field == 1) {  # Treat this as special case, since it is common.
  while (<>) {
    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
    # $1 is what we filter on.
    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
      print $_;
    }
  }
} else {
  while (<>) {
    @A = split;
    @A > 0 || die "Invalid scp file line $_";
    @A >= $field || die "Invalid scp file line $_";
    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
      print $_;
    }
  }
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
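# and with --exclude the same input should print "bar 2":
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl --exclude <(echo foo)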

@@ -57,7 +57,7 @@ cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
# ctc_token_fst_corrected is too big and too slow for character-based Chinese
# modeling, so we just use the simple ctc_token_fst here.
-utils/fst/ctc_token_fst.py $dir/tokens.txt | \
+utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
    fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
    fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

@@ -1,6 +1,5 @@
#!/usr/bin/env python3
import argparse
import sys
def main(args):
@@ -14,7 +13,7 @@ def main(args):
    print('2 0 <eps> <eps>')

    # linking `token` between node 1 and node 2
-    with open(sys.token_file, 'r') as fin:
+    with open(args.token_file, 'r') as fin:
        node = 3
        for entry in fin:
            fields = entry.strip().split(' ')
@@ -46,6 +45,5 @@ if __name__ == '__main__':
        help='e2e model token file. line: token(char/phone/spm/disambiguous)')
    args = parser.parse_args()
    print(args)

    main(args)

@@ -73,8 +73,6 @@ if __name__ == "__main__":
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambiguous)')
    args = parser.parse_args()
    print(args)

    main(args)

@@ -1,13 +1,5 @@
#!/usr/bin/env python3
import argparse
import sys


def contain_oov(units):
    for unit in units:
        if unit not in unit_table:
            return True
    return False


def main(args):
@@ -18,6 +10,12 @@ def main(args):
            unit = line.strip()
            unit_table.add(unit)

    def contain_oov(units):
        for unit in units:
            if unit not in unit_table:
                return True
        return False

    # load spm model
    bpemode = args.bpemodel
    if bpemode:
@@ -27,8 +25,8 @@
    # used to filter polyphone
    lexicon_table = set()
-    with open(sys.in_lexicon, 'r') as fin, \
-            open(sys.out_lexicon, 'w') as fout:
+    with open(args.in_lexicon, 'r') as fin, \
+            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel

@@ -31,8 +31,6 @@ if __name__ == "__main__":
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambiguous)')
    args = parser.parse_args()
    print(args)

    main(args)

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Manifest file to key-value files."""
import argparse
import functools
from pathlib import Path
from utils.utility import add_arguments
from utils.utility import print_arguments
from utils.utility import read_manifest
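# Each manifest line is a JSON object; the fields read in main() below are
# assumed to look roughly like this (illustrative values only):
#   {"utt": "FAKE_UTT_001", "feat": "/path/to/FAKE_UTT_001.wav",
#    "feat_shape": [4.2, 80], "text": "...", "token": ["..."],
#    "token_id": [12, 34], "token_shape": [2, 4233]}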


def main(args):
    print_arguments(args, globals())

    count = 0

    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)
    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # .wav
            text = line_json['text']
            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            feat_dim = feat_shape[1]
            if 'token' in line_json:
                tokens = line_json['token']
                tokenids = line_json['token_id']
                token_shape = line_json['token_shape']
                token_len = token_shape[0]
                vocab_dim = token_shape[1]
            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")
            count += 1

    print(f"Examples number: {count}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of the manifest to convert to key-value files.")
    add_arg('output_path', str,
            'data/train',
            "Directory in which to dump the wav.scp/duration/text files.")
    # yapf: enable
    args = parser.parse_args()

    main(args)

@@ -22,7 +22,7 @@ lmbin=${2}.klm.bin
# https://kheafield.com/code/kenlm/estimation/
echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }
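# lmplz flags: -o sets the n-gram order, -S the memory budget (e.g. "80%"),
# and --prune the minimum-count thresholds; see the estimation link above.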
# https://kheafield.com/code/kenlm/
echo "build binary lm."

@@ -11,19 +11,95 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import distutils.util
import hashlib
import json
import os
import tarfile
import zipfile
from typing import Text

-from paddle.dataset.common import md5file

__all__ = [
    "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
]


def read_manifest(manifest_path):
    """Load and parse manifest file.

    Args:
        manifest_path (str): Manifest file to load and parse.

    Raises:
        IOError: If failed to parse the manifest.

    Returns:
        List[dict]: Manifest parsing results.
    """
    manifest = []
    for json_line in open(manifest_path, 'r'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        manifest.append(json_data)
    return manifest


def print_arguments(args, info=None):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    filename = ""
    if info:
        filename = info["__file__"]
    filename = os.path.basename(filename)
    print(f"----------- {filename} Configuration Arguments -----------")
    for arg, value in sorted(vars(args).items()):
        print("%s: %s" % (arg, value))
    print("-----------------------------------------------------------")


def add_arguments(argname, type, default, help, argparser, **kwargs):
    """Add argparse's argument.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        add_argument("name", str, "John", "User name.", parser)
        args = parser.parse_args()
    """
    type = distutils.util.strtobool if type == bool else type
    argparser.add_argument(
        "--" + argname,
        default=default,
        type=type,
        help=help + ' Default: %(default)s.',
        **kwargs)


def md5file(fname):
    hash_md5 = hashlib.md5()
    f = open(fname, "rb")
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
    f.close()
    return hash_md5.hexdigest()


def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename."""
    directory, filename = os.path.split(path)
