TLG build pass

pull/729/head
Hui Zhang 3 years ago
parent f5369abdbe
commit 104743cccc

@@ -17,7 +17,6 @@ import os
import socket
import sys
-import auto_log
from paddle import inference

FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
@@ -156,6 +155,7 @@ class Autolog:
                 batch_size,
                 model_name="DeepSpeech",
                 model_precision="fp32"):
+        import auto_log
        pid = os.getpid()
        gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
        infer_config = inference.Config()

@@ -0,0 +1,58 @@
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
    [ ! -f "$f" ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
    echo "SRILM tools not found; please download and install them from:"
    echo "http://www.speech.sri.com/projects/srilm/download.html"
    echo "Then add the tools to your PATH."
    exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
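# Map words that are not in the lexicon to <SPOKEN_NOISE>, so the LM training text contains no OOVs.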
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
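# The LM vocabulary: every word observed above plus the sentence-boundary symbols.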
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
heldout_sent=10000 # Don't change this if you want results comparable
# with kaldi_lm results.
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
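A quick sanity check of the artifacts this script leaves in data/local/lm (a sketch using only the files and tools referenced above; not part of the committed recipe):

    dir=data/local/lm
    wc -l $dir/wordlist                        # vocabulary size handed to ngram-count
    grep -c '<SPOKEN_NOISE>' $dir/text.no_oov  # lines where an OOV word was mapped
    ngram -lm $dir/lm.arpa -ppl $dir/heldout   # held-out perplexity, as in the last line above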

@@ -0,0 +1,52 @@
#!/bin/bash
set -eo pipefail
stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm
source utils/parse_options.sh
data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp $unit_file data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file $unit_file \
        --in_lexicon ${lexicon} \
        --out_lexicon data/local/dict/lexicon.txt
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p $lm
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        $text > $lm/text
    if [ $lmtype == 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
    fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "Aishell build TLG done."
exit 0
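Since stage, stop_stage, corpus and lmtype are all defined before utils/parse_options.sh is sourced, they should be settable as flags; a typical standalone invocation (assumed, not shown in the commit) would be:

    bash local/tlg.sh --stage 0 --stop_stage 2 --corpus aishell --lmtype srilm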

@@ -4,11 +4,24 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
+export LIBLBFGS=/workspace/zhanghui/asr/wenet-210713/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=/workspace/zhanghui/asr/wenet-210713/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
+# Kaldi
+export KALDI_ROOT=/workspace/zhanghui/asr/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
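After sourcing this path.sh, the SRILM and OpenFST binaries used by the scripts in this commit should resolve; a minimal check (tool names taken from the scripts above and below, not part of the commit):

    . ./path.sh
    for tool in ngram-count ngram fstcompile fstarcsort; do
        which $tool > /dev/null || echo "$tool not found on PATH"
    done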

@@ -38,43 +38,13 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

-# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-#     # export ckpt avg_n
-#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-# fi
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi

# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-    # 7.1 Prepare dict
-    unit_file=data/vocab.txt
-    mkdir -p data/local/dict
-    cp $unit_file data/local/dict/units.txt
-    utils/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \
-        data/local/dict/lexicon.txt
-    # 7.2 Train lm
-    lm=data/local/lm
-    mkdir -p $lm
-    utils/filter_scp.pl data/train/text \
-        $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
-    local/aishell_train_lms.sh
-    # 7.3 Build decoding TLG
-    utils/fst/compile_lexicon_token_fst.sh \
-        data/local/dict data/local/tmp data/local/lang
-    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
-    # # 7.4 Decoding with runtime
-    # # reverse_weight only works for u2++ model and only left to right decoder is used when it is set to 0.0.
-    # dir=exp/conformer
-    # reverse_weight=0.0
-    # chunk_size=-1
-    # ./tools/decode.sh --nj 16 \
-    #     --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
-    #     --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
-    #     --reverse_weight $reverse_weight --chunk_size $chunk_size \
-    #     --fst_path data/lang_test/TLG.fst \
-    #     data/test/wav.scp data/test/text $dir/final.zip \
-    #     data/lang_test/words.txt $dir/lm_with_runtime
-    # # See $dir/lm_with_runtime for wer
+    # train lm and build TLG
+    ./local/tlg.sh --corpus aishell --lmtype srilm
fi
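Assuming run.sh parses stage flags the same way local/tlg.sh does (the option-parsing code is outside this hunk), the new TLG stage can be run on its own with:

    bash run.sh --stage 6 --stop_stage 6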

@@ -0,0 +1 @@
../../../utils

@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,87 @@
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;
do {
    $shifted=0;
    if ($ARGV[0] eq "--exclude") {
        $exclude = 1;
        shift @ARGV;
        $shifted=1;
    }
    if ($ARGV[0] eq "-f") {
        $field = $ARGV[1];
        shift @ARGV; shift @ARGV;
        $shifted=1;
    }
} while ($shifted);
if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
        "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
        "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
        "only the lines that were *not* in id_list.\n" .
        "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
        "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
        "-f option, add 1 to the argument.\n" .
        "See also: utils/filter_scp.pl .\n";
}
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while (<F>) {
    @A = split;
    @A >= 1 || die "Invalid id-list file line $_";
    $seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
    while (<>) {
        $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
        # $1 is what we filter on.
        if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
            print $_;
        }
    }
} else {
    while (<>) {
        @A = split;
        @A > 0 || die "Invalid scp file line $_";
        @A >= $field || die "Invalid scp file line $_";
        if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
            print $_;
        }
    }
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)

@@ -57,7 +57,7 @@ cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
# ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
# so here just use simple ctc_token_fst
-utils/fst/ctc_token_fst.py $dir/tokens.txt | \
+utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
    fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
    fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

@ -1,6 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import sys
def main(args): def main(args):
@ -14,7 +13,7 @@ def main(args):
print('2 0 <eps> <eps>') print('2 0 <eps> <eps>')
# linking `token` between node 1 and node 2 # linking `token` between node 1 and node 2
with open(sys.token_file, 'r') as fin: with open(args.token_file, 'r') as fin:
node = 3 node = 3
for entry in fin: for entry in fin:
fields = entry.strip().split(' ') fields = entry.strip().split(' ')
@ -46,6 +45,5 @@ if __name__ == '__main__':
help='e2e model token file. line: token(char/phone/spm/disambigous)') help='e2e model token file. line: token(char/phone/spm/disambigous)')
args = parser.parse_args() args = parser.parse_args()
print(args)
main(args) main(args)

@@ -73,8 +73,6 @@ if __name__ == "__main__":
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
-    print(args)
    main(args)

@@ -1,13 +1,5 @@
#!/usr/bin/env python3
import argparse
-import sys
-
-def contain_oov(units):
-    for unit in units:
-        if unit not in unit_table:
-            return True
-    return False

def main(args):
@@ -18,6 +10,12 @@ def main(args):
        unit = line.strip()
        unit_table.add(unit)

+    def contain_oov(units):
+        for unit in units:
+            if unit not in unit_table:
+                return True
+        return False
+
    # load spm model
    bpemode = args.bpemodel
    if bpemode:
@@ -27,8 +25,8 @@ def main(args):
    # used to filter polyphone
    lexicon_table = set()
-    with open(sys.in_lexicon, 'r') as fin, \
-            open(sys.out_lexicon, 'w') as fout:
+    with open(args.in_lexicon, 'r') as fin, \
+            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel

@@ -31,8 +31,6 @@ if __name__ == "__main__":
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()
-    print(args)
    main(args)

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Manifest file to key-value files."""
import argparse
import functools
from pathlib import Path
from utils.utility import add_arguments
from utils.utility import print_arguments
from utils.utility import read_manifest
def main(args):
    print_arguments(args, globals())

    count = 0

    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # .wav
            text = line_json['text']
            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            feat_dim = feat_shape[1]
            if 'token' in line_json:
                tokens = line_json['token']
                tokenids = line_json['token_id']
                token_shape = line_json['token_shape']
                token_len = token_shape[0]
                vocab_dim = token_shape[1]

            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of the manifest to convert.")
    add_arg('output_path', str,
            'data/train',
            "Directory path to dump the wav.scp/duration/text files.")
    # yapf: enable
    args = parser.parse_args()
    main(args)
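This mirrors the call in local/tlg.sh above; run standalone it would look like:

    python3 utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    # writes data/train/wav.scp, data/train/duration and data/train/text,
    # one "utt value" pair per line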

@@ -22,7 +22,7 @@ lmbin=${2}.klm.bin
# https://kheafield.com/code/kenlm/estimation/
echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }
# https://kheafield.com/code/kenlm/
echo "build binary lm."

@@ -11,19 +11,95 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import distutils.util
+import hashlib
+import json
import os
import tarfile
import zipfile
from typing import Text

-from paddle.dataset.common import md5file

__all__ = [
    "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
]
+def read_manifest(manifest_path):
+    """Load and parse manifest file.
+
+    Args:
+        manifest_path ([type]): Manifest file to load and parse.
+
+    Raises:
+        IOError: If failed to parse the manifest.
+
+    Returns:
+        List[dict]: Manifest parsing results.
+    """
+    manifest = []
+    for json_line in open(manifest_path, 'r'):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        manifest.append(json_data)  # keep every parsed entry
+    return manifest
+
+
+def print_arguments(args, info=None):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    filename = ""
+    if info:
+        filename = info["__file__"]
+    filename = os.path.basename(filename)
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()

def getfile_insensitive(path):
    """Get the actual file path when given insensitive filename."""
    directory, filename = os.path.split(path)
