diff --git a/deepspeech/utils/log.py b/deepspeech/utils/log.py index aefc8b59..e99dacec 100644 --- a/deepspeech/utils/log.py +++ b/deepspeech/utils/log.py @@ -17,7 +17,6 @@ import os import socket import sys -import auto_log from paddle import inference FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' @@ -156,6 +155,7 @@ class Autolog: batch_size, model_name="DeepSpeech", model_precision="fp32"): + import auto_log pid = os.getpid() gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0]) infer_config = inference.Config() diff --git a/examples/aishell/s1/local/aishell_train_lms.sh b/examples/aishell/s1/local/aishell_train_lms.sh new file mode 100755 index 00000000..7fb555b4 --- /dev/null +++ b/examples/aishell/s1/local/aishell_train_lms.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# To be run from one directory above this script. +. ./path.sh + +text=data/local/lm/text +lexicon=data/local/dict/lexicon.txt + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +# Check SRILM tools +if ! which ngram-count > /dev/null; then + echo "srilm tools are not found, please download it and install it from: " + echo "http://www.speech.sri.com/projects/srilm/download.html" + echo "Then add the tools to your PATH" + exit 1 +fi + +# This script takes no arguments. It assumes you have already run +# aishell_data_prep.sh. 
+# It takes as input the files +# data/local/lm/text +# data/local/dict/lexicon.txt +dir=data/local/lm +mkdir -p $dir + + +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo ""; echo "" ) > $dir/wordlist + +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +mkdir -p $dir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $dir/train + +ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $dir/lm.arpa +ngram -lm $dir/lm.arpa -ppl $dir/heldout \ No newline at end of file diff --git a/examples/aishell/s1/local/tlg.sh b/examples/aishell/s1/local/tlg.sh new file mode 100755 index 00000000..f5287f79 --- /dev/null +++ b/examples/aishell/s1/local/tlg.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -eo pipefail + +stage=-1 +stop_stage=100 +corpus=aishell +lmtype=srilm + +source utils/parse_options.sh + +data=${MAIN_ROOT}/examples/dataset/${corpus} +lexicon=$data/resource_aishell/lexicon.txt +text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # 7.1 Prepare dict + unit_file=data/vocab.txt + mkdir -p 
data/local/dict + cp $unit_file data/local/dict/units.txt + utils/fst/prepare_dict.py \ + --unit_file $unit_file \ + --in_lexicon ${lexicon} \ + --out_lexicon data/local/dict/lexicon.txt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # 7.2 Train lm + lm=data/local/lm + mkdir -p data/train + mkdir -p $lm + utils/manifest_key_value.py \ + --manifest_path data/manifest.train \ + --output_path data/train + utils/filter_scp.pl data/train/text \ + $text > $lm/text + if [ $lmtype == 'srilm' ];then + local/aishell_train_lms.sh + else + utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa + fi +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # 7.3 Build decoding TLG + utils/fst/compile_lexicon_token_fst.sh \ + data/local/dict data/local/tmp data/local/lang + utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; +fi + +echo "Aishell build TLG done." +exit 0 diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh index 30adb6ca..6214c8ac 100644 --- a/examples/aishell/s1/path.sh +++ b/examples/aishell/s1/path.sh @@ -4,11 +4,25 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} export LC_ALL=C # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 +export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ - +# model exp MODEL=u2 export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin + + +# srilm +export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs +export SRILM=${MAIN_ROOT}/tools/srilm +export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 + +# Kaldi +export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" +. $KALDI_ROOT/tools/config/common_path.sh || true diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh index 65b48a97..d55d47ea 100644 --- a/examples/aishell/s1/run.sh +++ b/examples/aishell/s1/run.sh @@ -42,3 +42,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi + + # Optionally, you can add LM and test it with runtime. + if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # train lm and build TLG + ./local/tlg.sh --corpus aishell --lmtype srilm + fi diff --git a/examples/aishell/s1/utils b/examples/aishell/s1/utils new file mode 120000 index 00000000..973afe67 --- /dev/null +++ b/examples/aishell/s1/utils @@ -0,0 +1 @@ +../../../utils \ No newline at end of file diff --git a/examples/dataset/aishell/.gitignore b/examples/dataset/aishell/.gitignore index eea6573e..27194aab 100644 --- a/examples/dataset/aishell/.gitignore +++ b/examples/dataset/aishell/.gitignore @@ -1,4 +1,5 @@ data_aishell* *.meta manifest.* -*.tgz \ No newline at end of file +*.tgz +resource_aishell diff --git a/tools/install/README.md b/tools/extras/README.md similarity index 100% rename from tools/install/README.md rename to tools/extras/README.md diff --git a/tools/install/install_gcc.sh b/tools/extras/install_gcc.sh similarity index 100% rename from tools/install/install_gcc.sh rename to tools/extras/install_gcc.sh diff --git a/tools/install/install_kaldi.sh b/tools/extras/install_kaldi.sh similarity index 100% rename from tools/install/install_kaldi.sh rename to tools/extras/install_kaldi.sh diff --git a/tools/extras/install_kenlm.sh b/tools/extras/install_kenlm.sh new file mode 100755 index 00000000..100225bf --- /dev/null +++ b/tools/extras/install_kenlm.sh @@ -0,0 
+1,9 @@ +#!/bin/bash + +apt install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev + +apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50 + +test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz + +rm -rf kenlm/build && mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4 && make install diff --git a/tools/extras/install_liblbfgs.sh b/tools/extras/install_liblbfgs.sh new file mode 100755 index 00000000..8d6ae4ab --- /dev/null +++ b/tools/extras/install_liblbfgs.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +VER=1.10 + +WGET=${WGET:-wget} + +if [ ! -f liblbfgs-$VER.tar.gz ]; then + if [ -d "$DOWNLOAD_DIR" ]; then + cp -p "$DOWNLOAD_DIR/liblbfgs-$VER.tar.gz" . || exit 1 + else + $WGET https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz || exit 1 + fi +fi + +tar -xzf liblbfgs-$VER.tar.gz +cd liblbfgs-$VER +./configure --prefix=`pwd` +make +# due to the liblbfgs project directory structure, we have to use -i +# but the erros are completely harmless +make -i install +cd .. + +( + [ ! -z "${LIBLBFGS}" ] && \ + echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \ + unset LIBLBFGS + + [ -f ./env.sh ] && . ./env.sh + + [ ! 
-z "${LIBLBFGS}" ] && \ + echo >&2 "libLBFGS config is already in env.sh" && exit + + wd=`pwd` + wd=`readlink -f $wd || pwd` + + echo "export LIBLBFGS=$wd/liblbfgs-1.10" + echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs +) >> env.sh + diff --git a/tools/install/install_mfa.sh b/tools/extras/install_mfa.sh similarity index 100% rename from tools/install/install_mfa.sh rename to tools/extras/install_mfa.sh diff --git a/tools/install/install_miniconda.sh b/tools/extras/install_miniconda.sh similarity index 100% rename from tools/install/install_miniconda.sh rename to tools/extras/install_miniconda.sh diff --git a/tools/install/install_mkl.sh b/tools/extras/install_mkl.sh similarity index 100% rename from tools/install/install_mkl.sh rename to tools/extras/install_mkl.sh diff --git a/tools/install/install_ngram.sh b/tools/extras/install_ngram.sh similarity index 100% rename from tools/install/install_ngram.sh rename to tools/extras/install_ngram.sh diff --git a/tools/install/install_openblas.sh b/tools/extras/install_openblas.sh similarity index 100% rename from tools/install/install_openblas.sh rename to tools/extras/install_openblas.sh diff --git a/tools/install/install_openfst.sh b/tools/extras/install_openfst.sh similarity index 100% rename from tools/install/install_openfst.sh rename to tools/extras/install_openfst.sh diff --git a/tools/install/install_pynini.sh b/tools/extras/install_pynini.sh similarity index 100% rename from tools/install/install_pynini.sh rename to tools/extras/install_pynini.sh diff --git a/tools/extras/install_srilm.sh b/tools/extras/install_srilm.sh new file mode 100755 index 00000000..f359e70c --- /dev/null +++ b/tools/extras/install_srilm.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +current_path=`pwd` +current_dir=`basename "$current_path"` + +if [ "tools" != "$current_dir" ]; then + echo "You should run this script in tools/ directory!!" + exit 1 +fi + +if [ ! 
-d liblbfgs-1.10 ]; then + echo Installing libLBFGS library to support MaxEnt LMs + bash extras/install_liblbfgs.sh || exit 1 +fi + +# http://www.speech.sri.com/projects/srilm/download.html +if [ ! -f srilm.tgz ] && [ ! -f srilm.tar.gz ]; then # Changed format type from tgz to tar.gz as the srilm v1.7.3 downloads as tar.gz + echo This script cannot install SRILM in a completely automatic + echo way because you need to put your address in a download form. + echo Please download SRILM from http://www.speech.sri.com/projects/srilm/download.html + echo put it in ./srilm.tar.gz , then run this script. + echo Note: You may have to rename the downloaded file to remove version name from filename eg: mv srilm-1.7.3.tar.gz srilm.tar.gz + exit 1 +fi + +! which gawk 2>/dev/null && \ + echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; + +mkdir -p srilm +cd srilm + + +if [ -f ../srilm.tgz ]; then + tar -xvzf ../srilm.tgz # Old SRILM format +elif [ -f ../srilm.tar.gz ]; then + tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz +fi + +major=`awk -F. '{ print $1 }' RELEASE` +minor=`awk -F. '{ print $2 }' RELEASE` +micro=`awk -F. '{ print $3 }' RELEASE` + +if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then + echo "Detected version 1.7.1 or earlier. Applying patch." + patch -p0 < ../extras/srilm.patch +fi + +# set the SRILM variable in the top-level Makefile to this directory. 
+cp Makefile tmpf + +cat tmpf | awk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ + > Makefile || exit 1 +rm tmpf + +mtype=`sbin/machine-type` + +echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype +grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \ + sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \ + >> common/Makefile.machine.$mtype + +grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \ + sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \ + >> common/Makefile.machine.$mtype + +make || exit + +cd .. +( + [ ! -z "${SRILM}" ] && \ + echo >&2 "SRILM variable is aleady defined. Undefining..." && \ + unset SRILM + + [ -f ./env.sh ] && . ./env.sh + + [ ! -z "${SRILM}" ] && \ + echo >&2 "SRILM config is already in env.sh" && exit + + wd=`pwd` + wd=`readlink -f $wd || pwd` + + echo "export SRILM=$wd/srilm" + dirs="\${PATH}" + for directory in $(cd srilm && find bin -type d ) ; do + dirs="$dirs:\${SRILM}/$directory" + done + echo "export PATH=$dirs" +) >> env.sh + +echo >&2 "Installation of SRILM finished successfully" +echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/tools/extras/srilm.patch b/tools/extras/srilm.patch new file mode 100644 index 00000000..c54ad21a --- /dev/null +++ b/tools/extras/srilm.patch @@ -0,0 +1,17 @@ +--- dstruct/src/Trie.orig 2016-11-08 19:53:40.524000000 +0000 ++++ dstruct/src/Trie.cc 2016-11-08 19:53:59.088000000 +0000 +@@ -200,11 +200,14 @@ + if (removedData == 0) { + Trie node; + if (sub.remove(keys[0], &node)) { ++#if !defined(__GNUC__) || !(__GNUC__ >= 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4) + /* + * XXX: Call subtrie destructor explicitly since we're not + * passing the removed node to the caller. ++ * !!! Triggers bug with gcc >= 4.9 optimization !!! 
+ */ + node.~Trie(); ++#endif + return true; + } else { + return false; diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/utils/filter_scp.pl b/utils/filter_scp.pl new file mode 100755 index 00000000..904db868 --- /dev/null +++ b/utils/filter_scp.pl @@ -0,0 +1,87 @@ +#!/usr/bin/env perl +# Copyright 2010-2012 Microsoft Corporation +# Johns Hopkins University (author: Daniel Povey) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This script takes a list of utterance-ids or any file whose first field +# of each line is an utterance-id, and filters an scp +# file (or any file whose "n-th" field is an utterance id), printing +# out only those lines whose "n-th" field is in id_list. 
The index of +# the "n-th" field is 1, by default, but can be changed by using +# the -f switch + +$exclude = 0; +$field = 1; +$shifted = 0; + +do { + $shifted=0; + if ($ARGV[0] eq "--exclude") { + $exclude = 1; + shift @ARGV; + $shifted=1; + } + if ($ARGV[0] eq "-f") { + $field = $ARGV[1]; + shift @ARGV; shift @ARGV; + $shifted=1 + } +} while ($shifted); + +if(@ARGV < 1 || @ARGV > 2) { + die "Usage: filter_scp.pl [--exclude] [-f ] id_list [in.scp] > out.scp \n" . + "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" . + "Note: only the first field of each line in id_list matters. With --exclude, prints\n" . + "only the lines that were *not* in id_list.\n" . + "Caution: previously, the -f option was interpreted as a zero-based field index.\n" . + "If your older scripts (written before Oct 2014) stopped working and you used the\n" . + "-f option, add 1 to the argument.\n" . + "See also: utils/filter_scp.pl .\n"; +} + + +$idlist = shift @ARGV; +open(F, "<$idlist") || die "Could not open id-list file $idlist"; +while() { + @A = split; + @A>=1 || die "Invalid id-list file line $_"; + $seen{$A[0]} = 1; +} + +if ($field == 1) { # Treat this as special case, since it is common. + while(<>) { + $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field."; + # $1 is what we filter on. + if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) { + print $_; + } + } +} else { + while(<>) { + @A = split; + @A > 0 || die "Invalid scp file line $_"; + @A >= $field || die "Invalid scp file line $_"; + if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) { + print $_; + } + } +} + +# tests: +# the following should print "foo 1" +# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo) +# the following should print "bar 2". 
+# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2) \ No newline at end of file diff --git a/utils/fst/add_lex_disambig.pl b/utils/fst/add_lex_disambig.pl new file mode 100755 index 00000000..8ecbbd3a --- /dev/null +++ b/utils/fst/add_lex_disambig.pl @@ -0,0 +1,195 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation +# 2013-2016 Johns Hopkins University (author: Daniel Povey) +# 2015 Hainan Xu +# 2015 Guoguo Chen + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Adds disambiguation symbols to a lexicon. +# Outputs still in the normal lexicon format. +# Disambig syms are numbered #1, #2, #3, etc. (#0 +# reserved for symbol in grammar). +# Outputs the number of disambig syms to the standard output. +# With the --pron-probs option, expects the second field +# of each lexicon line to be a pron-prob. +# With the --sil-probs option, expects three additional +# fields after the pron-prob, representing various components +# of the silence probability model. 
+ +$pron_probs = 0; +$sil_probs = 0; +$first_allowed_disambig = 1; + +for ($n = 1; $n <= 3 && @ARGV > 0; $n++) { + if ($ARGV[0] eq "--pron-probs") { + $pron_probs = 1; + shift @ARGV; + } + if ($ARGV[0] eq "--sil-probs") { + $sil_probs = 1; + shift @ARGV; + } + if ($ARGV[0] eq "--first-allowed-disambig") { + $first_allowed_disambig = 0 + $ARGV[1]; + if ($first_allowed_disambig < 1) { + die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n"; + } + shift @ARGV; + shift @ARGV; + } +} + +if (@ARGV != 2) { + die "Usage: add_lex_disambig.pl [opts] \n" . + "This script adds disambiguation symbols to a lexicon in order to\n" . + "make decoding graphs determinizable; it adds pseudo-phone\n" . + "disambiguation symbols #1, #2 and so on at the ends of phones\n" . + "to ensure that all pronunciations are different, and that none\n" . + "is a prefix of another.\n" . + "It prints to the standard output the number of the largest-numbered" . + "disambiguation symbol that was used.\n" . + "\n" . + "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" . + " --sil-probs [should be with --pron-probs option]\n" . + " Expect 3 extra fields after the pron-probs, for aspects of\n" . + " the silence probability model\n" . + " --first-allowed-disambig The number of the first disambiguation symbol\n" . + " that this script is allowed to add. By default this is\n" . + " #1, but you can set this to a larger value using this option.\n" . + "e.g.:\n" . + " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" . + " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" . + " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n"; +} + + +$lexfn = shift @ARGV; +$lexoutfn = shift @ARGV; + +open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; + +# (1) Read in the lexicon. 
+@L = ( ); +while() { + @A = split(" ", $_); + push @L, join(" ", @A); +} + +# (2) Work out the count of each phone-sequence in the +# lexicon. + +foreach $l (@L) { + @A = split(" ", $l); + shift @A; # Remove word. + if ($pron_probs) { + $p = shift @A; + if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; } + } + if ($sil_probs) { + $silp = shift @A; + if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; } + $correction = shift @A; + if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } + $correction = shift @A; + if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; } + } + if (!(@A)) { + die "Bad lexicon line $1, no phone in phone list"; + } + $count{join(" ",@A)}++; +} + +# (3) For each left sub-sequence of each phone-sequence, note down +# that it exists (for identifying prefixes of longer strings). + +foreach $l (@L) { + @A = split(" ", $l); + shift @A; # Remove word. + if ($pron_probs) { shift @A; } # remove pron-prob. + if ($sil_probs) { + shift @A; # Remove silprob + shift @A; # Remove silprob + } + while(@A > 0) { + pop @A; # Remove last phone + $issubseq{join(" ",@A)} = 1; + } +} + +# (4) For each entry in the lexicon: +# if the phone sequence is unique and is not a +# prefix of another word, no diambig symbol. +# Else output #1, or #2, #3, ... if the same phone-seq +# has already been assigned a disambig symbol. + + +open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n"; + +# max_disambig will always be the highest-numbered disambiguation symbol that +# has been used so far. 
+$max_disambig = $first_allowed_disambig - 1; + +foreach $l (@L) { + @A = split(" ", $l); + $word = shift @A; + if ($pron_probs) { + $pron_prob = shift @A; + } + if ($sil_probs) { + $sil_word_prob = shift @A; + $word_sil_correction = shift @A; + $prev_nonsil_correction = shift @A + } + $phnseq = join(" ", @A); + if (!defined $issubseq{$phnseq} + && $count{$phnseq} == 1) { + ; # Do nothing. + } else { + if ($phnseq eq "") { # need disambig symbols for the empty string + # that are not use anywhere else. + $max_disambig++; + $reserved_for_the_empty_string{$max_disambig} = 1; + $phnseq = "#$max_disambig"; + } else { + $cur_disambig = $last_used_disambig_symbol_of{$phnseq}; + if (!defined $cur_disambig) { + $cur_disambig = $first_allowed_disambig; + } else { + $cur_disambig++; # Get a number that has not been used yet for + # this phone sequence. + } + while (defined $reserved_for_the_empty_string{$cur_disambig}) { + $cur_disambig++; + } + if ($cur_disambig > $max_disambig) { + $max_disambig = $cur_disambig; + } + $last_used_disambig_symbol_of{$phnseq} = $cur_disambig; + $phnseq = $phnseq . " #" . $cur_disambig; + } + } + if ($pron_probs) { + if ($sil_probs) { + print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n"; + } else { + print O "$word\t$pron_prob\t$phnseq\n"; + } + } else { + print O "$word\t$phnseq\n"; + } +} + +print $max_disambig . "\n"; \ No newline at end of file diff --git a/utils/fst/compile_lexicon_token_fst.sh b/utils/fst/compile_lexicon_token_fst.sh new file mode 100755 index 00000000..e9e8b1ec --- /dev/null +++ b/utils/fst/compile_lexicon_token_fst.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# Copyright 2015 Yajie Miao (Carnegie Mellon University) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the +# phoneme and character-based lexicons. +set -eo pipefail +. utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "usage: utils/fst/compile_lexicon_token_fst.sh " + echo "e.g.: utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang" + echo " should contain the following files:" + echo "lexicon.txt lexicon_numbers.txt units.txt" + echo "options: " + exit 1; +fi + +srcdir=$1 +tmpdir=$2 +dir=$3 +mkdir -p $dir $tmpdir + +[ -f path.sh ] && . ./path.sh + +cp $srcdir/units.txt $dir + +# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0. +# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is. +perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1; + +# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst. +# Without these symbols, determinization will fail. +# default first disambiguation is #1 +ndisambig=`utils/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt` +# add #0 (#0 reserved for symbol in grammar). +ndisambig=$[$ndisambig+1]; + +( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list + +# Get the full list of CTC tokens used in FST. 
These tokens include , the blank , +# the actual model unit, and the disambiguation symbols. +cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list +(echo '';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt + +# ctc_token_fst_corrected is too big and too slow for character based chinese modeling, +# so here just use simple ctc_token_fst +utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \ + fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $dir/T.fst || exit 1; + +# Encode the words with indices. Will be used in lexicon and language model FST compiling. +cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk ' + BEGIN { + print " 0"; + } + { + printf("%s %d\n", $1, NR); + } + END { + printf("#0 %d\n", NR+1); + printf(" %d\n", NR+2); + printf(" %d\n", NR+3); + }' > $dir/words.txt || exit 1; + +# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time. 
+token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'` +word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` + +utils/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \ + fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; + +echo "Lexicon and Token FSTs compiling succeeded" \ No newline at end of file diff --git a/utils/fst/ctc_token_fst.py b/utils/fst/ctc_token_fst.py new file mode 100755 index 00000000..2262912c --- /dev/null +++ b/utils/fst/ctc_token_fst.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +import argparse + + +def main(args): + """Token Transducer""" + # entry + print('0 1 ') + # skip begining and ending + print('1 1 ') + print('2 2 ') + # exit + print('2 0 ') + + # linking `token` between node 1 and node 2 + with open(args.token_file, 'r') as fin: + node = 3 + for entry in fin: + fields = entry.strip().split(' ') + phone = fields[0] + if phone == '' or phone == '': + continue + elif '#' in phone: + # disambiguous phone + # `token` maybe ending with disambiguous symbol + print('{} {} {} {}'.format(0, 0, '', phone)) + else: + # eating `token` + print('{} {} {} {}'.format(1, node, phone, phone)) + # remove repeating `token` + print('{} {} {} {}'.format(node, node, phone, '')) + # leaving `token` + print('{} {} {} {}'.format(node, 2, '', '')) + node += 1 + # Fianl node + print('0') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='FST: CTC Token FST transducer') + parser.add_argument( + '--token_file', + required=True, + help='e2e model token file. 
line: token(char/phone/spm/disambigous)') + + args = parser.parse_args() + + main(args) diff --git a/utils/fst/ctc_token_fst_corrected.py b/utils/fst/ctc_token_fst_corrected.py new file mode 100755 index 00000000..a1d476c8 --- /dev/null +++ b/utils/fst/ctc_token_fst_corrected.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import argparse + + +def il(n): + """ilabel""" + return n + 1 + + +def ol(n): + """olabel""" + return n + 1 + + +def s(n): + """state""" + return n + + +def main(args): + with open(args.token_file) as f: + lines = f.readlines() + # token count w/0 + phone_count = 0 + disambig_count = 0 + for line in lines: + sp = line.strip().split() + phone = sp[0] + if phone == '' or phone == '': + continue + if phone.startswith('#'): + disambig_count += 1 + else: + phone_count += 1 + + # 1. add start state + # first token is :0 + print('0 0 {} 0'.format(il(0))) + + # 2. 0 -> i, i -> i, i -> 0 + # non-blank token start from 1 + for i in range(1, phone_count + 1): + # eating `token` + print('0 {} {} {}'.format(s(i), il(i), ol(i))) + # remove repeating `token` + print('{} {} {} 0'.format(s(i), s(i), il(i))) + # skip ending `token` + print('{} 0 {} 0'.format(s(i), il(0))) + + # 3. i -> other phone + # non-blank token to other non-blank token + for i in range(1, phone_count + 1): + for j in range(1, phone_count + 1): + if i != j: + print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j))) + + # 4. add disambiguous arcs on every final state + # blank and non-blank token maybe ending with disambiguous `token` + for i in range(0, phone_count + 1): + for j in range(phone_count + 2, phone_count + disambig_count + 2): + print('{} {} {} {}'.format(s(i), s(i), 0, j)) + + # 5. 
every i is final state + # blank and non-blank `token` are final state + for i in range(0, phone_count + 1): + print(s(i)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='FST: CTC Token unfold FST transducer') + parser.add_argument( + '--token_file', + required=True, + help='e2e model token file. line: token(char/phone/spm/disambigous)') + args = parser.parse_args() + + main(args) diff --git a/utils/fst/eps2disambig.pl b/utils/fst/eps2disambig.pl new file mode 100755 index 00000000..52ec0acb --- /dev/null +++ b/utils/fst/eps2disambig.pl @@ -0,0 +1,29 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation +# 2015 Guoguo Chen + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script replaces epsilon with #0 on the input side only, of the G.fst +# acceptor. + +while(<>){ + if (/\s+#0\s+/) { + print STDERR "$0: ERROR: LM has word #0, " . 
+ "which is reserved as disambiguation symbol\n"; + exit 1; + } + s:^(\d+\s+\d+\s+)\(\s+):$1#0$2:; + print; +} \ No newline at end of file diff --git a/utils/fst/make_lexicon_fst.pl b/utils/fst/make_lexicon_fst.pl new file mode 100755 index 00000000..95cda9df --- /dev/null +++ b/utils/fst/make_lexicon_fst.pl @@ -0,0 +1,154 @@ +#!/usr/bin/env perl +use warnings; #sed replacement for -w perl parameter +# Copyright 2010-2011 Microsoft Corporation +# 2013 Johns Hopkins University (author: Daniel Povey) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional). + +$pron_probs = 0; + +if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) { + $pron_probs = 1; + shift @ARGV; +} + +if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) { + print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n"; + print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n"; + print STDERR "Note: ordinarily, each line of lexicon.txt is:\n"; + print STDERR " word phone1 phone2 ... phoneN;\n"; + print STDERR "if the --pron-probs option is used, each line is:\n"; + print STDERR " word pronunciation-probability phone1 phone2 ... 
phoneN.\n\n"; + print STDERR "The probability 'prob' will typically be between zero and one, and note that\n"; + print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n"; + print STDERR "this is your responsibility.\n\n"; + print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n"; + print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n"; + print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n"; + exit(1); +} + +$lexfn = shift @ARGV; +if (@ARGV == 0) { + $silprob = 0.0; +} elsif (@ARGV == 2) { + ($silprob,$silphone) = @ARGV; +} else { + ($silprob,$silphone,$sildisambig) = @ARGV; +} +if ($silprob != 0.0) { + $silprob < 1.0 || die "Sil prob cannot be >= 1.0"; + $silcost = -log($silprob); + $nosilcost = -log(1.0 - $silprob); +} + + +open(L, "<$lexfn") || die "Error opening lexicon $lexfn"; + + +if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero. + $loopstate = 0; + $nextstate = 1; # next unallocated state. + while () { + @A = split(" ", $_); + @A == 0 && die "Empty lexicon line."; + foreach $a (@A) { + if ($a eq "") { + die "Bad lexicon line $_ ( is forbidden)"; + } + } + $w = shift @A; + if (! $pron_probs) { + $pron_cost = 0.0; + } else { + $pron_prob = shift @A; + if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { + die "Bad pronunciation probability in line $_"; + } + $pron_cost = -log($pron_prob); + } + if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } + + $s = $loopstate; + $word_or_eps = $w; + while (@A > 0) { + $p = shift @A; + if (@A > 0) { + $ns = $nextstate++; + } else { + $ns = $loopstate; + } + print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; + $word_or_eps = ""; + $pron_cost_string = ""; # so we only print it on the first arc of the word. 
+ $s = $ns; + } + } + print "$loopstate\t0\n"; # final-cost. +} else { # have silence probs. + $startstate = 0; + $loopstate = 1; + $silstate = 2; # state from where we go to loopstate after emitting silence. + print "$startstate\t$loopstate\t\t\t$nosilcost\n"; # no silence. + if (!defined $sildisambig) { + print "$startstate\t$loopstate\t$silphone\t\t$silcost\n"; # silence. + print "$silstate\t$loopstate\t$silphone\t\n"; # no cost. + $nextstate = 3; + } else { + $disambigstate = 3; + $nextstate = 4; + print "$startstate\t$disambigstate\t$silphone\t\t$silcost\n"; # silence. + print "$silstate\t$disambigstate\t$silphone\t\n"; # no cost. + print "$disambigstate\t$loopstate\t$sildisambig\t\n"; # silence disambiguation symbol. + } + while () { + @A = split(" ", $_); + $w = shift @A; + if (! $pron_probs) { + $pron_cost = 0.0; + } else { + $pron_prob = shift @A; + if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) { + die "Bad pronunciation probability in line $_"; + } + $pron_cost = -log($pron_prob); + } + if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; } + $s = $loopstate; + $word_or_eps = $w; + while (@A > 0) { + $p = shift @A; + if (@A > 0) { + $ns = $nextstate++; + print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n"; + $word_or_eps = ""; + $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time. + $s = $ns; + } elsif (!defined($silphone) || $p ne $silphone) { + # This is non-deterministic but relatively compact, + # and avoids epsilons. + $local_nosilcost = $nosilcost + $pron_cost; + $local_silcost = $silcost + $pron_cost; + print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n"; + print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n"; + } else { + # no point putting opt-sil after silence word. + print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n"; + } + } + } + print "$loopstate\t0\n"; # final-cost. 
+} \ No newline at end of file diff --git a/utils/fst/make_tlg.sh b/utils/fst/make_tlg.sh new file mode 100755 index 00000000..c68387af --- /dev/null +++ b/utils/fst/make_tlg.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +if [ -f path.sh ]; then . path.sh; fi + +lm_dir=$1 +src_lang=$2 +tgt_lang=$3 + +arpa_lm=${lm_dir}/lm.arpa +[ ! -f $arpa_lm ] && { echo "No such file $arpa_lm"; exit 1;} + +rm -rf $tgt_lang +cp -r $src_lang $tgt_lang + +# Compose the language model to FST +# grep -i或--ignore-case 忽略字符大小写的差别。 +# grep -v或--revert-match 反转查找。 +# arpa2fst: remove the embedded symbols from the FST +# arpa2fst: make sure there are no out-of-vocabulary words in the language model +# arpa2fst: remove "illegal" sequences of the start and end-ofsentence symbols +# eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0. +# s2eps.pl: replaces and with (on both input and output sides), for the G.fst acceptor. +# G.fst, the disambiguation symbol #0 only appears on the input side +# do eps2disambig.pl and s2eps.pl maybe just for fallowing `fstrmepsilon`. 
+cat $arpa_lm | \
+    grep -v ' ' | \
+    grep -v ' ' | \
+    grep -v ' ' | \
+    grep -v -i '' | \
+    grep -v -i '' | \
+    arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
+    utils/fst/eps2disambig.pl | utils/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
+    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
+    fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
+
+
+echo "Checking how stochastic G is (the first of these numbers should be small):"
+fstisstochastic $tgt_lang/G.fst
+
+# Compose the token, lexicon and language-model FST into the final decoding graph
+# minimization: the same as minimization algorithm that applies to weighted acceptors;
+# the only change relevant here is that it avoids pushing weights,
+# hence preserving stochasticity
+fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
+    fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
+fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;
+
+echo "Composing decoding graph TLG.fst succeeded"
+#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
\ No newline at end of file
diff --git a/utils/fst/prepare_dict.py b/utils/fst/prepare_dict.py
new file mode 100755
index 00000000..f59cd311
--- /dev/null
+++ b/utils/fst/prepare_dict.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import argparse
+
+
+def main(args):
+    # load `unit` or `vocab` file
+    unit_table = set()
+    with open(args.unit_file, 'r') as fin:
+        for line in fin:
+            unit = line.strip()
+            unit_table.add(unit)
+
+    def contain_oov(units):
+        for unit in units:
+            if unit not in unit_table:
+                return True
+        return False
+
+    # load spm model
+    bpemode = args.bpemodel
+    if bpemode:
+        import sentencepiece as spm
+        sp = spm.SentencePieceProcessor()
+        # BUG FIX: was `sp.Load(sys.bpemodel)` -- `sys` is never imported and
+        # has no `bpemodel` attribute; the model path lives on the parsed args.
+        sp.Load(args.bpemodel)
+
+    # used to filter polyphone
+    lexicon_table = set()
+    with open(args.in_lexicon, 'r') as fin, \
+
open(args.out_lexicon, 'w') as fout: + for line in fin: + word = line.split()[0] + if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel + continue + elif word == '': + continue + else: + # each word only has one pronunciation for e2e system + if word in lexicon_table: + continue + + if bpemode: + pieces = sp.EncodeAsPieces(word) + if contain_oov(pieces): + print('Ignoring words {}, which contains oov unit'. + format(''.join(word).strip('▁'))) + continue + + chars = ' '.join( + [p if p in unit_table else '' for p in pieces]) + else: + # ignore words with OOV + if contain_oov(word): + print('Ignoring words {}, which contains oov unit'. + format(word)) + continue + + # Optional, append ▁ in front of english word + # we assume the model unit of our e2e system is char now. + if word.encode('utf8').isalpha() and '▁' in unit_table: + word = '▁' + word + chars = ' '.join(word) # word is a char list + + fout.write('{} {}\n'.format(word, chars)) + lexicon_table.add(word) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='FST: preprae e2e(char/spm) dict') + parser.add_argument( + '--unit_file', + required=True, + help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices' + ) + parser.add_argument( + '--in_lexicon', + required=True, + help='raw lexicon file. line: word ph0 ... phn') + parser.add_argument( + '--out_lexicon', + required=True, + help='output lexicon file. line: word char0 ... charn') + parser.add_argument('--bpemodel', default=None, help='bpemodel') + + args = parser.parse_args() + print(args) + + main(args) diff --git a/utils/fst/remove_oovs.pl b/utils/fst/remove_oovs.pl new file mode 100755 index 00000000..bbf7e632 --- /dev/null +++ b/utils/fst/remove_oovs.pl @@ -0,0 +1,42 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script removes lines that contain these OOVs on either the +# third or fourth fields of the line. It is intended to remove arcs +# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in). + +if ( @ARGV < 1 && @ARGV > 2) { + die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n"; +} + +$unklist = shift @ARGV; +open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n"; +while(){ + @A = split(" ", $_); + @A == 1 || die "Bad line in unknown-symbol list: $_"; + $unk{$A[0]} = 1; +} + +$num_removed = 0; +while(<>){ + @A = split(" ", $_); + if(defined $unk{$A[2]} || defined $unk{$A[3]}) { + $num_removed++; + } else { + print; + } +} +print STDERR "remove_oovs.pl: removed $num_removed lines.\n"; diff --git a/utils/fst/rnnt_token_fst.py b/utils/fst/rnnt_token_fst.py new file mode 100755 index 00000000..8f1cf493 --- /dev/null +++ b/utils/fst/rnnt_token_fst.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +import argparse + + +def main(args): + # skip `token` + print('0 0 ') + + with open(args.token_file, 'r') as fin: + for entry in fin: + fields = entry.strip().split(' ') + phone = fields[0] + if phone == '' or phone == '': + continue + elif '#' in phone: + # disambiguous phone + # maybe add disambiguous `token` + print('{} {} {} {}'.format(0, 0, '', phone)) + else: + # eating `token` + print('{} {} {} {}'.format(0, 0, phone, phone)) + + # final state + print('0') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='FST: RNN-T Token FST 
transducer') + parser.add_argument( + '--token_file', + required=True, + help='e2e model token file. line: token(char/phone/spm/disambigous)') + args = parser.parse_args() + + main(args) diff --git a/utils/fst/s2eps.pl b/utils/fst/s2eps.pl new file mode 100755 index 00000000..84d494e2 --- /dev/null +++ b/utils/fst/s2eps.pl @@ -0,0 +1,27 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script replaces and with (on both input and output sides), +# for the G.fst acceptor. + +while(<>){ + @A = split(" ", $_); + if ( @A >= 4 ) { + if ($A[2] eq "" || $A[2] eq "") { $A[2] = ""; } + if ($A[3] eq "" || $A[3] eq "") { $A[3] = ""; } + } + print join("\t", @A) . 
"\n"; +} \ No newline at end of file diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py new file mode 100755 index 00000000..b409236f --- /dev/null +++ b/utils/manifest_key_value.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +"""Manifest file to key-value files.""" +import argparse +import functools +from pathlib import Path + +from utils.utility import add_arguments +from utils.utility import print_arguments +from utils.utility import read_manifest + + +def main(args): + print_arguments(args, globals()) + + count = 0 + + outdir = Path(args.output_path) + wav_scp = outdir / 'wav.scp' + dur_scp = outdir / 'duration' + text_scp = outdir / 'text' + + manifest_jsons = read_manifest(args.manifest_path) + + with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open( + 'w') as ftxt: + for line_json in manifest_jsons: + utt = line_json['utt'] + feat = line_json['feat'] + file_ext = Path(feat).suffix # .wav + text = line_json['text'] + feat_shape = line_json['feat_shape'] + dur = feat_shape[0] + feat_dim = feat_shape[1] + if 'token' in line_json: + tokens = line_json['token'] + tokenids = line_json['token_id'] + token_shape = line_json['token_shape'] + token_len = token_shape[0] + vocab_dim = token_shape[1] + + if file_ext == '.wav': + fwav.write(f"{utt} {feat}\n") + fdur.write(f"{utt} {dur}\n") + ftxt.write(f"{utt} {text}\n") + + count += 1 + + print(f"Examples number: {count}") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + add_arg = functools.partial(add_arguments, argparser=parser) + # yapf: disable + add_arg('manifest_path', str, + 'data/librispeech/manifest.train', + "Filepath of manifest to compute normalizer's mean and stddev.") + add_arg('output_path', str, + 'data/train', + "dir path to dump wav.scp/duaration/text files.") + # yapf: disable + args = parser.parse_args() + + main(args) diff --git a/utils/ngram_train.sh b/utils/ngram_train.sh index cba74880..b56048eb 100755 --- 
a/utils/ngram_train.sh
+++ b/utils/ngram_train.sh
@@ -22,7 +22,7 @@ lmbin=${2}.klm.bin
 
 # https://kheafield.com/code/kenlm/estimation/
 echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }
 
 # https://kheafield.com/code/kenlm/
 echo "build binary lm."
diff --git a/utils/utility.py b/utils/utility.py
old mode 100644
new mode 100755
index 344900ef..a6b81d73
--- a/utils/utility.py
+++ b/utils/utility.py
@@ -11,19 +11,97 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import hashlib
+import json
 import os
 import tarfile
 import zipfile
 from typing import Text
 
-from paddle.dataset.common import md5file
-
 __all__ = [
     "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
 ]
 
+
+def read_manifest(manifest_path):
+    """Load and parse manifest file.
+    Args:
+        manifest_path ([type]): Manifest file to load and parse.
+
+    Raises:
+        IOError: If failed to parse the manifest.
+
+    Returns:
+        List[dict]: Manifest parsing results.
+    """
+
+    manifest = []
+    for json_line in open(manifest_path, 'r'):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        # BUG FIX: keep each parsed record; previously the parsed JSON was
+        # discarded, so this function always returned an empty list.
+        manifest.append(json_data)
+    return manifest
+
+
+def print_arguments(args, info=None):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="Jonh", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    filename = ""
+    if info:
+        filename = info["__file__"]
+    filename = os.path.basename(filename)
+    # BUG FIX: the f-string had no placeholder ("(unknown)"); `filename` was
+    # computed above but never used -- interpolate it into the banner.
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "Jonh", "User name.", parser)
+        args = parser.parse_args()
+    """
+    import distutils.util  # BUG FIX: distutils is not imported at module level
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    with open(fname, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+
 def getfile_insensitive(path):
     """Get the actual file path when given insensitive filename."""
     directory, filename = os.path.split(path)