Merge pull request #729 from PaddlePaddle/fst

TLG graph
Hui Zhang 3 years ago committed by GitHub
commit fd8a4ec179

@@ -17,7 +17,6 @@ import os
 import socket
 import sys
-import auto_log

 from paddle import inference

 FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'

@@ -156,6 +155,7 @@ class Autolog:
                  batch_size,
                  model_name="DeepSpeech",
                  model_precision="fp32"):
+        import auto_log
         pid = os.getpid()
         gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
         infer_config = inference.Config()

@@ -0,0 +1,58 @@
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
heldout_sent=10000 # Don't change this if you want results to be comparable
                   # with kaldi_lm results.
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
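For reference, the held-out perplexity check above prints SRILM's standard summary. A sketch of its shape (every value below is a placeholder, not a measured result):

# ngram -lm data/local/lm/lm.arpa -ppl data/local/lm/heldout
# file data/local/lm/heldout: 10000 sentences, W words, 0 OOVs
# 0 zeroprobs, logprob= L ppl= P ppl1= P1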

@@ -0,0 +1,52 @@
#!/bin/bash
set -eo pipefail
stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm
source utils/parse_options.sh
data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
unit_file=data/vocab.txt
mkdir -p data/local/dict
cp $unit_file data/local/dict/units.txt
utils/fst/prepare_dict.py \
--unit_file $unit_file \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
lm=data/local/lm
mkdir -p data/train
mkdir -p $lm
utils/manifest_key_value.py \
--manifest_path data/manifest.train \
--output_path data/train
utils/filter_scp.pl data/train/text \
$text > $lm/text
if [ $lmtype == 'srilm' ]; then
local/aishell_train_lms.sh
else
utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# 7.3 Build decoding TLG
utils/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "Aishell build TLG done."
exit 0
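Because `stage` and `stop_stage` are declared before `utils/parse_options.sh` is sourced, both are settable from the command line. A sketch of re-running only the graph-building stage, assuming the dict and LM from the earlier stages already exist:

./local/tlg.sh --stage 2 --stop_stage 2 --corpus aishell --lmtype srilm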

@@ -4,11 +4,25 @@ export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C

 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

+# model exp
 MODEL=u2
 export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
+
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
+
+# Kaldi
+export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
+. $KALDI_ROOT/tools/config/common_path.sh || true

@@ -42,3 +42,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # export ckpt avg_n
     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
+
+# Optionally, you can add an LM and test it with the runtime.
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # train LM and build TLG
+    ./local/tlg.sh --corpus aishell --lmtype srilm
+fi

@@ -0,0 +1 @@
../../../utils

@@ -1,4 +1,5 @@
 data_aishell*
 *.meta
 manifest.*
 *.tgz
+resource_aishell

@@ -0,0 +1,9 @@
#!/bin/bash
# build dependencies
apt install -y build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev
# kenlm is built with gcc-5/g++-5 here; make them the default compilers
apt-get install -y gcc-5 g++-5 && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50
# fetch and unpack the kenlm source if not already present
test -d kenlm || wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
# configure, build and install
rm -rf kenlm/build && mkdir -p kenlm/build && cd kenlm/build && cmake .. && make -j4 && make install

@@ -0,0 +1,40 @@
#!/usr/bin/env bash
VER=1.10
WGET=${WGET:-wget}
if [ ! -f liblbfgs-$VER.tar.gz ]; then
if [ -d "$DOWNLOAD_DIR" ]; then
cp -p "$DOWNLOAD_DIR/liblbfgs-$VER.tar.gz" . || exit 1
else
$WGET https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz || exit 1
fi
fi
tar -xzf liblbfgs-$VER.tar.gz
cd liblbfgs-$VER
./configure --prefix=`pwd`
make
# due to the liblbfgs project directory structure, we have to use -i,
# but the errors are completely harmless
make -i install
cd ..
(
[ ! -z "${LIBLBFGS}" ] && \
echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \
unset LIBLBFGS
[ -f ./env.sh ] && . ./env.sh
[ ! -z "${LIBLBFGS}" ] && \
echo >&2 "libLBFGS config is already in env.sh" && exit
wd=`pwd`
wd=`readlink -f $wd || pwd`
echo "export LIBLBFGS=$wd/liblbfgs-1.10"
echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs
) >> env.sh

@@ -0,0 +1,91 @@
#!/usr/bin/env bash
current_path=`pwd`
current_dir=`basename "$current_path"`
if [ "tools" != "$current_dir" ]; then
echo "You should run this script in tools/ directory!!"
exit 1
fi
if [ ! -d liblbfgs-1.10 ]; then
echo Installing libLBFGS library to support MaxEnt LMs
bash extras/install_liblbfgs.sh || exit 1
fi
# http://www.speech.sri.com/projects/srilm/download.html
if [ ! -f srilm.tgz ] && [ ! -f srilm.tar.gz ]; then # Changed format type from tgz to tar.gz as the srilm v1.7.3 downloads as tar.gz
echo This script cannot install SRILM in a completely automatic
echo way because you need to put your address in a download form.
echo Please download SRILM from http://www.speech.sri.com/projects/srilm/download.html
echo put it in ./srilm.tar.gz, then run this script.
echo Note: you may have to rename the downloaded file to remove the version from its name, e.g.: mv srilm-1.7.3.tar.gz srilm.tar.gz
exit 1
fi
! which gawk >/dev/null && \
echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1;
mkdir -p srilm
cd srilm
if [ -f ../srilm.tgz ]; then
tar -xvzf ../srilm.tgz # Old SRILM format
elif [ -f ../srilm.tar.gz ]; then
tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz
fi
major=`awk -F. '{ print $1 }' RELEASE`
minor=`awk -F. '{ print $2 }' RELEASE`
micro=`awk -F. '{ print $3 }' RELEASE`
if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then
echo "Detected version 1.7.1 or earlier. Applying patch."
patch -p0 < ../extras/srilm.patch
fi
# set the SRILM variable in the top-level Makefile to this directory.
cp Makefile tmpf
cat tmpf | awk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \
> Makefile || exit 1
rm tmpf
mtype=`sbin/machine-type`
echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype
grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \
sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
>> common/Makefile.machine.$mtype
grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \
sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
>> common/Makefile.machine.$mtype
make || exit
cd ..
(
[ ! -z "${SRILM}" ] && \
echo >&2 "SRILM variable is aleady defined. Undefining..." && \
unset SRILM
[ -f ./env.sh ] && . ./env.sh
[ ! -z "${SRILM}" ] && \
echo >&2 "SRILM config is already in env.sh" && exit
wd=`pwd`
wd=`readlink -f $wd || pwd`
echo "export SRILM=$wd/srilm"
dirs="\${PATH}"
for directory in $(cd srilm && find bin -type d ) ; do
dirs="$dirs:\${SRILM}/$directory"
done
echo "export PATH=$dirs"
) >> env.sh
echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"

@@ -0,0 +1,17 @@
--- dstruct/src/Trie.orig 2016-11-08 19:53:40.524000000 +0000
+++ dstruct/src/Trie.cc 2016-11-08 19:53:59.088000000 +0000
@@ -200,11 +200,14 @@
if (removedData == 0) {
Trie<KeyT,DataT> node;
if (sub.remove(keys[0], &node)) {
+#if !defined(__GNUC__) || !(__GNUC__ >= 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4)
/*
* XXX: Call subtrie destructor explicitly since we're not
* passing the removed node to the caller.
+ * !!! Triggers bug with gcc >= 4.9 optimization !!!
*/
node.~Trie();
+#endif
return true;
} else {
return false;

@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@@ -0,0 +1,87 @@
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation
# Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script takes a list of utterance-ids or any file whose first field
# of each line is an utterance-id, and filters an scp
# file (or any file whose "n-th" field is an utterance id), printing
# out only those lines whose "n-th" field is in id_list. The index of
# the "n-th" field is 1, by default, but can be changed by using
# the -f <n> switch
$exclude = 0;
$field = 1;
$shifted = 0;
do {
$shifted=0;
if ($ARGV[0] eq "--exclude") {
$exclude = 1;
shift @ARGV;
$shifted=1;
}
if ($ARGV[0] eq "-f") {
$field = $ARGV[1];
shift @ARGV; shift @ARGV;
$shifted=1
}
} while ($shifted);
if(@ARGV < 1 || @ARGV > 2) {
die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
"Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
"Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
"only the lines that were *not* in id_list.\n" .
"Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
"If your older scripts (written before Oct 2014) stopped working and you used the\n" .
"-f option, add 1 to the argument.\n" .
"See also: utils/filter_scp.pl .\n";
}
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
@A = split;
@A>=1 || die "Invalid id-list file line $_";
$seen{$A[0]} = 1;
}
if ($field == 1) { # Treat this as special case, since it is common.
while(<>) {
$_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
# $1 is what we filter on.
if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
print $_;
}
}
} else {
while(<>) {
@A = split;
@A > 0 || die "Invalid scp file line $_";
@A >= $field || die "Invalid scp file line $_";
if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
print $_;
}
}
}
# tests:
# the following should print "foo 1"
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
# the following should print "bar 2".
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)

@@ -0,0 +1,195 @@
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# 2013-2016 Johns Hopkins University (author: Daniel Povey)
# 2015 Hainan Xu
# 2015 Guoguo Chen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.
$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
if ($ARGV[0] eq "--pron-probs") {
$pron_probs = 1;
shift @ARGV;
}
if ($ARGV[0] eq "--sil-probs") {
$sil_probs = 1;
shift @ARGV;
}
if ($ARGV[0] eq "--first-allowed-disambig") {
$first_allowed_disambig = 0 + $ARGV[1];
if ($first_allowed_disambig < 1) {
die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
}
shift @ARGV;
shift @ARGV;
}
}
if (@ARGV != 2) {
die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
"This script adds disambiguation symbols to a lexicon in order to\n" .
"make decoding graphs determinizable; it adds pseudo-phone\n" .
"disambiguation symbols #1, #2 and so on at the ends of phones\n" .
"to ensure that all pronunciations are different, and that none\n" .
"is a prefix of another.\n" .
"It prints to the standard output the number of the largest-numbered" .
"disambiguation symbol that was used.\n" .
"\n" .
"Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" .
" --sil-probs [should be with --pron-probs option]\n" .
" Expect 3 extra fields after the pron-probs, for aspects of\n" .
" the silence probability model\n" .
" --first-allowed-disambig <n> The number of the first disambiguation symbol\n" .
" that this script is allowed to add. By default this is\n" .
" #1, but you can set this to a larger value using this option.\n" .
"e.g.:\n" .
" add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
" add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
" add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}
$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;
open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
# (1) Read in the lexicon.
@L = ( );
while(<L>) {
@A = split(" ", $_);
push @L, join(" ", @A);
}
# (2) Work out the count of each phone-sequence in the
# lexicon.
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
if ($pron_probs) {
$p = shift @A;
if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
}
if ($sil_probs) {
$silp = shift @A;
if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
$correction = shift @A;
if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
$correction = shift @A;
if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
}
if (!(@A)) {
die "Bad lexicon line $1, no phone in phone list";
}
$count{join(" ",@A)}++;
}
# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
foreach $l (@L) {
@A = split(" ", $l);
shift @A; # Remove word.
if ($pron_probs) { shift @A; } # remove pron-prob.
if ($sil_probs) {
shift @A; # Remove silprob
shift @A; # Remove silprob
}
while(@A > 0) {
pop @A; # Remove last phone
$issubseq{join(" ",@A)} = 1;
}
}
# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.
open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;
foreach $l (@L) {
@A = split(" ", $l);
$word = shift @A;
if ($pron_probs) {
$pron_prob = shift @A;
}
if ($sil_probs) {
$sil_word_prob = shift @A;
$word_sil_correction = shift @A;
$prev_nonsil_correction = shift @A;
}
$phnseq = join(" ", @A);
if (!defined $issubseq{$phnseq}
&& $count{$phnseq} == 1) {
; # Do nothing.
} else {
if ($phnseq eq "") { # need disambig symbols for the empty string
# that are not used anywhere else.
$max_disambig++;
$reserved_for_the_empty_string{$max_disambig} = 1;
$phnseq = "#$max_disambig";
} else {
$cur_disambig = $last_used_disambig_symbol_of{$phnseq};
if (!defined $cur_disambig) {
$cur_disambig = $first_allowed_disambig;
} else {
$cur_disambig++; # Get a number that has not been used yet for
# this phone sequence.
}
while (defined $reserved_for_the_empty_string{$cur_disambig}) {
$cur_disambig++;
}
if ($cur_disambig > $max_disambig) {
$max_disambig = $cur_disambig;
}
$last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
$phnseq = $phnseq . " #" . $cur_disambig;
}
}
if ($pron_probs) {
if ($sil_probs) {
print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
} else {
print O "$word\t$pron_prob\t$phnseq\n";
}
} else {
print O "$word\t$phnseq\n";
}
}
print $max_disambig . "\n";
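To make the disambiguation concrete, here is a hypothetical toy run (words, phones and paths are invented). A and B share a pronunciation, which is also a prefix of C's, so A and B receive symbols while C does not:

cat > /tmp/lexicon.txt <<EOF
A a b
B a b
C a b c
EOF
utils/fst/add_lex_disambig.pl /tmp/lexicon.txt /tmp/lexicon_disambig.txt
# prints: 2  (the largest-numbered disambig symbol used)
cat /tmp/lexicon_disambig.txt   # fields are tab-separated
# A  a b #1
# B  a b #2
# C  a b c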

@@ -0,0 +1,88 @@
#!/bin/bash
# Copyright 2015 Yajie Miao (Carnegie Mellon University)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.
set -eo pipefail
. utils/parse_options.sh
if [ $# -ne 3 ]; then
echo "usage: utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
echo "e.g.: utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
echo "<dict-src-dir> should contain the following files:"
echo "lexicon.txt lexicon_numbers.txt units.txt"
echo "options: "
exit 1;
fi
srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir
[ -f path.sh ] && . ./path.sh
cp $srcdir/units.txt $dir
# Add probabilities to lexicon entries. There is in fact no point in doing this here, since all the entries are 1.0.
# But utils/fst/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;
# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
# default first disambiguation is #1
ndisambig=`utils/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
# add #0 (#0 reserved for symbol in grammar).
ndisambig=$[$ndisambig+1];
( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>,
# the actual model unit, and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
# ctc_token_fst_corrected is too big and too slow for character-based Chinese modeling,
# so we just use the simple ctc_token_fst here.
utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
# Encode the words with indices. Will be used in lexicon and language model FST compiling.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk '
BEGIN {
print "<eps> 0";
}
{
printf("%s %d\n", $1, NR);
}
END {
printf("#0 %d\n", NR+1);
printf("<s> %d\n", NR+2);
printf("</s> %d\n", NR+3);
}' > $dir/words.txt || exit 1;
# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
utils/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
echo "Lexicon and Token FSTs compiling succeeded"

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
import argparse


def main(args):
    """Token Transducer"""
    # <eps> entry
    print('0 1 <eps> <eps>')
    # skip beginning and ending <blank>
    print('1 1 <blank> <eps>')
    print('2 2 <blank> <eps>')
    # <eps> exit
    print('2 0 <eps> <eps>')

    # linking `token` between node 1 and node 2
    with open(args.token_file, 'r') as fin:
        node = 3
        for entry in fin:
            fields = entry.strip().split(' ')
            phone = fields[0]
            if phone == '<eps>' or phone == '<blank>':
                continue
            elif '#' in phone:
                # disambiguation symbol; a `token` may end with one
                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
            else:
                # eating `token`
                print('{} {} {} {}'.format(1, node, phone, phone))
                # remove repeating `token`
                print('{} {} {} {}'.format(node, node, phone, '<eps>'))
                # leaving `token`
                print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
                node += 1

    # final node
    print('0')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='FST: CTC Token FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambig)')
    args = parser.parse_args()
    main(args)
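A hypothetical toy run, with a single real unit `a` and one disambiguation symbol, shows the blank-skipping and self-loop arcs this script emits:

cat > /tmp/tokens.txt <<EOF
<eps> 0
<blank> 1
a 2
#0 3
EOF
utils/fst/ctc_token_fst.py --token_file /tmp/tokens.txt
# 0 1 <eps> <eps>
# 1 1 <blank> <eps>
# 2 2 <blank> <eps>
# 2 0 <eps> <eps>
# 1 3 a a
# 3 3 a <eps>
# 3 2 <eps> <eps>
# 0 0 <eps> #0
# 0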

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
import argparse


def il(n):
    """ilabel"""
    return n + 1


def ol(n):
    """olabel"""
    return n + 1


def s(n):
    """state"""
    return n


def main(args):
    with open(args.token_file) as f:
        lines = f.readlines()

    # token count w/o <blank> and <eps>
    phone_count = 0
    disambig_count = 0
    for line in lines:
        sp = line.strip().split()
        phone = sp[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        if phone.startswith('#'):
            disambig_count += 1
        else:
            phone_count += 1

    # 1. add start state
    # first token is <blank>:0
    print('0 0 {} 0'.format(il(0)))

    # 2. 0 -> i, i -> i, i -> 0
    # non-blank tokens start from 1
    for i in range(1, phone_count + 1):
        # eating `token`
        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
        # remove repeating `token`
        print('{} {} {} 0'.format(s(i), s(i), il(i)))
        # skip ending <blank> `token`
        print('{} 0 {} 0'.format(s(i), il(0)))

    # 3. i -> other phone
    # non-blank token to other non-blank tokens
    for i in range(1, phone_count + 1):
        for j in range(1, phone_count + 1):
            if i != j:
                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))

    # 4. add disambiguation arcs on every final state
    # blank and non-blank tokens may end with a disambiguation `token`
    for i in range(0, phone_count + 1):
        for j in range(phone_count + 2, phone_count + disambig_count + 2):
            print('{} {} {} {}'.format(s(i), s(i), 0, j))

    # 5. every i is a final state
    # blank and non-blank `token` states are final states
    for i in range(0, phone_count + 1):
        print(s(i))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='FST: CTC Token unfold FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambig)')
    args = parser.parse_args()
    main(args)
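This unfolded variant differs from the simple transducer above mainly in step 3, which emits an arc for every ordered pair of distinct units, so the graph grows quadratically with the unit inventory. A back-of-the-envelope sketch (the vocabulary size is an assumption):

python3 - <<'EOF'
N = 4000             # assumed size of a Chinese char vocabulary
print(N * (N - 1))   # ~16M cross arcs from step 3 alone
EOF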

@@ -0,0 +1,29 @@
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# 2015 Guoguo Chen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script replaces epsilon with #0 on the input side only, of the G.fst
# acceptor.
while(<>){
if (/\s+#0\s+/) {
print STDERR "$0: ERROR: LM has word #0, " .
"which is reserved as disambiguation symbol\n";
exit 1;
}
s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
print;
}
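A one-line illustration (printed-FST arc fields: src dst ilabel olabel [weight]):

echo '0 1 <eps> the 0.5' | utils/fst/eps2disambig.pl
# 0 1 #0 the 0.5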

@@ -0,0 +1,154 @@
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation
# 2013 Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
$pron_probs = 0;
if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
$pron_probs = 1;
shift @ARGV;
}
if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
print STDERR " word phone1 phone2 ... phoneN;\n";
print STDERR "if the --pron-probs option is used, each line is:\n";
print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
print STDERR "this is your responsibility.\n\n";
print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
exit(1);
}
$lexfn = shift @ARGV;
if (@ARGV == 0) {
$silprob = 0.0;
} elsif (@ARGV == 2) {
($silprob,$silphone) = @ARGV;
} else {
($silprob,$silphone,$sildisambig) = @ARGV;
}
if ($silprob != 0.0) {
$silprob < 1.0 || die "Sil prob cannot be >= 1.0";
$silcost = -log($silprob);
$nosilcost = -log(1.0 - $silprob);
}
open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
$loopstate = 0;
$nextstate = 1; # next unallocated state.
while (<L>) {
@A = split(" ", $_);
@A == 0 && die "Empty lexicon line.";
foreach $a (@A) {
if ($a eq "<eps>") {
die "Bad lexicon line $_ (<eps> is forbidden)";
}
}
$w = shift @A;
if (! $pron_probs) {
$pron_cost = 0.0;
} else {
$pron_prob = shift @A;
if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
die "Bad pronunciation probability in line $_";
}
$pron_cost = -log($pron_prob);
}
if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if (@A > 0) {
$ns = $nextstate++;
} else {
$ns = $loopstate;
}
print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
$word_or_eps = "<eps>";
$pron_cost_string = ""; # so we only print it on the first arc of the word.
$s = $ns;
}
}
print "$loopstate\t0\n"; # final-cost.
} else { # have silence probs.
$startstate = 0;
$loopstate = 1;
$silstate = 2; # state from where we go to loopstate after emitting silence.
print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
if (!defined $sildisambig) {
print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
$nextstate = 3;
} else {
$disambigstate = 3;
$nextstate = 4;
print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
}
while (<L>) {
@A = split(" ", $_);
$w = shift @A;
if (! $pron_probs) {
$pron_cost = 0.0;
} else {
$pron_prob = shift @A;
if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
die "Bad pronunciation probability in line $_";
}
$pron_cost = -log($pron_prob);
}
if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
$s = $loopstate;
$word_or_eps = $w;
while (@A > 0) {
$p = shift @A;
if (@A > 0) {
$ns = $nextstate++;
print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
$word_or_eps = "<eps>";
$pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
$s = $ns;
} elsif (!defined($silphone) || $p ne $silphone) {
# This is non-deterministic but relatively compact,
# and avoids epsilons.
$local_nosilcost = $nosilcost + $pron_cost;
$local_silcost = $silcost + $pron_cost;
print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
} else {
# no point putting opt-sil after silence word.
print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
}
}
}
print "$loopstate\t0\n"; # final-cost.
}
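A toy run without optional silence (one invented entry) shows the loop-state construction; output fields are tab-separated:

echo 'ab a b' > /tmp/lexicon.txt
utils/fst/make_lexicon_fst.pl /tmp/lexicon.txt
# 0  1  a  ab
# 1  0  b  <eps>
# 0  0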

@@ -0,0 +1,49 @@
#!/bin/bash
if [ -f path.sh ]; then . path.sh; fi
lm_dir=$1
src_lang=$2
tgt_lang=$3
arpa_lm=${lm_dir}/lm.arpa
[ ! -f $arpa_lm ] && { echo "No such file $arpa_lm"; exit 1;}
rm -rf $tgt_lang
cp -r $src_lang $tgt_lang
# Compose the language model into an FST.
# grep -i / --ignore-case: ignore case distinctions when matching.
# grep -v / --invert-match: select non-matching lines.
# arpa2fst: remove the embedded symbols from the FST
# arpa2fst: make sure there are no out-of-vocabulary words in the language model
# arpa2fst: remove "illegal" sequences of the start and end-of-sentence symbols
# eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
# s2eps.pl: replace <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
# In G.fst the disambiguation symbol #0 only appears on the input side.
# eps2disambig.pl and s2eps.pl are applied mainly for the following `fstrmepsilon`.
cat $arpa_lm | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
grep -v -i '<unk>' | \
grep -v -i '<spoken_noise>' | \
arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
utils/fst/eps2disambig.pl | utils/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
--osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic $tgt_lang/G.fst
# Compose the token, lexicon and language-model FSTs into the final decoding graph.
# minimization: the same minimization algorithm that applies to weighted acceptors;
# the only change relevant here is that it avoids pushing weights,
# hence preserving stochasticity.
fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;
echo "Composing decoding graph TLG.fst succeeded"
#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
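Once built, the graph can be inspected with the same OpenFst/Kaldi tools used above, e.g.:

fstinfo data/lang_test/TLG.fst | head   # states/arcs summary
fstisstochastic data/lang_test/TLG.fst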

@@ -0,0 +1,88 @@
#!/usr/bin/env python3
import argparse


def main(args):
    # load `unit` or `vocab` file
    unit_table = set()
    with open(args.unit_file, 'r') as fin:
        for line in fin:
            unit = line.strip()
            unit_table.add(unit)

    def contain_oov(units):
        for unit in units:
            if unit not in unit_table:
                return True
        return False

    # load spm model
    bpemode = args.bpemodel
    if bpemode:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpemodel)

    # used to filter polyphones
    lexicon_table = set()
    with open(args.in_lexicon, 'r') as fin, \
            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in the bpemodel
                continue
            elif word == '<SPOKEN_NOISE>':
                continue
            else:
                # each word only has one pronunciation for the e2e system
                if word in lexicon_table:
                    continue

                if bpemode:
                    pieces = sp.EncodeAsPieces(word)
                    if contain_oov(pieces):
                        print('Ignoring word {}, which contains an oov unit'.
                              format(word))
                        continue
                    chars = ' '.join(
                        [p if p in unit_table else '<unk>' for p in pieces])
                else:
                    # ignore words with OOV units
                    if contain_oov(word):
                        print('Ignoring word {}, which contains an oov unit'.
                              format(word))
                        continue

                    # Optional: prepend ▁ to English words;
                    # we assume the model unit of our e2e system is char for now.
                    if word.encode('utf8').isalpha() and '▁' in unit_table:
                        word = '▁' + word
                    chars = ' '.join(word)  # word is a char list

                fout.write('{} {}\n'.format(word, chars))
                lexicon_table.add(word)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='FST: prepare e2e(char/spm) dict')
    parser.add_argument(
        '--unit_file',
        required=True,
        help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_piece'
    )
    parser.add_argument(
        '--in_lexicon',
        required=True,
        help='raw lexicon file. line: word ph0 ... phn')
    parser.add_argument(
        '--out_lexicon',
        required=True,
        help='output lexicon file. line: word char0 ... charn')
    parser.add_argument('--bpemodel', default=None, help='bpemodel')
    args = parser.parse_args()
    print(args)
    main(args)
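Usage matches stage 0 of local/tlg.sh; in char mode each in-vocabulary word is split into characters:

utils/fst/prepare_dict.py \
    --unit_file data/vocab.txt \
    --in_lexicon $data/resource_aishell/lexicon.txt \
    --out_lexicon data/local/dict/lexicon.txt
# in : 你好 n i3 h ao3
# out: 你好 你 好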

@@ -0,0 +1,42 @@
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script removes lines that contain these OOVs on either the
# third or fourth fields of the line. It is intended to remove arcs
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
if ( @ARGV < 1 || @ARGV > 2) {
die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}
$unklist = shift @ARGV;
open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
while(<S>){
@A = split(" ", $_);
@A == 1 || die "Bad line in unknown-symbol list: $_";
$unk{$A[0]} = 1;
}
$num_removed = 0;
while(<>){
@A = split(" ", $_);
if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
$num_removed++;
} else {
print;
}
}
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";

@@ -0,0 +1,36 @@
#!/usr/bin/env python3
import argparse


def main(args):
    # skip <blank> `token`
    print('0 0 <blank> <eps>')

    with open(args.token_file, 'r') as fin:
        for entry in fin:
            fields = entry.strip().split(' ')
            phone = fields[0]
            if phone == '<eps>' or phone == '<blank>':
                continue
            elif '#' in phone:
                # disambiguation symbol; a `token` may end with one
                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
            else:
                # eating `token`
                print('{} {} {} {}'.format(0, 0, phone, phone))

    # final state
    print('0')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='FST: RNN-T Token FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambig)')
    args = parser.parse_args()
    main(args)

@@ -0,0 +1,27 @@
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script replaces <s> and </s> with <eps> (on both input and output sides),
# for the G.fst acceptor.
while(<>){
@A = split(" ", $_);
if ( @A >= 4 ) {
if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
}
print join("\t", @A) . "\n";
}
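For example:

echo '0 1 <s> <s> 2.3' | utils/fst/s2eps.pl
# 0  1  <eps>  <eps>  2.3   (tab-separated)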

@@ -0,0 +1,64 @@
#!/usr/bin/env python3
"""Manifest file to key-value files."""
import argparse
import functools
from pathlib import Path

from utils.utility import add_arguments
from utils.utility import print_arguments
from utils.utility import read_manifest


def main(args):
    print_arguments(args, globals())

    count = 0

    outdir = Path(args.output_path)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # .wav
            text = line_json['text']
            feat_shape = line_json['feat_shape']
            dur = feat_shape[0]
            feat_dim = feat_shape[1]
            if 'token' in line_json:
                tokens = line_json['token']
                tokenids = line_json['token_id']
                token_shape = line_json['token_shape']
                token_len = token_shape[0]
                vocab_dim = token_shape[1]

            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of manifest to compute normalizer's mean and stddev.")
    add_arg('output_path', str,
            'data/train',
            "dir path to dump wav.scp/duration/text files.")
    # yapf: enable
    args = parser.parse_args()

    main(args)
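Usage matches stage 1 of local/tlg.sh:

python3 utils/manifest_key_value.py \
    --manifest_path data/manifest.train \
    --output_path data/train
# dumps data/train/wav.scp, data/train/duration and data/train/text,
# one "utt value" pair per line.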

@@ -22,7 +22,7 @@ lmbin=${2}.klm.bin
 # https://kheafield.com/code/kenlm/estimation/
 echo "build arpa lm."
-lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} >${arpa} || { echo "train kenlm error!"; exit -1; }
+lmplz -o ${order} -S ${mem} --prune ${prune} < ${text} > ${arpa} || { echo "train kenlm error!"; exit -1; }
 # https://kheafield.com/code/kenlm/
 echo "build binary lm."

@@ -11,19 +11,95 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import distutils.util
+import hashlib
+import json
 import os
 import tarfile
 import zipfile
 from typing import Text

-from paddle.dataset.common import md5file
-
 __all__ = [
     "check_md5sum", "getfile_insensitive", "download_multi", "download",
-    "unpack", "unzip"
+    "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
+    "read_manifest"
 ]


+def read_manifest(manifest_path):
+    """Load and parse manifest file.
+
+    Args:
+        manifest_path ([type]): Manifest file to load and parse.
+
+    Raises:
+        IOError: If failed to parse the manifest.
+
+    Returns:
+        List[dict]: Manifest parsing results.
+    """
+    manifest = []
+    for json_line in open(manifest_path, 'r'):
+        try:
+            json_data = json.loads(json_line)
+            manifest.append(json_data)  # without this the result is always empty
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+    return manifest
+
+
+def print_arguments(args, info=None):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="Jonh", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    filename = ""
+    if info:
+        filename = info["__file__"]
+    filename = os.path.basename(filename)
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "Jonh", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    with open(fname, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+
 def getfile_insensitive(path):
     """Get the actual file path when given insensitive filename."""
     directory, filename = os.path.split(path)
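A minimal sketch of the two argparse helpers above, using the import path seen in utils/manifest_key_value.py:

python3 - <<'EOF'
import argparse
import functools
from utils.utility import add_arguments, print_arguments

parser = argparse.ArgumentParser()
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('order', int, 3, "n-gram order.")
args = parser.parse_args([])
print_arguments(args)   # prints a "Configuration Arguments" table
EOF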
