commit
fd8a4ec179
@ -0,0 +1,58 @@
|
||||
#!/bin/bash

# Trains a 3-gram language model with SRILM from the AISHELL transcripts.
#
# To be run from one directory above this script.
# This script takes no arguments.  It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
#   data/local/lm/text
#   data/local/dict/lexicon.txt
# and writes text.no_oov, word.counts, unigram.counts, wordlist, heldout,
# train and lm.arpa under data/local/lm.

. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

# Both inputs must exist before anything is written.
for f in "$text" "$lexicon"; do
  if [ ! -f "$f" ]; then
    echo "$0: No such file $f"
    exit 1
  fi
done

# Check SRILM tools
if ! command -v ngram-count > /dev/null; then
  echo "srilm tools are not found, please download it and install it from: "
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

dir=data/local/lm
mkdir -p "$dir"

cleantext=$dir/text.no_oov

# Replace every field that is not a lexicon word with <SPOKEN_NOISE>.  The
# first field goes through the same mapping; all later steps start at
# field 2 and therefore ignore it.
awk -v lex="$lexicon" 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  "$text" > "$cleantext" || exit 1;

# Word frequencies of the cleaned transcripts, most frequent first.
awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | sort | uniq -c | \
  sort -nr > "$dir/word.counts" || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | \
  cat - <(grep -w -v '!SIL' "$lexicon" | awk '{print $1}') | \
  sort | uniq -c | sort -nr > "$dir/unigram.counts" || exit 1;

# LM vocabulary: every counted word plus the sentence boundary symbols.
awk '{print $2}' "$dir/unigram.counts" | cat - <(echo "<s>"; echo "</s>" ) > "$dir/wordlist"

heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results

# Prints fields 2..NF separated by single spaces.  "%s" keeps a word that
# contains "%" from being interpreted as a printf format.
strip_first_field='{for(n=2;n<=NF;n++){ printf("%s", $n); if(n<NF) printf " "; else print ""; }}'

# The first $heldout_sent sentences are held out; training starts at the
# sentence after them so the two sets do not share a sentence.
awk "$strip_first_field" "$cleantext" | \
  head -n "$heldout_sent" > "$dir/heldout"
awk "$strip_first_field" "$cleantext" | \
  tail -n +"$((heldout_sent + 1))" > "$dir/train"

ngram-count -text "$dir/train" -order 3 -limit-vocab -vocab "$dir/wordlist" -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm "$dir/lm.arpa"
# Report the perplexity of the trained model on the held-out sentences.
ngram -lm "$dir/lm.arpa" -ppl "$dir/heldout"
|
@ -0,0 +1,52 @@
|
||||
#!/bin/bash

# Builds the TLG decoding graph for AISHELL in three stages:
#   stage 0: prepare the dictionary (units + lexicon)
#   stage 1: train the language model (SRILM recipe when lmtype is "srilm",
#            utils/ngram_train.sh otherwise)
#   stage 2: compile the lexicon/token FSTs and build the TLG
# Reads MAIN_ROOT from the environment to locate the corpus.
set -eo pipefail

stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

# NOTE(review): presumably lets --stage/--stop_stage/--corpus/--lmtype
# override the defaults above; confirm against utils/parse_options.sh.
source utils/parse_options.sh

data=${MAIN_ROOT}/examples/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp "$unit_file" data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file "$unit_file" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p "$lm"
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    # Keep only the transcript lines whose first field appears in
    # data/train/text.
    utils/filter_scp.pl data/train/text \
        "$text" > "$lm/text"
    # Quoted so that an empty lmtype selects the else branch instead of
    # producing a test syntax error.
    if [ "$lmtype" = 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 "$lm/text" "$lm/lm.arpa"
    fi
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "Aishell build TLG done."
exit 0
|
@ -0,0 +1 @@
|
||||
../../../utils
|
@ -1,4 +1,5 @@
|
||||
data_aishell*
|
||||
*.meta
|
||||
manifest.*
|
||||
*.tgz
|
||||
*.tgz
|
||||
resource_aishell
|
||||
|
@ -0,0 +1,9 @@
|
||||
#!/bin/bash

# Installs the KenLM build dependencies with apt, registers gcc-5/g++-5 with
# update-alternatives, fetches the KenLM sources when no ./kenlm directory
# exists, then configures, builds and installs KenLM from kenlm/build.
# Each step is best-effort: a failing step does not stop the later ones,
# except inside the individual "&&" chains.

build_deps=(
  build-essential cmake libboost-system-dev libboost-thread-dev
  libboost-program-options-dev libboost-test-dev libeigen3-dev
  zlib1g-dev libbz2-dev liblzma-dev
)
apt install -y "${build_deps[@]}"

apt-get install -y gcc-5 g++-5 \
  && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 \
  && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-5 50

if [ ! -d kenlm ]; then
  wget -O - https://kheafield.com/code/kenlm.tar.gz | tar xz
fi

# Always rebuild from an empty build directory.
rm -rf kenlm/build \
  && mkdir -p kenlm/build \
  && cd kenlm/build \
  && cmake .. \
  && make -j4 \
  && make install
|
@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env bash

# Fetches (from $DOWNLOAD_DIR when that directory exists, otherwise with
# $WGET), builds and installs libLBFGS $VER inside the current directory,
# then appends the LIBLBFGS and LD_LIBRARY_PATH exports to ./env.sh unless
# env.sh already defines LIBLBFGS.
# Exits non-zero when fetching, unpacking, configuring or building fails, so
# that callers can rely on the exit status.

VER=1.10

WGET=${WGET:-wget}

if [ ! -f "liblbfgs-$VER.tar.gz" ]; then
  if [ -d "$DOWNLOAD_DIR" ]; then
    cp -p "$DOWNLOAD_DIR/liblbfgs-$VER.tar.gz" . || exit 1
  else
    # $WGET is left unquoted so that it may carry extra options.
    $WGET "https://github.com/downloads/chokkan/liblbfgs/liblbfgs-$VER.tar.gz" || exit 1
  fi
fi

tar -xzf "liblbfgs-$VER.tar.gz" || exit 1
# Stop here if the directory is missing: configuring and "cd .." from the
# wrong place would act on the parent directories instead.
cd "liblbfgs-$VER" || exit 1
./configure --prefix="$(pwd)" || exit 1
make || exit 1
# due to the liblbfgs project directory structure, we have to use -i
# but the errors are completely harmless
make -i install
cd ..

# Everything printed to stdout inside this subshell is appended to env.sh;
# diagnostics go to stderr.
(
  if [ -n "${LIBLBFGS}" ]; then
    echo >&2 "LIBLBFGS variable is aleady defined. Undefining..."
    unset LIBLBFGS
  fi

  [ -f ./env.sh ] && . ./env.sh

  # If sourcing env.sh defined LIBLBFGS, the exports are already there.
  if [ -n "${LIBLBFGS}" ]; then
    echo >&2 "libLBFGS config is already in env.sh"
    exit
  fi

  wd=$(pwd)
  wd=$(readlink -f "$wd" || pwd)

  echo "export LIBLBFGS=$wd/liblbfgs-$VER"
  echo export LD_LIBRARY_PATH='${LD_LIBRARY_PATH:-}':'${LIBLBFGS}'/lib/.libs
) >> env.sh
|
||||
|
@ -0,0 +1,91 @@
|
||||
#!/usr/bin/env bash

# Installs SRILM from a manually downloaded archive.  This part checks that
# the script runs from tools/, installs libLBFGS when liblbfgs-1.10 is
# missing, requires the SRILM archive and gawk, and unpacks the archive into
# ./srilm (the shell stays inside that directory afterwards).

current_path=$(pwd)
current_dir=$(basename "$current_path")

if [ "$current_dir" != "tools" ]; then
  echo "You should run this script in tools/ directory!!"
  exit 1
fi

if ! [ -d liblbfgs-1.10 ]; then
  echo Installing libLBFGS library to support MaxEnt LMs
  bash extras/install_liblbfgs.sh || exit 1
fi

# http://www.speech.sri.com/projects/srilm/download.html
# Both archive names are accepted: srilm.tgz (old format) and srilm.tar.gz
# (srilm v1.7.3 downloads as tar.gz).
if ! { [ -f srilm.tgz ] || [ -f srilm.tar.gz ]; }; then
  cat <<'EOF'
This script cannot install SRILM in a completely automatic
way because you need to put your address in a download form.
Please download SRILM from http://www.speech.sri.com/projects/srilm/download.html
put it in ./srilm.tar.gz , then run this script.
Note: You may have to rename the downloaded file to remove version name from filename eg: mv srilm-1.7.3.tar.gz srilm.tar.gz
EOF
  exit 1
fi

if ! which gawk 2>/dev/null; then
  echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install"
  exit 1
fi

mkdir -p srilm
cd srilm

# Unpack the first archive that exists; the old .tgz name takes precedence.
for archive in ../srilm.tgz ../srilm.tar.gz; do
  if [ -f "$archive" ]; then
    tar -xvzf "$archive"
    break
  fi
done
|
||||
|
||||
#######################################
# Tells whether an SRILM release is 1.7.1 or earlier and therefore needs
# extras/srilm.patch.
# The three components are compared in order (major, then minor, then
# micro), so e.g. 1.6.5 counts as earlier than 1.7.1.  Anything after the
# leading digits of a component (such as "-beta") is ignored and a missing
# component counts as 0.
# Arguments: $1 - major, $2 - minor, $3 - micro
# Returns:   0 when the version is <= 1.7.1, 1 otherwise
#######################################
srilm_needs_patch() {
  local major=${1%%[!0-9]*} minor=${2%%[!0-9]*} micro=${3%%[!0-9]*}
  # 10# forces base 10 so that a component like "08" is not read as octal.
  major=$((10#${major:-0}))
  minor=$((10#${minor:-0}))
  micro=$((10#${micro:-0}))
  if (( major != 1 )); then
    (( major < 1 ))
  elif (( minor != 7 )); then
    (( minor < 7 ))
  else
    (( micro <= 1 ))
  fi
}

if [ -f RELEASE ]; then
  major=$(awk -F. 'NR == 1 { print $1 }' RELEASE)
  minor=$(awk -F. 'NR == 1 { print $2 }' RELEASE)
  micro=$(awk -F. 'NR == 1 { print $3 }' RELEASE)

  if srilm_needs_patch "$major" "$minor" "$micro"; then
    echo "Detected version 1.7.1 or earlier. Applying patch."
    patch -p0 < ../extras/srilm.patch
  fi
else
  # Without a RELEASE file the version is unknown; leave the sources alone.
  echo >&2 "RELEASE file not found: cannot detect the SRILM version, not applying patch"
fi
|
||||
|
||||
# set the SRILM variable in the top-level Makefile to this directory.
# The directory is quoted so that a path containing whitespace is passed to
# awk as a single value.
cp Makefile tmpf
awk -v pwd="$(pwd)" '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' tmpf \
  > Makefile || exit 1
rm tmpf

# Machine-specific makefile fragment, named after sbin/machine-type's output.
mtype=$(sbin/machine-type)
machine_makefile=common/Makefile.machine.$mtype

# Enable libLBFGS and append copies of the existing ADDITIONAL_INCLUDES /
# ADDITIONAL_LDFLAGS lines extended with the libLBFGS include and library
# paths.
echo HAVE_LIBLBFGS=1 >> "$machine_makefile"
grep ADDITIONAL_INCLUDES "$machine_makefile" | \
  sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
  >> "$machine_makefile"

grep ADDITIONAL_LDFLAGS "$machine_makefile" | \
  sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
  >> "$machine_makefile"

# A bare "exit" propagates make's own failure status.
make || exit
|
||||
|
||||
cd ..

# Append the SRILM and PATH exports to env.sh.  Only stdout of this subshell
# ends up in env.sh; diagnostics go to stderr.
(
  if [ -n "${SRILM}" ]; then
    echo >&2 "SRILM variable is aleady defined. Undefining..."
    unset SRILM
  fi

  if [ -f ./env.sh ]; then
    . ./env.sh
  fi

  # SRILM being set after sourcing env.sh means the exports already exist.
  if [ -n "${SRILM}" ]; then
    echo >&2 "SRILM config is already in env.sh"
    exit
  fi

  wd=$(pwd)
  wd=$(readlink -f $wd || pwd)

  echo "export SRILM=$wd/srilm"

  # PATH keeps its current value and gains every directory under srilm/bin,
  # written relative to ${SRILM} so the literal variable name is stored.
  path_value="\${PATH}"
  for bin_dir in $(cd srilm && find bin -type d); do
    path_value="$path_value:\${SRILM}/$bin_dir"
  done
  echo "export PATH=$path_value"
) >> env.sh

echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
|
@ -0,0 +1,17 @@
|
||||
--- dstruct/src/Trie.orig 2016-11-08 19:53:40.524000000 +0000
|
||||
+++ dstruct/src/Trie.cc 2016-11-08 19:53:59.088000000 +0000
|
||||
@@ -200,11 +200,14 @@
|
||||
if (removedData == 0) {
|
||||
Trie<KeyT,DataT> node;
|
||||
if (sub.remove(keys[0], &node)) {
|
||||
+#if !defined(__GNUC__) || !(__GNUC__ >= 4 && __GNUC_MINOR__ >= 9 || __GNUC__ > 4)
|
||||
/*
|
||||
* XXX: Call subtrie destructor explicitly since we're not
|
||||
* passing the removed node to the caller.
|
||||
+ * !!! Triggers bug with gcc >= 4.9 optimization !!!
|
||||
*/
|
||||
node.~Trie();
|
||||
+#endif
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2012 Microsoft Corporation
|
||||
# Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# This script takes a list of utterance-ids or any file whose first field
|
||||
# of each line is an utterance-id, and filters an scp
|
||||
# file (or any file whose "n-th" field is an utterance id), printing
|
||||
# out only those lines whose "n-th" field is in id_list. The index of
|
||||
# the "n-th" field is 1, by default, but can be changed by using
|
||||
# the -f <n> switch
|
||||
|
||||
# Defaults: print the lines that ARE in the id list, keyed on field 1.
$exclude = 0;
$field = 1;

# Consume any leading "--exclude" and "-f <n>" options, in any order.
while (@ARGV > 0 && ($ARGV[0] eq "--exclude" || $ARGV[0] eq "-f")) {
  if ($ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
  } else {
    $field = $ARGV[1];
    shift @ARGV; shift @ARGV;
  }
}

# What remains must be the id list, optionally followed by the input file.
if(@ARGV < 1 || @ARGV > 2) {
  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
      "Note: only the first field of each line in id_list matters. With --exclude, prints\n" .
      "only the lines that were *not* in id_list.\n" .
      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
      "-f option, add 1 to the argument.\n" .
      "See also: utils/filter_scp.pl .\n";
}

# Remember the first field of every id-list line.
$idlist = shift @ARGV;
open(F, "<$idlist") || die "Could not open id-list file $idlist";
while(<F>) {
  my @cols = split(" ", $_);
  die "Invalid id-list file line $_" unless @cols >= 1;
  $seen{$cols[0]} = 1;
}

# Returns true when a line with the given key has to be printed: the key is
# in the id list (normal mode) or has no entry in it (--exclude).
sub should_print {
  my ($key) = @_;
  return $exclude ? !defined $seen{$key} : $seen{$key};
}

# The remaining argument (or stdin when there is none) is read through <>.
if ($field == 1) { # Treat this as special case, since it is common.
  while(<>) {
    unless ($_ =~ m/\s*(\S+)\s*/) {
      die "Bad line $_, could not get first field.";
    }
    # $1 is what we filter on.
    print $_ if should_print($1);
  }
} else {
  while(<>) {
    my @cols = split(" ", $_);
    # The line needs at least $field fields (and must not be empty).
    (@cols > 0 && @cols >= $field) || die "Invalid scp file line $_";
    print $_ if should_print($cols[$field-1]);
  }
}
|
||||
|
||||
# tests:
|
||||
# the following should print "foo 1"
|
||||
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl <(echo foo)
|
||||
# the following should print "bar 2".
|
||||
# ( echo foo 1; echo bar 2 ) | utils/filter_scp.pl -f 2 <(echo 2)
|
@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
# 2013-2016 Johns Hopkins University (author: Daniel Povey)
|
||||
# 2015 Hainan Xu
|
||||
# 2015 Guoguo Chen
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# Adds disambiguation symbols to a lexicon.
|
||||
# Outputs still in the normal lexicon format.
|
||||
# Disambig syms are numbered #1, #2, #3, etc. (#0
|
||||
# reserved for symbol in grammar).
|
||||
# Outputs the number of disambig syms to the standard output.
|
||||
# With the --pron-probs option, expects the second field
|
||||
# of each lexicon line to be a pron-prob.
|
||||
# With the --sil-probs option, expects three additional
|
||||
# fields after the pron-prob, representing various components
|
||||
# of the silence probability model.
|
||||
|
||||
# Option defaults.
$pron_probs = 0;              # set by --pron-probs
$sil_probs = 0;               # set by --sil-probs
$first_allowed_disambig = 1;  # set by --first-allowed-disambig <n>

# Parse the leading options.  At most three passes are made over the three
# recognised options; each pass consumes whichever of them is at the front.
for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    # "0 +" forces a numeric value.
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
}

# Exactly two positional arguments must remain: input and output lexicon.
if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
      "This script adds disambiguation symbols to a lexicon in order to\n" .
      "make decoding graphs determinizable; it adds pseudo-phone\n" .
      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
      "to ensure that all pronunciations are different, and that none\n" .
      "is a prefix of another.\n" .
      "It prints to the standard output the number of the largest-numbered " .
      "disambiguation symbol that was used.\n" .
      "\n" .
      "Options: --pron-probs Expect pronunciation probabilities in the 2nd field\n" .
      " --sil-probs [should be with --pron-probs option]\n" .
      " Expect 3 extra fields after the pron-probs, for aspects of\n" .
      " the silence probability model\n" .
      " --first-allowed-disambig <n> The number of the first disambiguation symbol\n" .
      " that this script is allowed to add. By default this is\n" .
      " #1, but you can set this to a larger value using this option.\n" .
      "e.g.:\n" .
      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}
|
||||
|
||||
|
||||
# Positional arguments: input lexicon, then output lexicon.
($lexfn, $lexoutfn) = (shift @ARGV, shift @ARGV);

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1) Read in the lexicon.
# Every line is stored with its fields re-joined by single spaces.
@L = map { join(" ", split(" ", $_)) } <L>;
|
||||
|
||||
# (2) Work out the count of each phone-sequence in the
# lexicon.
# %count maps a space-joined phone sequence to the number of lexicon entries
# that use it.  The optional probability fields are validated on the way.

foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) {
    $p = shift @A;
    if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
  }
  if ($sil_probs) {
    # Three silence-model fields follow: one probability in (0, 1] and two
    # strictly positive correction terms.
    $silp = shift @A;
    if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
    $correction = shift @A;
    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
    $correction = shift @A;
    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
  }
  if (!(@A)) {
    # Report the offending lexicon line itself.
    die "Bad lexicon line $l, no phone in phone list";
  }
  $count{join(" ",@A)}++;
}
|
||||
|
||||
# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
# %issubseq is keyed by the space-joined proper prefixes (including the empty
# one) of every phone sequence.

foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) { shift @A; } # remove pron-prob.
  if ($sil_probs) {
    # All three silence-model fields have to go, otherwise the last of them
    # would be treated as the first phone and no real prefix would be stored.
    shift @A; # Remove silprob
    shift @A; # Remove first correction term
    shift @A; # Remove second correction term
  }
  while(@A > 0) {
    pop @A; # Remove last phone
    $issubseq{join(" ",@A)} = 1;
  }
}
|
||||
|
||||
# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.

open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A;
  }
  $phnseq = join(" ", @A);

  # A symbol is needed when the sequence is a prefix of another one or is
  # shared by several entries.
  if (defined $issubseq{$phnseq} || $count{$phnseq} != 1) {
    if ($phnseq eq "") { # need disambig symbols for the empty string
                         # that are not used anywhere else.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++; # Get a number that has not been used yet for
                         # this phone sequence.
      }
      # Skip the numbers handed out to empty phone sequences.
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }

  # Output columns are tab-separated.  The silence-model fields are written
  # only together with the pron-prob.
  my @cols = ($word);
  if ($pron_probs) {
    push @cols, $pron_prob;
    if ($sil_probs) {
      push @cols, $sil_word_prob, $word_sil_correction, $prev_nonsil_correction;
    }
  }
  push @cols, $phnseq;
  print O join("\t", @cols) . "\n";
}

# Make sure buffered output reached the file before reporting success.
close(O) || die "Error closing lexicon file $lexoutfn: $!\n";

print $max_disambig . "\n";
|
@ -0,0 +1,88 @@
|
||||
#!/bin/bash
|
||||
# Copyright 2015 Yajie Miao (Carnegie Mellon University)
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.
#
# Arguments: <dict-src-dir> <tmp-dir> <lang-dir>
# Reads:     <dict-src-dir>/lexicon.txt and <dict-src-dir>/units.txt
# Writes:    <lang-dir>/{units.txt,tokens.txt,words.txt,T.fst,L.fst} and
#            intermediate files under <tmp-dir>
set -eo pipefail
. utils/parse_options.sh

if [ $# -ne 3 ]; then
  echo "usage: utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt lexicon_numbers.txt units.txt"
  echo "options: "
  exit 1;
fi

srcdir=$1
tmpdir=$2
dir=$3
mkdir -p "$dir" "$tmpdir"

[ -f path.sh ] && . ./path.sh

cp "$srcdir/units.txt" "$dir"

# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < "$srcdir/lexicon.txt" > "$tmpdir/lexiconp.txt" || exit 1;

# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
# default first disambiguation is #1
# NOTE(review): add_lex_disambig.pl is called without --pron-probs, so it
# handles the 1.0 column like a leading phone shared by every entry.
ndisambig=$(utils/fst/add_lex_disambig.pl "$tmpdir/lexiconp.txt" "$tmpdir/lexiconp_disambig.txt")
# add #0 (#0 reserved for symbol in grammar).
ndisambig=$((ndisambig + 1))

for ((n = 0; n <= ndisambig; n++)); do
  echo "#$n"
done > "$tmpdir/disambig.list"

# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blank>,
# the actual model unit, and the disambiguation symbols.
# Ids are assigned in that order, starting from 0.
awk '{print $1}' "$srcdir/units.txt" > "$tmpdir/units.list"
(echo '<eps>';) | cat - "$tmpdir/units.list" "$tmpdir/disambig.list" | awk '{print $1 " " (NR-1)}' > "$dir/tokens.txt"

# ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
# so here just use simple ctc_token_fst
utils/fst/ctc_token_fst.py --token_file "$dir/tokens.txt" | \
  fstcompile --isymbols="$dir/tokens.txt" --osymbols="$dir/tokens.txt" --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > "$dir/T.fst" || exit 1;

# Encode the words with indices. Will be used in lexicon and language model FST compiling.
# sort -u keeps one line per word, so a word that occurs on several lexicon
# lines still receives a single id.
awk '{print $1}' "$tmpdir/lexiconp.txt" | sort -u | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > "$dir/words.txt" || exit 1;

# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
# The ids of "#0" are looked up by exact match on the symbol column.
token_disambig_symbol=$(awk '$1 == "#0" {print $2}' "$dir/tokens.txt")
word_disambig_symbol=$(awk '$1 == "#0" {print $2}' "$dir/words.txt")

utils/fst/make_lexicon_fst.pl --pron-probs "$tmpdir/lexiconp_disambig.txt" 0 "sil" "#$ndisambig" | \
  fstcompile --isymbols="$dir/tokens.txt" --osymbols="$dir/words.txt" \
    --keep_isymbols=false --keep_osymbols=false | \
  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
  fstarcsort --sort_type=olabel > "$dir/L.fst" || exit 1;

echo "Lexicon and Token FSTs compiling succeeded"
|
@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
|
||||
def main(args):
    """Print the CTC token transducer as text arcs on stdout.

    Each arc line is ``src dst input output``; the last line names the
    final state.  States 0, 1 and 2 are fixed, and every regular token
    gets a state of its own between states 1 and 2.

    Args:
        args: namespace with ``token_file``, the path of a UTF-8 text file
            whose lines start with the token symbol.  ``<eps>``,
            ``<blank>`` and blank lines are skipped; symbols containing
            ``#`` are treated as disambiguation symbols.
    """
    # <eps> entry
    print('0 1 <eps> <eps>')
    # skip beginning and ending <blank>
    print('1 1 <blank> <eps>')
    print('2 2 <blank> <eps>')
    # <eps> exit
    print('2 0 <eps> <eps>')

    # linking `token` between node 1 and node 2
    # The encoding is explicit so that non-ASCII tokens do not depend on the
    # locale.
    with open(args.token_file, 'r', encoding='utf8') as fin:
        node = 3
        for entry in fin:
            fields = entry.strip().split(' ')
            phone = fields[0]
            if not phone:
                # A blank line would otherwise produce arcs with an empty
                # label.
                continue
            if phone == '<eps>' or phone == '<blank>':
                continue
            elif '#' in phone:
                # disambiguation symbol
                # `token` may end with a disambiguation symbol
                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
            else:
                # eating `token`
                print('{} {} {} {}'.format(1, node, phone, phone))
                # remove repeating `token`
                print('{} {} {} {}'.format(node, node, phone, '<eps>'))
                # leaving `token`
                print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
            # NOTE(review): advanced for every emitted entry, disambiguation
            # symbols included; confirm this is the intended numbering.
            node += 1
    # Final node
    print('0')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command line: --token_file <path> (mandatory).
    arg_parser = argparse.ArgumentParser(
        description='FST: CTC Token FST transducer')
    arg_parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')

    main(arg_parser.parse_args())
|
@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
|
||||
def il(n):
    """Return the input label for token index ``n``, i.e. ``n + 1``."""
    label = n + 1
    return label
|
||||
|
||||
|
||||
def ol(n):
    """Return the output label for token index ``n``, i.e. ``n + 1``."""
    label = n + 1
    return label
|
||||
|
||||
|
||||
def s(n):
    """Return the state id for token index ``n`` (the index itself)."""
    state = n
    return state
|
||||
|
||||
|
||||
def main(args):
    """Print the unfolded CTC token transducer as numeric text arcs.

    Each arc line is ``src dst ilabel olabel``; the trailing lines list the
    final states.  State 0 stands for ``<blank>`` and state ``i`` for the
    i-th regular token.

    Args:
        args: namespace with ``token_file``, the path of a UTF-8 text file
            whose lines start with the token symbol.  Blank lines,
            ``<eps>`` and ``<blank>`` are not counted; symbols starting
            with ``#`` are counted as disambiguation symbols.

    NOTE(review): the numeric labels assume the file lists ``<eps>``,
    ``<blank>``, the regular tokens and then the disambiguation symbols in
    that order; confirm against the script that writes the token file.
    """
    # The encoding is explicit so that non-ASCII tokens do not depend on the
    # locale.
    with open(args.token_file, encoding='utf8') as f:
        lines = f.readlines()
    # token count w/o <blank> <eps>
    phone_count = 0
    disambig_count = 0
    for line in lines:
        sp = line.strip().split()
        if not sp:
            # Blank lines carry no token.
            continue
        phone = sp[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        if phone.startswith('#'):
            disambig_count += 1
        else:
            phone_count += 1

    # 1. add start state
    # first token is <blank>:0
    print('0 0 {} 0'.format(il(0)))

    # 2. 0 -> i, i -> i, i -> 0
    # non-blank token start from 1
    for i in range(1, phone_count + 1):
        # eating `token`
        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
        # remove repeating `token`
        print('{} {} {} 0'.format(s(i), s(i), il(i)))
        # skip ending <blank> `token`
        print('{} 0 {} 0'.format(s(i), il(0)))

    # 3. i -> other phone
    # non-blank token to other non-blank token
    for i in range(1, phone_count + 1):
        for j in range(1, phone_count + 1):
            if i != j:
                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))

    # 4. add disambiguation arcs on every final state
    # blank and non-blank token may end with a disambiguation `token`;
    # their labels start right after the regular tokens.
    for i in range(0, phone_count + 1):
        for j in range(phone_count + 2, phone_count + disambig_count + 2):
            print('{} {} {} {}'.format(s(i), s(i), 0, j))

    # 5. every i is final state
    # blank and non-blank `token` are final state
    for i in range(0, phone_count + 1):
        print(s(i))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line: --token_file <path> (mandatory).
    arg_parser = argparse.ArgumentParser(
        description='FST: CTC Token unfold FST transducer')
    arg_parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')

    main(arg_parser.parse_args())
|
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
# 2015 Guoguo Chen
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This script replaces epsilon with #0 on the input side only, of the G.fst
|
||||
# acceptor.
|
||||
|
||||
# Copies every input line to stdout, rewriting an <eps> that follows the two
# leading numeric fields into #0.  Stops with exit status 1 as soon as a line
# already contains #0 as a separate field.
while (my $arc = <>) {
  if ($arc =~ /\s+#0\s+/) {
    print STDERR "$0: ERROR: LM has word #0, " .
                 "which is reserved as disambiguation symbol\n";
    exit 1;
  }
  $arc =~ s:^(\d+\s+\d+\s+)\<eps\>(\s+):${1}#0${2}:;
  print $arc;
}
|
@ -0,0 +1,154 @@
|
||||
#!/usr/bin/env perl
|
||||
use warnings; #sed replacement for -w perl parameter
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
# 2013 Johns Hopkins University (author: Daniel Povey)
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
|
||||
|
||||
# Set to 1 when --pron-probs is given: each lexicon line then carries a
# pronunciation probability between the word and its phones.
$pron_probs = 0;

if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
  $pron_probs = 1;
  shift @ARGV;
}

# Accepted argument counts: lexicon | lexicon silprob silphone |
# lexicon silprob silphone sil_disambig_sym.
if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
  print STDERR " word phone1 phone2 ... phoneN;\n";
  print STDERR "if the --pron-probs option is used, each line is:\n";
  print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
  print STDERR "this is your responsibility.\n\n";
  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
  exit(1);
}

$lexfn = shift @ARGV;
if (@ARGV == 0) {
  $silprob = 0.0;
} elsif (@ARGV == 2) {
  ($silprob,$silphone) = @ARGV;
} else {
  ($silprob,$silphone,$sildisambig) = @ARGV;
}
if ($silprob != 0.0) {
  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
  # Costs are negative natural logs of the probabilities.
  $silcost = -log($silprob);
  $nosilcost = -log(1.0 - $silprob);
}


# Three-argument open: the file name is never interpreted as a mode or pipe.
open(L, "<", $lexfn) || die "Error opening lexicon $lexfn";


if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
  $loopstate = 0;
  $nextstate = 1;               # next unallocated state.
  while (<L>) {
    @A = split(" ", $_);
    @A == 0 && die "Empty lexicon line.";
    foreach $a (@A) {
      if ($a eq "<eps>") {
        die "Bad lexicon line $_ (<eps> is forbidden)";
      }
    }
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }

    # Emit one arc per phone: the word is output on the first arc, <eps> on
    # the rest, and the last arc returns to the loop state.
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        $ns = $nextstate++;
      } else {
        $ns = $loopstate;
      }
      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
      $word_or_eps = "<eps>";
      $pron_cost_string = ""; # so we only print it on the first arc of the word.
      $s = $ns;
    }
  }
  print "$loopstate\t0\n"; # final-cost.
} else {                        # have silence probs.
  $startstate = 0;
  $loopstate = 1;
  $silstate = 2;   # state from where we go to loopstate after emitting silence.
  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
  if (!defined $sildisambig) {
    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
    $nextstate = 3;
  } else {
    # With a silence disambiguation symbol, silence passes through an extra
    # state whose only outgoing arc carries that symbol.
    $disambigstate = 3;
    $nextstate = 4;
    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
  }
  while (<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    if (! $pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        # Not the last phone: plain arc to a freshly allocated state.
        $ns = $nextstate++;
        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
        $word_or_eps = "<eps>";
        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
        $s = $ns;
      } elsif (!defined($silphone) || $p ne $silphone) {
        # Last phone of the word: branch to the loop state (no silence) and
        # to the silence state (optional silence follows).
        # This is non-deterministic but relatively compact,
        # and avoids epsilons.
        $local_nosilcost = $nosilcost + $pron_cost;
        $local_silcost = $silcost + $pron_cost;
        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
      } else {
        # no point putting opt-sil after silence word.
        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
      }
    }
  }
  print "$loopstate\t0\n"; # final-cost.
}

close(L);
|
@ -0,0 +1,49 @@
|
||||
#!/bin/bash
# Build the TLG decoding graph from an ARPA language model and a lang dir.
#
# Usage: <this script> <lm_dir> <src_lang> <tgt_lang>
#   lm_dir    directory that contains lm.arpa
#   src_lang  existing lang directory, copied to tgt_lang; words.txt, L.fst
#             and T.fst are read from the copy
#   tgt_lang  output directory; it is removed and recreated, then G.fst,
#             LG.fst and TLG.fst are written into it

if [ -f path.sh ]; then . path.sh; fi

if [ $# -lt 3 ]; then
  echo "Usage: $0 <lm_dir> <src_lang> <tgt_lang>" >&2
  exit 1
fi

lm_dir=$1
src_lang=$2
tgt_lang=$3

arpa_lm=${lm_dir}/lm.arpa
[ ! -f "$arpa_lm" ] && { echo "No such file $arpa_lm"; exit 1; }

# ${tgt_lang:?} aborts instead of running rm -rf on an empty path.
rm -rf -- "${tgt_lang:?}"
cp -r -- "$src_lang" "$tgt_lang" || exit 1

# Compose the language model to FST
# grep -i / --ignore-case: ignore case distinctions.
# grep -v / --invert-match: keep only the lines that do NOT match.
# arpa2fst: remove the embedded symbols from the FST
# arpa2fst: make sure there are no out-of-vocabulary words in the language model
# arpa2fst: remove "illegal" sequences of the start and end-ofsentence symbols
# eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
# s2eps.pl: replaces <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
# G.fst, the disambiguation symbol #0 only appears on the input side
# do eps2disambig.pl and s2eps.pl maybe just for following `fstrmepsilon`.
grep -v '<s> <s>' "$arpa_lm" \
  | grep -v '</s> <s>' \
  | grep -v '</s> </s>' \
  | grep -v -i '<unk>' \
  | grep -v -i '<spoken_noise>' \
  | arpa2fst --read-symbol-table="$tgt_lang/words.txt" --keep-symbols=true - \
  | fstprint \
  | utils/fst/eps2disambig.pl \
  | utils/fst/s2eps.pl \
  | fstcompile --isymbols="$tgt_lang/words.txt" \
      --osymbols="$tgt_lang/words.txt" --keep_isymbols=false --keep_osymbols=false \
  | fstrmepsilon \
  | fstarcsort --sort_type=ilabel > "$tgt_lang/G.fst" || exit 1


# Informational only: the exit status is deliberately not checked.
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic "$tgt_lang/G.fst"

# Compose the token, lexicon and language-model FST into the final decoding graph
# minimization: the same as minimization algorithm that applies to weighted acceptors;
# the only change relevant here is that it avoids pushing weights,
# hence preserving stochasticity
fsttablecompose "$tgt_lang/L.fst" "$tgt_lang/G.fst" \
  | fstdeterminizestar --use-log=true \
  | fstminimizeencoded \
  | fstarcsort --sort_type=ilabel > "$tgt_lang/LG.fst" || exit 1
fsttablecompose "$tgt_lang/T.fst" "$tgt_lang/LG.fst" > "$tgt_lang/TLG.fst" || exit 1

echo "Composing decoding graph TLG.fst succeeded"
#rm -r $tgt_lang/LG.fst   # We don't need to keep this intermediate FST
|
@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
|
||||
def main(args):
    """Write a char- or BPE-unit lexicon for an e2e model from a raw lexicon.

    Args:
        args: namespace providing
            unit_file: path of the model unit list, one unit per line.
            in_lexicon: path of the raw lexicon; only the first field of
                each line (the word) is used.
            out_lexicon: path of the generated lexicon, one
                ``word unit0 ... unitN`` line per kept word.
            bpemodel: path of a sentencepiece model; when empty/None the
                word is spelled character by character instead.

    Words that are ``<SPOKEN_NOISE>``, ``SIL`` (char mode only), repeated, or
    that contain a unit missing from ``unit_file`` are left out.
    """
    # load `unit` or `vocab` file
    unit_table = set()
    with open(args.unit_file, 'r', encoding='utf8') as fin:
        for line in fin:
            unit = line.strip()
            unit_table.add(unit)

    def contain_oov(units):
        """Return True if any element of `units` is not in unit_table."""
        return any(unit not in unit_table for unit in units)

    # load spm model
    bpemode = args.bpemodel
    if bpemode:
        # Imported lazily so char mode does not require sentencepiece.
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(args.bpemodel)

    # used to filter polyphone: each word only has one pronunciation for an
    # e2e system, so only its first occurrence is kept.
    lexicon_table = set()
    with open(args.in_lexicon, 'r', encoding='utf8') as fin, \
            open(args.out_lexicon, 'w', encoding='utf8') as fout:
        for line in fin:
            fields = line.split()
            if not fields:
                # blank line: nothing to convert
                continue
            raw_word = fields[0]
            if raw_word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
                continue
            if raw_word == '<SPOKEN_NOISE>':
                continue
            if raw_word in lexicon_table:
                continue

            word = raw_word
            if bpemode:
                pieces = sp.EncodeAsPieces(word)
                if contain_oov(pieces):
                    print('Ignoring words {}, which contains oov unit'.
                          format(''.join(word).strip('▁')))
                    continue
                # every piece is known to be in unit_table at this point
                chars = ' '.join(pieces)
            else:
                # ignore words with OOV
                if contain_oov(word):
                    print('Ignoring words {}, which contains oov unit'.
                          format(word))
                    continue

                # Optional, append ▁ in front of english word
                # we assume the model unit of our e2e system is char now.
                if word.encode('utf8').isalpha() and '▁' in unit_table:
                    word = '▁' + word
                chars = ' '.join(word)  # word is a char list

            fout.write('{} {}\n'.format(word, chars))
            # Remember both spellings: the raw word so later duplicates of it
            # are skipped, and the emitted (possibly ▁-prefixed) word so a
            # lexicon entry already spelled that way is not emitted twice.
            lexicon_table.add(raw_word)
            lexicon_table.add(word)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the three required paths and the
    # optional bpe model, echo the parsed arguments, then run main().
    parser = argparse.ArgumentParser(
        description='FST: preprae e2e(char/spm) dict')
    parser.add_argument(
        '--unit_file',
        required=True,
        help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_pices'
    )
    parser.add_argument(
        '--in_lexicon',
        required=True,
        help='raw lexicon file. line: word ph0 ... phn')
    parser.add_argument(
        '--out_lexicon',
        required=True,
        help='output lexicon file. line: word char0 ... charn')
    parser.add_argument('--bpemodel', default=None, help='bpemodel')

    args = parser.parse_args()
    print(args)

    main(args)
|
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This script removes lines that contain these OOVs on either the
|
||||
# third or fourth fields of the line. It is intended to remove arcs
|
||||
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
|
||||
|
||||
# One or two arguments are accepted: the unknown-symbol list and, optionally,
# the printed FST (read from stdin when omitted).
if (@ARGV < 1 || @ARGV > 2) {
  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}

# Load the symbols to remove, one per line, into %unk.
$unklist = shift @ARGV;
open(S, "<", $unklist) || die "Failed opening unknown-symbol list $unklist\n";
while (<S>) {
  @A = split(" ", $_);
  @A == 1 || die "Bad line in unknown-symbol list: $_";
  $unk{$A[0]} = 1;
}
close(S);

# Copy the FST through, dropping every line whose third or fourth field is a
# listed symbol. Lines with fewer fields (e.g. final states) are never
# matched against a missing field.
$num_removed = 0;
while (<>) {
  @A = split(" ", $_);
  if ((@A > 2 && defined $unk{$A[2]}) || (@A > 3 && defined $unk{$A[3]})) {
    $num_removed++;
  } else {
    print;
  }
}
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
|
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
|
||||
|
||||
def main(args):
    """Print a single-state token FST, in text form, to stdout.

    Every arc is a self-loop on state 0, which is also the only final state.

    Args:
        args: namespace with ``token_file``, the path of a token table whose
            first whitespace-separated field on each line is the token.
    """
    # skip <blank> `token`
    print('0 0 <blank> <eps>')

    with open(args.token_file, 'r', encoding='utf8') as fin:
        for entry in fin:
            fields = entry.split()
            if not fields:
                # blank line: no token to add
                continue
            phone = fields[0]
            if phone == '<eps>' or phone == '<blank>':
                # <blank> already has its arc above; <eps> gets none
                continue
            elif '#' in phone:
                # disambiguous phone
                # maybe add disambiguous `token`
                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
            else:
                # eating `token`
                print('{} {} {} {}'.format(0, 0, phone, phone))

    # final state
    print('0')
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the single required option and pass the
    # resulting namespace to main().
    parser = argparse.ArgumentParser(
        description='FST: RNN-T Token FST transducer')
    parser.add_argument(
        '--token_file',
        required=True,
        help='e2e model token file. line: token(char/phone/spm/disambigous)')
    args = parser.parse_args()

    main(args)
|
@ -0,0 +1,27 @@
|
||||
#!/usr/bin/env perl
|
||||
# Copyright 2010-2011 Microsoft Corporation
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This script replaces <s> and </s> with <eps> (on both input and output sides),
|
||||
# for the G.fst acceptor.
|
||||
|
||||
# Stream the printed FST from the files named on the command line, or stdin.
while (<>) {
  @A = split(" ", $_);
  # Only lines with at least four fields have their third and fourth
  # fields (the labels) inspected; shorter lines pass through unchanged
  # apart from the re-joining below.
  if (@A >= 4) {
    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
  }
  # Fields are always re-joined with tabs, whatever separated them on input.
  print join("\t", @A) . "\n";
}
|
@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Manifest file to key-value files."""
|
||||
import argparse
|
||||
import functools
|
||||
from pathlib import Path
|
||||
|
||||
from utils.utility import add_arguments
|
||||
from utils.utility import print_arguments
|
||||
from utils.utility import read_manifest
|
||||
|
||||
|
||||
def main(args):
    """Dump a manifest into ``wav.scp``, ``duration`` and ``text`` files.

    Args:
        args: namespace with
            manifest_path: manifest file handed to ``read_manifest``.
            output_path: directory the three key-value files are written to;
                it is created when missing.

    Each manifest entry must provide ``utt``, ``feat``, ``text`` and
    ``feat_shape``; the first element of ``feat_shape`` is written as the
    duration value.
    """
    print_arguments(args, globals())

    count = 0

    outdir = Path(args.output_path)
    # Opening the output files fails if the directory does not exist yet.
    outdir.mkdir(parents=True, exist_ok=True)
    wav_scp = outdir / 'wav.scp'
    dur_scp = outdir / 'duration'
    text_scp = outdir / 'text'

    manifest_jsons = read_manifest(args.manifest_path)

    with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open(
            'w') as ftxt:
        for line_json in manifest_jsons:
            utt = line_json['utt']
            feat = line_json['feat']
            file_ext = Path(feat).suffix  # .wav
            text = line_json['text']
            dur = line_json['feat_shape'][0]

            # Only entries whose feature path is a .wav file go into wav.scp.
            if file_ext == '.wav':
                fwav.write(f"{utt} {feat}\n")
            # NOTE(review): the flattened source does not show whether the
            # next two writes were nested under the `.wav` check; they are
            # assumed to run for every entry - confirm against the upstream file.
            fdur.write(f"{utt} {dur}\n")
            ftxt.write(f"{utt} {text}\n")

            count += 1

    print(f"Examples number: {count}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: register the two options through the project
    # helper add_arguments, parse them, then run main().
    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    add_arg('manifest_path', str,
            'data/librispeech/manifest.train',
            "Filepath of manifest to compute normalizer's mean and stddev.")
    add_arg('output_path', str,
            'data/train',
            "dir path to dump wav.scp/duaration/text files.")
    # yapf: enable
    args = parser.parse_args()

    main(args)
|
Loading…
Reference in new issue