parent
cf1a395e15
commit
37d9c08da5
@ -0,0 +1,27 @@
|
|||||||
|
# Locations of the binaries and tools required for running the examples.
#
# This file is meant to be sourced (". path.sh") from the example directory:
# every path below is derived from the caller's $PWD.

SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

# Only report a missing build directory; sourcing continues either way.
[ -d "$SPEECHX_EXAMPLES" ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }

# Force the C locale for every tool started from the example scripts.
# The variable name must be LC_ALL; a misspelled name is silently ignored.
export LC_ALL=C

export PATH=$PATH:$TOOLS_BIN

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
if [ -f "$KALDI_ROOT/tools/env.sh" ]; then
    . "$KALDI_ROOT/tools/env.sh"
fi
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH

# Written as if/else (not "[ -f ... ] && . ...") so that a missing Kaldi
# install only produces the warning below: a trailing failed test would make
# ". path.sh" itself return non-zero and abort callers running under "set -e"
# before they get the chance to build Kaldi.
if [ -f "$KALDI_ROOT/tools/config/common_path.sh" ]; then
    . "$KALDI_ROOT/tools/config/common_path.sh"
else
    echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
fi
|
@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
# Build the TLG decoding graph for the aishell example.
#
# Stages (a stage runs when stage <= N <= stop_stage):
#   0  prepare the dict (units.txt and lexicon.txt under data/local/dict)
#   1  train the language model under data/local/lm
#   2  compile the lexicon/token FSTs and build the TLG in data/lang_test
#
# The variables defined before "source parse_options.sh" are the tunables;
# presumably parse_options.sh lets them be overridden as --name value on the
# command line (confirm against utils/parse_options.sh).
set -eo pipefail

. path.sh

stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

lexicon= # aishell/resource_aishell/lexicon.txt
text=    # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

source parse_options.sh

# Build SRILM on demand. "command -v" is used as the if-condition directly:
# wrapping a command name in "[ ... ]" does not execute it, it only makes
# test fail with a syntax error, so the build would never be triggered.
if ! command -v ngram-count > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make srilm.done
    popd
fi

# Build Kaldi (which provides the OpenFst tools) on demand.
if ! command -v fstprint > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make kaldi.done
    popd
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    if [ -z "${lexicon}" ]; then
        echo >&2 "$0: --lexicon is required for stage 0"
        exit 1
    fi
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp "$unit_file" data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file "$unit_file" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    if [ -z "${text}" ]; then
        echo >&2 "$0: --text is required for stage 1"
        exit 1
    fi
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p "$lm"
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        "$text" > "$lm/text"
    if [ "${lmtype}" = 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 "$lm/text" "$lm/lm.arpa"
    fi
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "Aishell build TLG done."
exit 0
|
@ -0,0 +1,57 @@
|
|||||||
|
#!/bin/bash
# Train a 3-gram language model with SRILM for the aishell example.
#
# To be run from one directory above this script.
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
#   data/local/lm/text
#   data/local/dict/lexicon.txt
# and writes its outputs (text.no_oov, word.counts, unigram.counts, wordlist,
# heldout, train, lm.arpa) into data/local/lm.

. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

# Both inputs must exist; the loop variable is "f".
for f in "$text" "$lexicon"; do
    if [ ! -f "$f" ]; then
        echo "$0: No such file $f"
        exit 1
    fi
done

# Check SRILM tools
if ! command -v ngram-count > /dev/null; then
    echo "srilm tools are not found, please download it and install it from: "
    echo "http://www.speech.sri.com/projects/srilm/download.html"
    echo "Then add the tools to your PATH"
    exit 1
fi

dir=data/local/lm
mkdir -p "$dir"

cleantext=$dir/text.no_oov

# Replace every token that does not appear in the first column of the lexicon
# with <SPOKEN_NOISE>.
awk -v lex="$lexicon" 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  < "$text" > "$cleantext" || exit 1;

# Word frequencies over the transcripts (field 1 of each line is skipped).
awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | sort | uniq -c | \
    sort -nr > "$dir/word.counts" || exit 1;

# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | \
    cat - <(grep -w -v '!SIL' "$lexicon" | awk '{print $1}') | \
    sort | uniq -c | sort -nr > "$dir/unigram.counts" || exit 1;

# Vocabulary: every counted word plus the sentence boundary symbols.
{
    awk '{print $2}' "$dir/unigram.counts"
    echo "<s>"
    echo "</s>"
} > "$dir/wordlist"

heldout_sent=10000 # Don't change this if you want result to be comparable with
                   # kaldi_lm results

# Drops the first field of every line and re-joins the rest with single
# spaces. Tokens are printed through "%s" so that a '%' inside the data is
# never interpreted as a printf format.
drop_first_field='{for(n=2;n<=NF;n++){ printf("%s", $n); if(n<NF) printf " "; else print ""; }}'

# NOTE: "head -n N" keeps lines 1..N and "tail -n +N" starts at line N, so
# line N ends up in both sets. Left unchanged on purpose, see the
# comparability remark on heldout_sent above.
awk "$drop_first_field" "$cleantext" | \
    head -n "$heldout_sent" > "$dir/heldout"
awk "$drop_first_field" "$cleantext" | \
    tail -n +"$heldout_sent" > "$dir/train"

ngram-count -text "$dir/train" -order 3 -limit-vocab -vocab "$dir/wordlist" -unk \
    -map-unk "<UNK>" -kndiscount -interpolate -lm "$dir/lm.arpa"
ngram -lm "$dir/lm.arpa" -ppl "$dir/heldout"
|
@ -0,0 +1,20 @@
|
|||||||
|
# Locations of the binaries and tools required for running the examples.
#
# This file is meant to be sourced (". path.sh") from the example directory:
# every path below is derived from the caller's $PWD.

SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

# Only report a missing build directory; sourcing continues either way.
[ -d "$SPEECHX_EXAMPLES" ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }

# Force the C locale for every tool started from the example scripts.
# The variable name must be LC_ALL; a misspelled name is silently ignored.
export LC_ALL=C

export PATH=$PATH:$TOOLS_BIN

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
@ -0,0 +1,61 @@
|
|||||||
|
#!/bin/bash
# Prepare the dict and train the language model for the aishell example.
#
# Stages (a stage runs when stage <= N <= stop_stage):
#   0  prepare the dict (units.txt and lexicon.txt under data/local/dict)
#   1  train the language model under data/local/lm
#
# The variables defined before ". parse_options.sh" are the tunables;
# presumably parse_options.sh lets them be overridden as --name value on the
# command line (confirm against utils/parse_options.sh).
set -eo pipefail

. path.sh

stage=-1
stop_stage=100
corpus=aishell

unit=data/vocab.txt # vocab
lexicon=            # aishell/resource_aishell/lexicon.txt
text=               # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

. parse_options.sh

data=$PWD/data
mkdir -p "$data"

if [ ! -f "$unit" ]; then
    echo "$0: No such file $unit"
    exit 1;
fi

# Build SRILM on demand. "command -v" is used as the if-condition directly:
# wrapping a command name in "[ ... ]" does not execute it, it only makes
# test fail with a syntax error, so the build would never be triggered.
if ! command -v ngram-count > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make srilm.done
    popd
fi

# Build Kaldi (which provides the OpenFst tools) on demand.
if ! command -v fstaddselfloops > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make kaldi.done
    popd
fi

mkdir -p data/local/dict
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    if [ -z "${lexicon}" ]; then
        echo >&2 "$0: --lexicon is required for stage 0"
        exit 1
    fi
    cp "$unit" data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file "$unit" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
fi

lm=data/local/lm
mkdir -p data/train
mkdir -p "$lm"
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    if [ -z "${text}" ]; then
        echo >&2 "$0: --text is required for stage 1"
        exit 1
    fi
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        "$text" > "$lm/text"

    local/aishell_train_lms.sh
fi

echo "build LM done."
exit 0
|
@ -0,0 +1 @@
|
|||||||
|
../../../utils/
|
@ -1,97 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Download, patch (when needed), build and register SRILM.
#
# Usage: install_srilm.sh <name> <organization> <email>
#   The three arguments are sent to the SRILM download form.
#
# Must be run from the tools/ directory. On success the SRILM and PATH
# exports are appended to tools/env.sh.

current_path=$(pwd)
current_dir=$(basename "$current_path")

if [ "tools" != "$current_dir" ]; then
    echo "You should run this script in tools/ directory!!"
    exit 1
fi

if [ ! -d liblbfgs-1.10 ]; then
    echo Installing libLBFGS library to support MaxEnt LMs
    bash extras/install_liblbfgs.sh || exit 1
fi

if ! command -v gawk > /dev/null; then
    echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install"
    exit 1
fi

if [ $# -ne 3 ]; then
    echo "SRILM download requires some information about you"
    echo
    echo "Usage: $0 <name> <organization> <email>"
    exit 1
fi

srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php"
post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3"

if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then
    echo 'There was a problem downloading the file.'
    echo 'Check you internet connection and try again.'
    exit 1
fi

mkdir -p srilm
cd srilm || exit 1

if [ -f ../srilm.tgz ]; then
    tar -xvzf ../srilm.tgz || exit 1 # Old SRILM format
elif [ -f ../srilm.tar.gz ]; then
    tar -xvzf ../srilm.tar.gz || exit 1 # Changed format type from tgz to tar.gz
fi

# The version is read from the RELEASE file as <major>.<minor>.<micro>.
if [ ! -s RELEASE ]; then
    echo >&2 "$0: SRILM RELEASE file not found after extraction"
    exit 1
fi
major=$(gawk -F. '{ print $1 }' RELEASE)
minor=$(gawk -F. '{ print $2 }' RELEASE)
micro=$(gawk -F. '{ print $3 }' RELEASE)

# Keep only the leading digits of each component, treat a missing component
# as 0 and force base 10 so that a value such as "08" is not read as octal.
major=${major%%[!0-9]*}
minor=${minor%%[!0-9]*}
micro=${micro%%[!0-9]*}
major=$((10#${major:-0}))
minor=$((10#${minor:-0}))
micro=$((10#${micro:-0}))

# Ordered comparison against 1.7.1. Checking the three components
# independently would wrongly classify e.g. 1.6.5 as newer than 1.7.1.
if (( major < 1 || (major == 1 && (minor < 7 || (minor == 7 && micro <= 1))) )); then
    echo "Detected version 1.7.1 or earlier. Applying patch."
    patch -p0 < ../extras/srilm.patch
fi

# set the SRILM variable in the top-level Makefile to this directory.
cp Makefile tmpf

gawk -v pwd="$(pwd)" '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' tmpf \
    > Makefile || exit 1
rm tmpf

mtype=$(sbin/machine-type)

# Enable libLBFGS and append its include/library paths to the
# machine-specific makefile.
echo HAVE_LIBLBFGS=1 >> "common/Makefile.machine.$mtype"
grep ADDITIONAL_INCLUDES "common/Makefile.machine.$mtype" | \
    sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
    >> "common/Makefile.machine.$mtype"

grep ADDITIONAL_LDFLAGS "common/Makefile.machine.$mtype" | \
    sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
    >> "common/Makefile.machine.$mtype"

make || exit

cd ..

# Everything written to stdout inside this subshell is appended to env.sh;
# diagnostics go to stderr. "exit" only leaves the subshell.
(
    if [ -n "${SRILM}" ]; then
        echo >&2 "SRILM variable is aleady defined. Undefining..."
        unset SRILM
    fi

    if [ -f ./env.sh ]; then
        . ./env.sh
    fi

    # If sourcing env.sh defined SRILM, the configuration is already there.
    if [ -n "${SRILM}" ]; then
        echo >&2 "SRILM config is already in env.sh"
        exit
    fi

    wd=$(pwd)
    wd=$(readlink -f "$wd" || pwd)

    echo "export SRILM=$wd/srilm"
    # \${PATH} and \${SRILM} are written literally so that they are expanded
    # when env.sh is sourced, not now.
    dirs="\${PATH}"
    while IFS= read -r directory; do
        dirs="$dirs:\${SRILM}/$directory"
    done < <(cd srilm && find bin -type d)
    echo "export PATH=$dirs"
) >> env.sh

echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
|
|
Loading…
Reference in new issue