parent
cf1a395e15
commit
37d9c08da5
@ -0,0 +1,27 @@
|
||||
# Locations of the binaries (built or installed) required to run the examples.
#
# This file is meant to be sourced (". path.sh") from the example directory:
# every path below is derived from $PWD. It never calls `exit`, and it is
# written so that a missing optional file does not leave a non-zero status
# behind, because callers source it under `set -e`.

SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

# Only warn (on stderr) when the examples have not been built yet.
if [ ! -d "$SPEECHX_EXAMPLES" ]; then
    echo >&2 "Error: 'build/examples' directory not found. please ensure that the project build successfully"
fi

# The locale override variable is LC_ALL; "LC_AL" is not read by any tool.
export LC_ALL=C

export PATH=$PATH:$TOOLS_BIN

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
if [ -f "$KALDI_ROOT/tools/env.sh" ]; then
    . "$KALDI_ROOT/tools/env.sh"
fi
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH

# Use `if` rather than `[ -f ... ] && . ...` as the last statement: the latter
# makes this sourced file return 1 when Kaldi is not installed, which aborts
# a caller running under `set -e` before it can build Kaldi itself.
if [ -f "$KALDI_ROOT/tools/config/common_path.sh" ]; then
    . "$KALDI_ROOT/tools/config/common_path.sh"
else
    echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!"
fi
|
@ -0,0 +1,64 @@
|
||||
#!/bin/bash
# Builds the TLG decoding graph for the aishell example in three stages:
#   0 - prepare the dictionary
#   1 - train the language model
#   2 - compile the lexicon/token FSTs and the TLG graph
# Stage N runs when stage <= N <= stop_stage.
set -eo pipefail

. path.sh

# Defaults. NOTE(review): parse_options.sh presumably lets "--<name> <value>"
# on the command line override the variables defined above it -- confirm.
stage=-1
stop_stage=100
corpus=aishell
lmtype=srilm

lexicon= # aishell/resource_aishell/lexicon.txt
text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

source parse_options.sh

# Build SRILM when ngram-count is not on PATH.
# `[ ! which ngram-count ]` is a malformed test expression that is never
# true, so the tool is probed with `command -v` instead.
if ! command -v ngram-count > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make srilm.done
    popd
fi

# Build Kaldi (which provides the OpenFst binaries) when fstprint is missing.
if ! command -v fstprint > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make kaldi.done
    popd
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    # Fail early: an empty ${lexicon} would otherwise shift the arguments of
    # prepare_dict.py.
    if [ ! -f "${lexicon}" ]; then
        echo >&2 "$0: No such file ${lexicon}"
        exit 1
    fi
    unit_file=data/vocab.txt
    mkdir -p data/local/dict
    cp "$unit_file" data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file "$unit_file" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    # Fail early: with an empty ${text}, filter_scp.pl is called with a single
    # argument and has no transcript file to filter.
    if [ ! -f "${text}" ]; then
        echo >&2 "$0: No such file ${text}"
        exit 1
    fi
    lm=data/local/lm
    mkdir -p data/train
    mkdir -p "$lm"
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        "$text" > "$lm/text"
    if [ "$lmtype" == 'srilm' ]; then
        local/aishell_train_lms.sh
    else
        utils/ngram_train.sh --order 3 "$lm/text" "$lm/lm.arpa"
    fi
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # 7.3 Build decoding TLG
    utils/fst/compile_lexicon_token_fst.sh \
        data/local/dict data/local/tmp data/local/lang
    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi

echo "Aishell build TLG done."
exit 0
|
@ -0,0 +1,57 @@
|
||||
#!/bin/bash

# Trains a 3-gram language model with SRILM.
#
# To be run from one directory above this script.
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
#   data/local/lm/text
#   data/local/dict/lexicon.txt
# and writes text.no_oov, word.counts, unigram.counts, wordlist, heldout,
# train and lm.arpa under data/local/lm.
. ./path.sh

text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt

# Check the inputs. The loop variable is $f; testing any other (unset) name
# makes the check silently pass.
for f in "$text" "$lexicon"; do
  if [ ! -f "$f" ]; then
    echo "$0: No such file $f"
    exit 1
  fi
done

# Check SRILM tools
if ! command -v ngram-count > /dev/null 2>&1; then
  echo "srilm tools are not found, please download it and install it from: "
  echo "http://www.speech.sri.com/projects/srilm/download.html"
  echo "Then add the tools to your PATH"
  exit 1
fi

dir=data/local/lm
mkdir -p "$dir"

cleantext=$dir/text.no_oov

# Replace every field whose text is not the first column of some lexicon line
# with <SPOKEN_NOISE>. This is applied to field 1 as well; the steps below
# skip field 1.
awk -v lex="$lexicon" 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  "$text" > "$cleantext" || exit 1;

# Word frequencies over fields 2..NF, most frequent first.
awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | sort | uniq -c | \
  sort -nr > "$dir/word.counts" || exit 1;

# Get counts from acoustic training transcripts, and add  one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | \
  cat - <(grep -w -v '!SIL' "$lexicon" | awk '{print $1}') | \
  sort | uniq -c | sort -nr > "$dir/unigram.counts" || exit 1;

# Vocabulary: every counted word plus the sentence boundary symbols.
awk '{print $2}' "$dir/unigram.counts" | cat - <(echo "<s>"; echo "</s>" ) > "$dir/wordlist"

heldout_sent=10000 # Don't change this if you want result to be comparable with
    # kaldi_lm results

# Drop field 1 and split into held-out / training text.
# printf "%s" is used so that a '%' inside a word is not taken as a format.
# NOTE: `head -n N` ends at line N and `tail -n +N` starts at line N, so line
# $heldout_sent is written to both files; kept as-is for comparability.
awk '{for(n=2;n<=NF;n++){ printf "%s", $n; if(n<NF) printf " "; else print ""; }}' "$cleantext" | \
  head -n "$heldout_sent" > "$dir/heldout"
awk '{for(n=2;n<=NF;n++){ printf "%s", $n; if(n<NF) printf " "; else print ""; }}' "$cleantext" | \
  tail -n +"$heldout_sent" > "$dir/train"

ngram-count -text "$dir/train" -order 3 -limit-vocab -vocab "$dir/wordlist" -unk \
  -map-unk "<UNK>" -kndiscount -interpolate -lm "$dir/lm.arpa"
# Report perplexity on the held-out text.
ngram -lm "$dir/lm.arpa" -ppl "$dir/heldout"
|
@ -0,0 +1,20 @@
|
||||
# Locations of the binaries (built or installed) required to run the examples.
#
# This file is meant to be sourced (". path.sh") from the example directory:
# every path below is derived from $PWD, and nothing here calls `exit`.

SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples

SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin

# Only warn (on stderr) when the examples have not been built yet.
if [ ! -d "$SPEECHX_EXAMPLES" ]; then
    echo >&2 "Error: 'build/examples' directory not found. please ensure that the project build successfully"
fi

# The locale override variable is LC_ALL; "LC_AL" is not read by any tool.
export LC_ALL=C

export PATH=$PATH:$TOOLS_BIN

# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
@ -0,0 +1,61 @@
|
||||
#!/bin/bash
# Builds the language model for the aishell example in two stages:
#   0 - prepare the dictionary
#   1 - train the language model
# Stage N runs when stage <= N <= stop_stage.
set -eo pipefail

. path.sh

# Defaults. NOTE(review): parse_options.sh presumably lets "--<name> <value>"
# on the command line override the variables defined above it -- confirm.
stage=-1
stop_stage=100
corpus=aishell

unit=data/vocab.txt       # vocab
lexicon= # aishell/resource_aishell/lexicon.txt
text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt

. parse_options.sh

data=$PWD/data
mkdir -p "$data"

if [ ! -f "$unit" ]; then
    echo "$0: No such file $unit"
    exit 1;
fi

# Build SRILM when ngram-count is not on PATH.
# `[ ! which ngram-count ]` is a malformed test expression that is never
# true, so the tool is probed with `command -v` instead.
if ! command -v ngram-count > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make srilm.done
    popd
fi

# Build Kaldi when fstaddselfloops is not on PATH.
if ! command -v fstaddselfloops > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make kaldi.done
    popd
fi

mkdir -p data/local/dict
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    # Fail early: an empty ${lexicon} would otherwise shift the arguments of
    # prepare_dict.py.
    if [ ! -f "${lexicon}" ]; then
        echo >&2 "$0: No such file ${lexicon}"
        exit 1
    fi
    cp "$unit" data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file "$unit" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
fi

lm=data/local/lm
mkdir -p data/train
mkdir -p "$lm"
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    # Fail early: with an empty ${text}, filter_scp.pl is called with a single
    # argument and has no transcript file to filter.
    if [ ! -f "${text}" ]; then
        echo >&2 "$0: No such file ${text}"
        exit 1
    fi
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        "$text" > "$lm/text"

    local/aishell_train_lms.sh
fi

echo "build LM done."
exit 0
|
@ -0,0 +1 @@
|
||||
../../../utils/
|
@ -1,97 +0,0 @@
|
||||
#!/usr/bin/env bash
# Downloads SRILM, unpacks it into ./srilm, patches old releases, points the
# SRILM Makefile at this directory, enables libLBFGS, builds it, and appends
# the SRILM environment settings to ./env.sh.
#
# Must be run from the tools/ directory.
# Usage: install_srilm.sh <name> <organization> <email>

current_path=$(pwd)
current_dir=$(basename "$current_path")

if [ "tools" != "$current_dir" ]; then
    echo "You should run this script in tools/ directory!!"
    exit 1
fi

if [ ! -d liblbfgs-1.10 ]; then
    echo Installing libLBFGS library to support MaxEnt LMs
    bash extras/install_liblbfgs.sh || exit 1
fi

if ! command -v gawk > /dev/null; then
    echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install"
    exit 1
fi

if [ $# -ne 3 ]; then
    echo "SRILM download requires some information about you"
    echo
    echo "Usage: $0 <name> <organization> <email>"
    exit 1
fi

srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php"
post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3"

if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then
    echo 'There was a problem downloading the file.'
    echo 'Check you internet connection and try again.'
    exit 1
fi

mkdir -p srilm
cd srilm || exit 1

# Stop right away if the archive cannot be unpacked instead of failing later
# on the files it should have provided.
if [ -f ../srilm.tgz ]; then
    tar -xvzf ../srilm.tgz || exit 1 # Old SRILM format
elif [ -f ../srilm.tar.gz ]; then
    tar -xvzf ../srilm.tar.gz || exit 1 # Changed format type from tgz to tar.gz
fi

# RELEASE is read as a dot-separated version: major.minor.micro.
major=$(gawk -F. '{ print $1 }' RELEASE)
minor=$(gawk -F. '{ print $2 }' RELEASE)
micro=$(gawk -F. '{ print $3 }' RELEASE)

# "1.7.1 or earlier" is an ordered comparison: a lower major or minor number
# already decides it, whatever the later components are (e.g. 1.5.12).
# A component that is not an integer makes its test fail, i.e. no patch.
if [ "$major" -lt 1 ] ||
   { [ "$major" -eq 1 ] && [ "$minor" -lt 7 ]; } ||
   { [ "$major" -eq 1 ] && [ "$minor" -eq 7 ] && [ "$micro" -le 1 ]; }; then
    echo "Detected version 1.7.1 or earlier. Applying patch."
    patch -p0 < ../extras/srilm.patch
fi

# set the SRILM variable in the top-level Makefile to this directory.
cp Makefile tmpf

gawk -v pwd="$(pwd)" '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' tmpf \
    > Makefile || exit 1
rm tmpf

mtype=$(sbin/machine-type)

# Append libLBFGS settings to the machine-specific Makefile: the existing
# ADDITIONAL_INCLUDES / ADDITIONAL_LDFLAGS lines are re-emitted at the end of
# the file with the libLBFGS paths added.
echo HAVE_LIBLBFGS=1 >> "common/Makefile.machine.$mtype"
grep ADDITIONAL_INCLUDES "common/Makefile.machine.$mtype" | \
    sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \
    >> "common/Makefile.machine.$mtype"

grep ADDITIONAL_LDFLAGS "common/Makefile.machine.$mtype" | \
    sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \
    >> "common/Makefile.machine.$mtype"

make || exit

cd ..
# Everything the subshell writes to stdout is appended to env.sh; messages
# for the user go to stderr.
(
    if [ -n "${SRILM}" ]; then
        echo >&2 "SRILM variable is aleady defined. Undefining..."
        unset SRILM
    fi

    [ -f ./env.sh ] && . ./env.sh

    # env.sh already defines SRILM: leave the subshell without appending.
    if [ -n "${SRILM}" ]; then
        echo >&2 "SRILM config is already in env.sh"
        exit
    fi

    wd=$(pwd)
    wd=$(readlink -f "$wd" || pwd)

    echo "export SRILM=$wd/srilm"
    dirs="\${PATH}"
    for directory in $(cd srilm && find bin -type d) ; do
        dirs="$dirs:\${SRILM}/$directory"
    done
    echo "export PATH=$dirs"
) >> env.sh

echo >&2 "Installation of SRILM finished successfully"
echo >&2 "Please source the tools/env.sh in your path.sh to enable it"
|
Loading…
Reference in new issue