rm example/aishell

4 years ago · 5170ccf00d
parent cc434566a1
commit 5170ccf00d
3 changed files with 0 additions and 104 deletions
--- a/speechx/examples/aishell/local/aishell_train_lms.sh
+++ b/speechx/examples/aishell/local/aishell_train_lms.sh
@ -1,59 +0,0 @@
 #!/bin/bash
 # Train a 3-gram language model with SRILM from the AISHELL transcripts.
 #
 # To be run from one directory above this script.
 # This script takes no arguments.  It assumes you have already run
 # aishell_data_prep.sh.
 # It takes as input the files
 #   data/local/lm/text
 #   data/local/dict/lexicon.txt
 # and writes text.no_oov, word.counts, unigram.counts, wordlist, heldout,
 # train and lm.arpa into data/local/lm.
 . ./path.sh || exit 1
 text=data/local/lm/text
 lexicon=data/local/dict/lexicon.txt
 # Abort early if either input file is missing.
 for f in "$text" "$lexicon"; do
   if [ ! -f "$f" ]; then
     echo "$0: No such file $f" >&2
     exit 1
   fi
 done
 # Check SRILM tools
 if ! command -v ngram-count > /dev/null; then
   echo "srilm tools are not found, please download it and install it from: " >&2
   echo "http://www.speech.sri.com/projects/srilm/download.html" >&2
   echo "Then add the tools to your PATH" >&2
   exit 1
 fi
 dir=data/local/lm
 mkdir -p "$dir" || exit 1
 cleantext=$dir/text.no_oov
 # Replace every token whose text is not a first-column entry of the lexicon
 # with <SPOKEN_NOISE>.  This is applied to all fields, including the first
 # one, which every later step skips (the loops below start at field 2).
 awk -v lex="$lexicon" 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
   {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
   "$text" > "$cleantext" || exit 1
 # Word frequencies over the cleaned transcripts, most frequent first.
 awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | sort | uniq -c | \
   sort -nr > "$dir/word.counts" || exit 1
 # Get counts from acoustic training transcripts, and add  one-count
 # for each word in the lexicon (but not silence, we don't want it
 # in the LM-- we'll add it optionally later).
 awk '{for(n=2;n<=NF;n++) print $n; }' "$cleantext" | \
   cat - <(grep -w -v '!SIL' "$lexicon" | awk '{print $1}') | \
   sort | uniq -c | sort -nr > "$dir/unigram.counts" || exit 1
 # The vocabulary is every counted word plus the sentence boundary symbols.
 awk '{print $2}' "$dir/unigram.counts" | cat - <(echo "<s>"; echo "</s>") \
   > "$dir/wordlist" || exit 1
 heldout_sent=10000 # Don't change this if you want result to be comparable with
                    # kaldi_lm results
 # Drop the first field of every line and split into held-out / training text.
 # Tokens are printed through "%s" so that a '%' or '\' inside a word is never
 # interpreted as an awk format string.
 # NOTE(review): head takes lines 1..heldout_sent and tail starts AT line
 # heldout_sent, so that one sentence lands in both sets.  Left unchanged to
 # stay comparable with earlier results -- confirm whether that is intended.
 awk '{for(n=2;n<=NF;n++){ printf "%s", $n; if(n<NF) printf " "; else print ""; }}' "$cleantext" | \
   head -n "$heldout_sent" > "$dir/heldout" || exit 1
 awk '{for(n=2;n<=NF;n++){ printf "%s", $n; if(n<NF) printf " "; else print ""; }}' "$cleantext" | \
   tail -n +"$heldout_sent" > "$dir/train" || exit 1
 # Train the trigram LM restricted to the word list, then report its
 # perplexity on the held-out sentences.
 ngram-count -text "$dir/train" -order 3 -limit-vocab -vocab "$dir/wordlist" -unk \
   -map-unk "<UNK>" -kndiscount -interpolate -lm "$dir/lm.arpa" || exit 1
 ngram -lm "$dir/lm.arpa" -ppl "$dir/heldout" || exit 1
--- a/speechx/examples/aishell/mkgraph.sh
+++ b/speechx/examples/aishell/mkgraph.sh
@ -1,31 +0,0 @@
 #!/bin/bash
 # Prepare the dictionary, train the language model and build the TLG
 # decoding graph under ./ds2_graph (all done in stage 7).
 . ./path.sh || exit 1;
 # Give the stage selectors defaults before option parsing so the numeric
 # tests below never see an empty operand.  Values already present in the
 # environment are kept.
 # NOTE(review): presumably tools/parse_options.sh lets --stage/--stop-stage
 # override these -- confirm against that script.
 stage=${stage:--1}
 stop_stage=${stop_stage:-100}
 . tools/parse_options.sh || exit 1;
 data=/mnt/dataset/aishell
 # Optionally, you can add LM and test it with runtime.
 dir=./ds2_graph
 dict=$dir/vocab.txt
 if [ "${stage}" -le 7 ] && [ "${stop_stage}" -ge 7 ]; then
   # 7.1 Prepare dict
   unit_file=$dict
   mkdir -p "$dir/local/dict" || exit 1;
   cp "$unit_file" "$dir/local/dict/units.txt" || exit 1;
   tools/fst/prepare_dict.py "$unit_file" "${data}/resource_aishell/lexicon.txt" \
     "$dir/local/dict/lexicon.txt" || exit 1;
   # Train lm
   lm=$dir/local/lm
   mkdir -p "$lm" || exit 1;
   tools/filter_scp.pl data/train/text \
     "$data/data_aishell/transcript/aishell_transcript_v0.8.txt" > "$lm/text" || exit 1;
   # The LM training script of this example is local/aishell_train_lms.sh.
   # NOTE(review): that script reads data/local/lm/text and
   # data/local/dict/lexicon.txt, while the files above are written under
   # $dir/local -- confirm how the two locations are meant to line up.
   local/aishell_train_lms.sh || exit 1;
   # Build decoding TLG
   tools/fst/compile_lexicon_token_fst.sh \
     "$dir/local/dict" "$dir/local/tmp" "$dir/local/lang" || exit 1;
   tools/fst/make_tlg.sh "$dir/local/lm" "$dir/local/lang" "$dir/lang_test" || exit 1;
 fi
--- a/speechx/examples/aishell/path.sh
+++ b/speechx/examples/aishell/path.sh
@ -1,14 +0,0 @@
 # This contains the locations of the built binaries required for running the
 # examples.  It is meant to be sourced from the example directory: all paths
 # are derived from $PWD.
 SPEECHX_ROOT=$PWD/../..
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 # Warn (without aborting the sourcing script) when the build output is missing.
 [ -d "$SPEECHX_EXAMPLES" ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully" >&2; }
 # Force the C locale so that sort and friends order bytes identically everywhere.
 export LC_ALL=C
 SPEECHX_BIN=$SPEECHX_EXAMPLES/feat
 export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN