zh ngram build

3 years ago · eb52896c4a
parent 37d9c08da5
commit eb52896c4a
17 changed files with 217 additions and 104 deletions
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@ -1,12 +1,10 @@
 # Examples for SpeechX
 * dev - for SpeechX developers; used for testing.
 * ngram - used to build an NGram ARPA LM.
 * ds2_ol - ds2 streaming test under `aishell-1` test dataset.
- The entrypoint is `ds2_ol/aishell/run.sh`
+   The entrypoint is `ds2_ol/aishell/run.sh`
-## How to run
+## How to run  
 `run.sh` is the entry point.
@ -17,9 +15,19 @@ pushd ds2_ol/aishell
 bash run.sh
 ```
-## Display Model with [Netron](https://github.com/lutzroeder/netron)
+## Display Model with [Netron](https://github.com/lutzroeder/netron)  
 ```
 pip install netron
 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel  --port 8022 --host 10.21.55.20
 ```
 ## Build WFST  
 * text_lm - processes text for building the LM.
 * ngram - used to build an NGram ARPA LM.
 * wfst - builds the WFST for TLG.
 ## For Developers  
 * dev - for SpeechX developers; used for testing.
--- a/speechx/examples/ngram/README.md
+++ b/speechx/examples/ngram/README.md
@ -1 +0,0 @@
 # NGram Train
--- a/speechx/examples/ngram/en/README.md
+++ b/speechx/examples/ngram/en/README.md
--- a/speechx/examples/ngram/path.sh
+++ b/speechx/examples/ngram/path.sh
@ -1,20 +0,0 @@
 # Environment for the ngram example. Source this file (`. path.sh`) from the
 # example directory: every path below is derived from $PWD.
 # Sets SPEECHX_ROOT, MAIN_ROOT, SPEECHX_EXAMPLES, SPEECHX_TOOLS and TOOLS_BIN, and
 # exports LC_ALL, PATH, LIBLBFGS, LD_LIBRARY_PATH and SRILM.
 SPEECHX_ROOT=$PWD/../../../
 MAIN_ROOT=$SPEECHX_ROOT/../
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 # A missing build tree is only reported (on stderr); execution continues.
 [ -d "$SPEECHX_EXAMPLES" ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully" >&2; }
 # Force the C locale for all categories. The variable name is LC_ALL; the
 # former spelling `LC_AL` exported a name that no program reads.
 export LC_ALL=C
 export PATH=$PATH:$TOOLS_BIN
 # srilm
 export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
 export SRILM=${MAIN_ROOT}/tools/srilm
 export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
--- a/speechx/examples/ngram/run.sh
+++ b/speechx/examples/ngram/run.sh
@ -1,61 +0,0 @@
 #!/bin/bash
 # Build an NGram ARPA LM for the aishell corpus.
 #   stage 0: prepare data/local/dict (units.txt and lexicon.txt)
 #   stage 1: build the LM training text and run local/aishell_train_lms.sh
 # Expects path.sh, parse_options.sh, utils/ and local/ to be resolvable from the
 # directory the script is run in.
 set -eo pipefail
 . path.sh
 stage=-1
 stop_stage=100
 corpus=aishell
 unit=data/vocab.txt     # vocab
 lexicon=  # aishell/resource_aishell/lexicon.txt
 text=     # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
 # NOTE(review): parse_options.sh is presumably the Kaldi-style helper that maps
 # `--name value` arguments onto the variables above - confirm.
 . parse_options.sh
 data=$PWD/data
 mkdir -p "$data"
 if [ ! -f "$unit" ]; then
    echo "$0: No such file $unit" >&2
    exit 1;
 fi
 # Build SRILM / Kaldi on demand. `[ ! which tool ]` is not a valid test
 # expression (the tool lookup never ran and the branch was never taken), so the
 # exit status of the lookup command is tested directly instead.
 if ! command -v ngram-count > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make srilm.done
    popd
 fi
 if ! command -v fstaddselfloops > /dev/null 2>&1; then
    pushd "$MAIN_ROOT/tools"
    make kaldi.done
    popd
 fi
 mkdir -p data/local/dict
 if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    # `lexicon` has no default, so fail early with a clear message.
    if [ -z "${lexicon}" ]; then
        echo "$0: --lexicon must be set for stage 0" >&2
        exit 1;
    fi
    cp "$unit" data/local/dict/units.txt
    utils/fst/prepare_dict.py \
        --unit_file "$unit" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
 fi
 lm=data/local/lm
 mkdir -p data/train
 mkdir -p "$lm"
 if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    # `text` has no default; an empty value would drop the file argument of
    # filter_scp.pl below.
    if [ -z "${text}" ]; then
        echo "$0: --text must be set for stage 1" >&2
        exit 1;
    fi
    utils/manifest_key_value.py \
        --manifest_path data/manifest.train \
        --output_path data/train
    utils/filter_scp.pl data/train/text \
        "$text" > "$lm/text"
    local/aishell_train_lms.sh
 fi
 echo "build LM done."
 exit 0
--- a/speechx/examples/ngram/utils
+++ b/speechx/examples/ngram/utils
@ -1 +0,0 @@
 ../../../utils/
--- a/speechx/examples/ngram/zh/README.md
+++ b/speechx/examples/ngram/zh/README.md
@ -0,0 +1,101 @@
 # N-gram LM training for Mandarin
 Quick run:
 ```
 bash run.sh --stage -1
 ```
 ## input
 input files:
 ```
 data/
 ├── lexicon.txt
 ├── text
 └── vocab.txt
 ```
 ```
 ==> data/text <==
 BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
 BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
 BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
 BAC009S0002W0125 各地 政府 便 纷纷 跟进
 BAC009S0002W0126 仅 一 个 多 月 的 时间 里
 BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
 BAC009S0002W0128 四十六 个 限 购 城市 当中
 BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
 BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
 BAC009S0002W0131 显示 出 了 极 强 的 威力
 ==> data/lexicon.txt <==
 SIL sil
 <SPOKEN_NOISE> sil
 啊 aa a1
 啊 aa a2
 啊 aa a4
 啊 aa a5
 啊啊啊 aa a2 aa a2 aa a2
 啊啊啊 aa a5 aa a5 aa a5
 坐地 z uo4 d i4
 坐实 z uo4 sh ix2
 坐视 z uo4 sh ix4
 坐稳 z uo4 uu un3
 坐拥 z uo4 ii iong1
 坐诊 z uo4 zh en3
 坐庄 z uo4 zh uang1
 坐姿 z uo4 z iy1
 ==> data/vocab.txt <==
 <blank>
 <unk>
 A
 B
 C
 D
 E
 龙
 龚
 龛
 <eos>
 ```
 ## output
 ```
 data/
 ├── local
 │   ├── dict
 │   │   ├── lexicon.txt
 │   │   └── units.txt
 │   └── lm
 │       ├── heldout
 │       ├── lm.arpa
 │       ├── text
 │       ├── text.no_oov
 │       ├── train
 │       ├── unigram.counts
 │       ├── word.counts
 │       └── wordlist
 ```
 ```
 /workspace/srilm/bin/i686-m64/ngram-count
 Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
 Ignoring words 矽, which contains oov unit
 Ignoring words 傩, which contains oov unit
 Ignoring words 堀, which contains oov unit
 Ignoring words 莼, which contains oov unit
 Ignoring words 菰, which contains oov unit
 Ignoring words 摭, which contains oov unit
 Ignoring words 帙, which contains oov unit
 Ignoring words 迨, which contains oov unit
 Ignoring words 孥, which contains oov unit
 Ignoring words 瑗, which contains oov unit
 ...
 ...
 ...
 file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
 0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
 build LM done.
 ```
--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@ -28,10 +28,14 @@ mkdir -p $dir
 cleantext=$dir/text.no_oov
 # Replace out-of-vocabulary tokens with <SPOKEN_NOISE>.
 # The awk BEGIN block loads the first column of $lexicon into `seen`; every field
 # of every line of $text (field 1 included) is then printed unchanged when it is
 # in `seen`, and printed as <SPOKEN_NOISE> otherwise.
 # line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
 # (the utterance id in field 1 is presumably never a lexicon entry, hence the
 # leading <SPOKEN_NOISE> - confirm against the lexicon)
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;
 # Compute word counts over fields 2..NF of the cleaned text (field 1 is skipped),
 # sorted by descending count.
 # line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;
@ -42,10 +46,13 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
 # Word list: the second column of unigram.counts (the word), followed by the
 # sentence boundary symbols <s> and </s>.
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 # Hold out the first $heldout_sent sentences of the cleaned text to compute ppl.
 heldout_sent=10000 # Don't change this if you want result to be comparable with
    # kaldi_lm results
 mkdir -p $dir
 # Field 1 is dropped; the remaining fields are re-joined with single spaces.
 # NOTE(review): `printf $n` uses the word itself as the format string, so a word
 # containing `%` would not be printed verbatim - confirm this cannot occur.
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
  head -$heldout_sent > $dir/heldout
--- a/speechx/examples/ngram/zh/path.sh
+++ b/speechx/examples/ngram/zh/path.sh
@ -0,0 +1,12 @@
 # Environment for the zh ngram example. Source this file (`. path.sh`) from the
 # example directory: MAIN_ROOT is resolved four levels above $PWD.
 # Sets MAIN_ROOT and SPEECHX_ROOT, and exports LC_ALL, LIBLBFGS,
 # LD_LIBRARY_PATH, SRILM and PATH (extended with the SRILM binaries).
 MAIN_ROOT=$(realpath "$PWD/../../../../")
 SPEECHX_ROOT=$(realpath "$MAIN_ROOT/speechx")
 # Force the C locale for all categories. The variable name is LC_ALL; the
 # former spelling `LC_AL` exported a name that no program reads.
 export LC_ALL=C
 # srilm
 export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
 export SRILM=${MAIN_ROOT}/tools/srilm
 export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
--- a/speechx/examples/ngram/zh/run.sh
+++ b/speechx/examples/ngram/zh/run.sh
@ -0,0 +1,62 @@
 #!/bin/bash
 # Build a Mandarin NGram ARPA LM.
 #   stage -1: download speech.ngram.zh.tar.gz into $PWD/data and extract it there
 #   stage  0: prepare data/local/dict (units.txt and lexicon.txt)
 #   stage  1: copy the training text and run local/aishell_train_lms.sh
 # Expects path.sh, utils/ and local/ to be resolvable from the directory the
 # script is run in. Example: `bash run.sh --stage -1`.
 set -eo pipefail
 . path.sh
 stage=0
 stop_stage=100
 corpus=aishell
 unit=data/vocab.txt       # line: char/spm_piece, vocab file
 lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
 text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
 . utils/parse_options.sh
 data=$PWD/data
 mkdir -p "$data"
 if [ "$stage" -le -1 ] && [ "$stop_stage" -ge -1 ]; then
    # Download and extract only when the archive is not already present.
    if [ ! -f "$data/speech.ngram.zh.tar.gz" ]; then
        pushd "$data"
        wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
        tar xvzf speech.ngram.zh.tar.gz
        popd
    fi
 fi
 if [ ! -f "$unit" ]; then
    echo "$0: No such file $unit" >&2
    exit 1;
 fi
 # Build SRILM on demand. `command -v` prints the resolved path on success,
 # just as `which` did.
 if ! command -v ngram-count; then
    pushd "$MAIN_ROOT/tools"
    make srilm.done
    popd
 fi
 mkdir -p data/local/dict
 if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # 7.1 Prepare dict
    # Validate the stage input up front, like $unit above.
    if [ ! -f "${lexicon}" ]; then
        echo "$0: No such file ${lexicon}" >&2
        exit 1;
    fi
    # line: char/spm_pieces
    cp "$unit" data/local/dict/units.txt
    # line: word ph0 ... phn -> line: word char0 ... charn
    utils/fst/prepare_dict.py \
        --unit_file "$unit" \
        --in_lexicon "${lexicon}" \
        --out_lexicon data/local/dict/lexicon.txt
 fi
 lm=data/local/lm
 mkdir -p "$lm"
 if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # 7.2 Train lm
    if [ ! -f "$text" ]; then
        echo "$0: No such file $text" >&2
        exit 1;
    fi
    cp "$text" "$lm/text"
    local/aishell_train_lms.sh
 fi
 echo "build LM done."
 exit 0
--- a/speechx/examples/ngram/zh/utils
+++ b/speechx/examples/ngram/zh/utils
@ -0,0 +1 @@
 ../../../../utils/
--- a/speechx/examples/text_lm/.gitignore
+++ b/speechx/examples/text_lm/.gitignore
@ -0,0 +1 @@
 data
--- a/speechx/examples/text_lm/path.sh
+++ b/speechx/examples/text_lm/path.sh
--- a/speechx/examples/text_lm/run.sh
+++ b/speechx/examples/text_lm/run.sh
--- a/speechx/examples/wfst/README.md
+++ b/speechx/examples/wfst/README.md
@ -0,0 +1,18 @@
 ```
 fstaddselfloops 'echo 4234 |' 'echo 123660 |' 
 Lexicon and Token FSTs compiling succeeded
 arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true - 
 LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
 LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
 LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
 LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
 Checking how stochastic G is (the first of these numbers should be small):
 fstisstochastic data/lang_test/G.fst 
 0 -1.14386
 fsttablecompose data/lang_test/L.fst data/lang_test/G.fst 
 fstminimizeencoded 
 fstdeterminizestar --use-log=true 
 fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst 
 Composing decoding graph TLG.fst succeeded
 Aishell build TLG done.
 ```
--- a/speechx/examples/build_wfst/path.sh
+++ b/speechx/examples/build_wfst/path.sh
@ -1,18 +1,10 @@
 # This contains the locations of the built binaries required for running the examples.
 # All roots are derived from $PWD, so this file is expected to be sourced from
 # the example directory.
-SPEECHX_ROOT=$PWD/../../../
+MAIN_ROOT=`realpath $PWD/../../../../`
-MAIN_ROOT=$SPEECHX_ROOT/../
+SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
 SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 # Prints an error when the build tree is missing, but does not stop execution.
 [ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
 # NOTE(review): `LC_AL` is not a locale variable; this was presumably meant to be
 # `LC_ALL` - confirm and fix in the script itself.
 export LC_AL=C
 export PATH=$PATH:$TOOLS_BIN
 # srilm
 export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
--- a/speechx/examples/build_wfst/run.sh
+++ b/speechx/examples/build_wfst/run.sh
@ -13,12 +13,6 @@ text=     # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
 source parse_options.sh
 if [ ! which ngram-count ]; then
    pushd $MAIN_ROOT/tools
    make srilm.done
    popd
 fi
 if [ ! which fstprint ]; then
    pushd $MAIN_ROOT/tools
    make kaldi.done