zh ngram build

pull/1715/head
Hui Zhang 3 years ago
parent 37d9c08da5
commit eb52896c4a

@ -1,12 +1,10 @@
# Examples for SpeechX
* dev - for speechx developers; used for testing.
* ngram - used to build an n-gram ARPA LM.
* ds2_ol - ds2 streaming decoding test on the `aishell-1` test set.
The entry point is `ds2_ol/aishell/run.sh`
## How to run
`run.sh` is the entry point.
@ -17,9 +15,19 @@ pushd ds2_ol/aishell
bash run.sh
```
## Display Model with [Netron](https://github.com/lutzroeder/netron)
```
pip install netron
netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
```
## Build WFST
* text_lm - process text for building the LM.
* ngram - used to build an n-gram ARPA LM.
* wfst - build the WFST for TLG; see the sketch after this list.
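A minimal sketch of the composition, assuming the Kaldi/OpenFst binaries built under `tools` are on `PATH` (the `wfst` example is the authoritative recipe; file names follow the logs shown later on this page):
```
# compose the lexicon FST (L) with the grammar FST (G),
# then prepend the token FST (T) to get the decoding graph TLG
fsttablecompose L.fst G.fst | \
    fstdeterminizestar --use-log=true | \
    fstminimizeencoded > LG.fst
fsttablecompose T.fst LG.fst > TLG.fst
```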
## For Developer
* dev - for speechx developers; used for testing.

@ -1,20 +0,0 @@
# This contains the locations of the binaries built for running the examples.
SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure that the project has been built successfully."; }
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

@ -1,61 +0,0 @@
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
corpus=aishell
unit=data/vocab.txt # vocab
lexicon= # aishell/resource_aishell/lexicon.txt
text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. parse_options.sh
data=$PWD/data
mkdir -p $data
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
if ! which fstaddselfloops; then
pushd $MAIN_ROOT/tools
make kaldi.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
cp $unit data/local/dict/units.txt
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
lm=data/local/lm
mkdir -p data/train
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
utils/manifest_key_value.py \
--manifest_path data/manifest.train \
--output_path data/train
utils/filter_scp.pl data/train/text \
$text > $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0

@ -1 +0,0 @@
../../../utils/

@ -0,0 +1,101 @@
# N-gram LM training for Mandarin
Quick run:
```
bash run.sh --stage -1
```
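Stage `-1` downloads the prepared `data/` inputs (see `run.sh`). Once the data is in place, only dict preparation and LM training need to run; since the script reads `stage`/`stop_stage` via `parse_options.sh`, this should be:
```
bash run.sh --stage 0 --stop_stage 1
```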
## Input
Input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```
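The snippets below (`head` output of each input file) show the expected formats: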
```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1
==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
<eos>
```
## Output
```
data/
├── local
│ ├── dict
│ │ ├── lexicon.txt
│ │ └── units.txt
│ └── lm
│ ├── heldout
│ ├── lm.arpa
│ ├── text
│ ├── text.no_oov
│ ├── train
│ ├── unigram.counts
│ ├── word.counts
│ └── wordlist
```
```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
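The ARPA LM itself is trained by SRILM inside `local/aishell_train_lms.sh`. As a rough sketch of the commands involved (the exact flags live in that script), it is of this shape:
```
# train a 3-gram LM restricted to the prepared wordlist (illustrative flags)
ngram-count -text data/local/lm/train -order 3 \
    -limit-vocab -vocab data/local/lm/wordlist \
    -unk -map-unk "<UNK>" -kndiscount -interpolate \
    -lm data/local/lm/lm.arpa
# score the held-out set; the ppl numbers above come from this step
ngram -lm data/local/lm/lm.arpa -ppl data/local/lm/heldout
```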

@ -28,10 +28,14 @@ mkdir -p $dir
cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
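# e.g. "BAC009S0002W0122 而 对 楼市 成交 ..." -> "<SPOKEN_NOISE> 而 对 楼市 成交 ..."
# (the utt id is never in the lexicon, so it is mapped to <SPOKEN_NOISE> as well)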
# compute word counts
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
@ -42,10 +46,13 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
# build the word list, appending <s> and </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want results to be comparable with
# kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout

@ -0,0 +1,12 @@
# This contains the locations of the binaries built for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

@ -0,0 +1,62 @@
#!/bin/bash
set -eo pipefail
. path.sh
stage=0
stop_stage=100
corpus=aishell
unit=data/vocab.txt # line: char/spm_piece, vocab file
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. utils/parse_options.sh
data=$PWD/data
mkdir -p $data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
pushd $data
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
tar xvzf speech.ngram.zh.tar.gz
popd
fi
fi
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
# line: char/spm_pieces
cp $unit data/local/dict/units.txt
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
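# e.g. (illustrative): the phone-lexicon line "坐拥 z uo4 ii iong1"
# becomes the char-lexicon line "坐拥 坐 拥"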
fi
lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
cp $text $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0

@ -0,0 +1 @@
../../../../utils/

@ -0,0 +1,18 @@
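Example logs from composing the TLG decoding graph for aishell: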
```
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
Lexicon and Token FSTs compiling succeeded
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
Checking how stochastic G is (the first of these numbers should be small):
fstisstochastic data/lang_test/G.fst
0 -1.14386
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
fstminimizeencoded
fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded
Aishell build TLG done.
```

@ -1,18 +1,10 @@
# This contains the locations of the binaries built for running the examples.
SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure that the project has been built successfully."; }
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs

@ -13,12 +13,6 @@ text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
source parse_options.sh
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
if ! which fstprint; then
pushd $MAIN_ROOT/tools
make kaldi.done