zh ngram build

pull/1715/head
Hui Zhang 3 years ago
parent 37d9c08da5
commit eb52896c4a

@ -1,12 +1,10 @@
# Examples for SpeechX
* dev - for speechx developers; used for testing.
* ngram - used to build an n-gram ARPA LM.
* ds2_ol - ds2 streaming decoding test on the `aishell-1` test set.
The entry point is `ds2_ol/aishell/run.sh`
## How to run
`run.sh` is the entry point.
@ -17,9 +15,19 @@ pushd ds2_ol/aishell
bash run.sh
```
## Display Model with [Netron](https://github.com/lutzroeder/netron)
```
pip install netron
netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
```
## Build WFST
* text_lm - process text for building the LM.
* ngram - used to build an n-gram ARPA LM.
* wfst - build the WFST for TLG; see the sketch after this list.
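A minimal sketch of the composition, assuming the Kaldi/OpenFst binaries built under `tools` are on `PATH` (the `wfst` example is the authoritative recipe; file names follow the logs shown later on this page):
```
# compose the lexicon FST (L) with the grammar FST (G),
# then prepend the token FST (T) to get the decoding graph TLG
fsttablecompose L.fst G.fst | \
    fstdeterminizestar --use-log=true | \
    fstminimizeencoded > LG.fst
fsttablecompose T.fst LG.fst > TLG.fst
```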
## For Developer
* dev - for speechx developers; used for testing.

@ -1,20 +0,0 @@
# This contains the locations of the binaries built for running the examples.
SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure that the project has been built successfully."; }
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

@ -1,61 +0,0 @@
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
corpus=aishell
unit=data/vocab.txt # vocab
lexicon= # aishell/resource_aishell/lexicon.txt
text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. parse_options.sh
data=$PWD/data
mkdir -p $data
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
if ! which fstaddselfloops; then
pushd $MAIN_ROOT/tools
make kaldi.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
cp $unit data/local/dict/units.txt
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
lm=data/local/lm
mkdir -p data/train
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
utils/manifest_key_value.py \
--manifest_path data/manifest.train \
--output_path data/train
utils/filter_scp.pl data/train/text \
$text > $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0

@ -1 +0,0 @@
../../../utils/

@ -0,0 +1,101 @@
# N-gram LM training for Mandarin
Quick run:
```
bash run.sh --stage -1
```
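Stage `-1` downloads the prepared `data/` inputs (see `run.sh`). Once the data is in place, only dict preparation and LM training need to run; since the script reads `stage`/`stop_stage` via `parse_options.sh`, this should be:
```
bash run.sh --stage 0 --stop_stage 1
```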
## Input
Input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```
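The snippets below (`head` output of each input file) show the expected formats: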
```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1
==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
<eos>
```
## Output
```
data/
├── local
│ ├── dict
│ │ ├── lexicon.txt
│ │ └── units.txt
│ └── lm
│ ├── heldout
│ ├── lm.arpa
│ ├── text
│ ├── text.no_oov
│ ├── train
│ ├── unigram.counts
│ ├── word.counts
│ └── wordlist
```
```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
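The ARPA LM itself is trained by SRILM inside `local/aishell_train_lms.sh`. As a rough sketch of the commands involved (the exact flags live in that script), it is of this shape:
```
# train a 3-gram LM restricted to the prepared wordlist (illustrative flags)
ngram-count -text data/local/lm/train -order 3 \
    -limit-vocab -vocab data/local/lm/wordlist \
    -unk -map-unk "<UNK>" -kndiscount -interpolate \
    -lm data/local/lm/lm.arpa
# score the held-out set; the ppl numbers above come from this step
ngram -lm data/local/lm/lm.arpa -ppl data/local/lm/heldout
```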

@ -28,10 +28,14 @@ mkdir -p $dir
cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
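# e.g. "BAC009S0002W0122 而 对 楼市 成交 ..." -> "<SPOKEN_NOISE> 而 对 楼市 成交 ..."
# (the utt id is never in the lexicon, so it is mapped to <SPOKEN_NOISE> as well)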
# compute word counts
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
@ -42,10 +46,13 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
# build the word list, appending <s> and </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want results to be comparable with
# kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout

@ -0,0 +1,12 @@
# This contains the locations of the binaries built for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64

@ -0,0 +1,62 @@
#!/bin/bash
set -eo pipefail
. path.sh
stage=0
stop_stage=100
corpus=aishell
unit=data/vocab.txt # line: char/spm_piece, vocab file
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. utils/parse_options.sh
data=$PWD/data
mkdir -p $data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
pushd $data
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
tar xvzf speech.ngram.zh.tar.gz
popd
fi
fi
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
# line: char/spm_pieces
cp $unit data/local/dict/units.txt
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
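# e.g. (illustrative): the phone-lexicon line "坐拥 z uo4 ii iong1"
# becomes the char-lexicon line "坐拥 坐 拥"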
fi
lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
cp $text $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0

@ -0,0 +1 @@
../../../../utils/

@ -0,0 +1,18 @@
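Example logs from composing the TLG decoding graph for aishell: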
```
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
Lexicon and Token FSTs compiling succeeded
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
Checking how stochastic G is (the first of these numbers should be small):
fstisstochastic data/lang_test/G.fst
0 -1.14386
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
fstminimizeencoded
fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded
Aishell build TLG done.
```

@ -1,18 +1,10 @@
# This contains the locations of the binaries built for running the examples.
SPEECHX_ROOT=$PWD/../../../
MAIN_ROOT=$SPEECHX_ROOT/../
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure that the project has been built successfully."; }
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
export PATH=$PATH:$TOOLS_BIN
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs

@ -13,12 +13,6 @@ text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
source parse_options.sh
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
if ! which fstprint; then
pushd $MAIN_ROOT/tools
make kaldi.done