parent
37d9c08da5
commit
eb52896c4a
@ -1 +0,0 @@
|
|||||||
# NGram Train
|
|
@ -1,20 +0,0 @@
|
|||||||
# This contains the locations of binarys build required for running the examples.
|
|
||||||
|
|
||||||
SPEECHX_ROOT=$PWD/../../../
|
|
||||||
MAIN_ROOT=$SPEECHX_ROOT/../
|
|
||||||
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
|
|
||||||
|
|
||||||
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
|
|
||||||
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
|
|
||||||
|
|
||||||
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
|
|
||||||
|
|
||||||
export LC_AL=C
|
|
||||||
|
|
||||||
export PATH=$PATH:$TOOLS_BIN
|
|
||||||
|
|
||||||
# srilm
|
|
||||||
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
|
||||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
|
||||||
export SRILM=${MAIN_ROOT}/tools/srilm
|
|
||||||
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
|
@ -1,61 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -eo pipefail
|
|
||||||
|
|
||||||
. path.sh
|
|
||||||
|
|
||||||
stage=-1
|
|
||||||
stop_stage=100
|
|
||||||
corpus=aishell
|
|
||||||
|
|
||||||
unit=data/vocab.txt # vocab
|
|
||||||
lexicon= # aishell/resource_aishell/lexicon.txt
|
|
||||||
text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
|
|
||||||
|
|
||||||
. parse_options.sh
|
|
||||||
|
|
||||||
data=$PWD/data
|
|
||||||
mkdir -p $data
|
|
||||||
|
|
||||||
if [ ! -f $unit ]; then
|
|
||||||
echo "$0: No such file $unit"
|
|
||||||
exit 1;
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! which ngram-count ]; then
|
|
||||||
pushd $MAIN_ROOT/tools
|
|
||||||
make srilm.done
|
|
||||||
popd
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! which fstaddselfloops ]; then
|
|
||||||
pushd $MAIN_ROOT/tools
|
|
||||||
make kaldi.done
|
|
||||||
popd
|
|
||||||
fi
|
|
||||||
|
|
||||||
mkdir -p data/local/dict
|
|
||||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
|
||||||
# 7.1 Prepare dict
|
|
||||||
cp $unit data/local/dict/units.txt
|
|
||||||
utils/fst/prepare_dict.py \
|
|
||||||
--unit_file $unit \
|
|
||||||
--in_lexicon ${lexicon} \
|
|
||||||
--out_lexicon data/local/dict/lexicon.txt
|
|
||||||
fi
|
|
||||||
|
|
||||||
lm=data/local/lm
|
|
||||||
mkdir -p data/train
|
|
||||||
mkdir -p $lm
|
|
||||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
|
||||||
# 7.2 Train lm
|
|
||||||
utils/manifest_key_value.py \
|
|
||||||
--manifest_path data/manifest.train \
|
|
||||||
--output_path data/train
|
|
||||||
utils/filter_scp.pl data/train/text \
|
|
||||||
$text > $lm/text
|
|
||||||
|
|
||||||
local/aishell_train_lms.sh
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "build LM done."
|
|
||||||
exit 0
|
|
@ -1 +0,0 @@
|
|||||||
../../../utils/
|
|
@ -0,0 +1,101 @@
|
|||||||
|
# ngram train for mandarin
|
||||||
|
|
||||||
|
Quick run:
|
||||||
|
```
|
||||||
|
bash run.sh --stage -1
|
||||||
|
```
|
||||||
|
|
||||||
|
## input
|
||||||
|
|
||||||
|
input files:
|
||||||
|
```
|
||||||
|
data/
|
||||||
|
├── lexicon.txt
|
||||||
|
├── text
|
||||||
|
└── vocab.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
==> data/text <==
|
||||||
|
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
|
||||||
|
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
|
||||||
|
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
|
||||||
|
BAC009S0002W0125 各地 政府 便 纷纷 跟进
|
||||||
|
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
|
||||||
|
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
|
||||||
|
BAC009S0002W0128 四十六 个 限 购 城市 当中
|
||||||
|
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
|
||||||
|
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
|
||||||
|
BAC009S0002W0131 显示 出 了 极 强 的 威力
|
||||||
|
|
||||||
|
==> data/lexicon.txt <==
|
||||||
|
SIL sil
|
||||||
|
<SPOKEN_NOISE> sil
|
||||||
|
啊 aa a1
|
||||||
|
啊 aa a2
|
||||||
|
啊 aa a4
|
||||||
|
啊 aa a5
|
||||||
|
啊啊啊 aa a2 aa a2 aa a2
|
||||||
|
啊啊啊 aa a5 aa a5 aa a5
|
||||||
|
坐地 z uo4 d i4
|
||||||
|
坐实 z uo4 sh ix2
|
||||||
|
坐视 z uo4 sh ix4
|
||||||
|
坐稳 z uo4 uu un3
|
||||||
|
坐拥 z uo4 ii iong1
|
||||||
|
坐诊 z uo4 zh en3
|
||||||
|
坐庄 z uo4 zh uang1
|
||||||
|
坐姿 z uo4 z iy1
|
||||||
|
|
||||||
|
==> data/vocab.txt <==
|
||||||
|
<blank>
|
||||||
|
<unk>
|
||||||
|
A
|
||||||
|
B
|
||||||
|
C
|
||||||
|
D
|
||||||
|
E
|
||||||
|
龙
|
||||||
|
龚
|
||||||
|
龛
|
||||||
|
<eos>
|
||||||
|
```
|
||||||
|
|
||||||
|
## output
|
||||||
|
|
||||||
|
```
|
||||||
|
data/
|
||||||
|
├── local
|
||||||
|
│ ├── dict
|
||||||
|
│ │ ├── lexicon.txt
|
||||||
|
│ │ └── units.txt
|
||||||
|
│ └── lm
|
||||||
|
│ ├── heldout
|
||||||
|
│ ├── lm.arpa
|
||||||
|
│ ├── text
|
||||||
|
│ ├── text.no_oov
|
||||||
|
│ ├── train
|
||||||
|
│ ├── unigram.counts
|
||||||
|
│ ├── word.counts
|
||||||
|
│ └── wordlist
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
/workspace/srilm/bin/i686-m64/ngram-count
|
||||||
|
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
|
||||||
|
Ignoring words 矽, which contains oov unit
|
||||||
|
Ignoring words 傩, which contains oov unit
|
||||||
|
Ignoring words 堀, which contains oov unit
|
||||||
|
Ignoring words 莼, which contains oov unit
|
||||||
|
Ignoring words 菰, which contains oov unit
|
||||||
|
Ignoring words 摭, which contains oov unit
|
||||||
|
Ignoring words 帙, which contains oov unit
|
||||||
|
Ignoring words 迨, which contains oov unit
|
||||||
|
Ignoring words 孥, which contains oov unit
|
||||||
|
Ignoring words 瑗, which contains oov unit
|
||||||
|
...
|
||||||
|
...
|
||||||
|
...
|
||||||
|
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
|
||||||
|
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
|
||||||
|
build LM done.
|
||||||
|
```
|
@ -0,0 +1,12 @@
|
|||||||
|
# This contains the locations of the built binaries required for running the examples.
|
||||||
|
|
||||||
|
MAIN_ROOT=`realpath $PWD/../../../../`
|
||||||
|
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
|
||||||
|
|
||||||
|
# Select the C locale for every locale category. The variable must be spelled
# LC_ALL; the previous "LC_AL" is not read by any tool, so it had no effect.
export LC_ALL=C
|
||||||
|
|
||||||
|
# srilm
|
||||||
|
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
||||||
|
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
||||||
|
export SRILM=${MAIN_ROOT}/tools/srilm
|
||||||
|
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
@ -0,0 +1,62 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -eo pipefail
|
||||||
|
|
||||||
|
. path.sh
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
stop_stage=100
|
||||||
|
corpus=aishell
|
||||||
|
|
||||||
|
unit=data/vocab.txt # line: char/spm_piece, vocab file
|
||||||
|
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
|
||||||
|
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
|
||||||
|
|
||||||
|
. utils/parse_options.sh
|
||||||
|
|
||||||
|
data=$PWD/data
|
||||||
|
mkdir -p $data
|
||||||
|
|
||||||
|
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
||||||
|
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
|
||||||
|
pushd $data
|
||||||
|
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
|
||||||
|
tar xvzf speech.ngram.zh.tar.gz
|
||||||
|
popd
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -f $unit ]; then
|
||||||
|
echo "$0: No such file $unit"
|
||||||
|
exit 1;
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! which ngram-count; then
|
||||||
|
pushd $MAIN_ROOT/tools
|
||||||
|
make srilm.done
|
||||||
|
popd
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p data/local/dict
|
||||||
|
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||||
|
# 7.1 Prepare dict
|
||||||
|
# line: char/spm_pieces
|
||||||
|
cp $unit data/local/dict/units.txt
|
||||||
|
|
||||||
|
# line: word ph0 ... phn -> line: word char0 ... charn
|
||||||
|
utils/fst/prepare_dict.py \
|
||||||
|
--unit_file $unit \
|
||||||
|
--in_lexicon ${lexicon} \
|
||||||
|
--out_lexicon data/local/dict/lexicon.txt
|
||||||
|
fi
|
||||||
|
|
||||||
|
lm=data/local/lm
|
||||||
|
mkdir -p $lm
|
||||||
|
|
||||||
|
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||||
|
# 7.2 Train lm
|
||||||
|
cp $text $lm/text
|
||||||
|
local/aishell_train_lms.sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "build LM done."
|
||||||
|
exit 0
|
@ -0,0 +1 @@
|
|||||||
|
../../../../utils/
|
@ -0,0 +1 @@
|
|||||||
|
data
|
@ -0,0 +1,18 @@
|
|||||||
|
```
|
||||||
|
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
|
||||||
|
Lexicon and Token FSTs compiling succeeded
|
||||||
|
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
|
||||||
|
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
|
||||||
|
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
|
||||||
|
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
|
||||||
|
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
|
||||||
|
Checking how stochastic G is (the first of these numbers should be small):
|
||||||
|
fstisstochastic data/lang_test/G.fst
|
||||||
|
0 -1.14386
|
||||||
|
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
|
||||||
|
fstminimizeencoded
|
||||||
|
fstdeterminizestar --use-log=true
|
||||||
|
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
|
||||||
|
Composing decoding graph TLG.fst succeeded
|
||||||
|
Aishell build TLG done.
|
||||||
|
```
|
@ -1,18 +1,10 @@
|
|||||||
# This contains the locations of binarys build required for running the examples.
|
# This contains the locations of binarys build required for running the examples.
|
||||||
|
|
||||||
SPEECHX_ROOT=$PWD/../../../
|
MAIN_ROOT=`realpath $PWD/../../../../`
|
||||||
MAIN_ROOT=$SPEECHX_ROOT/../
|
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
|
||||||
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
|
|
||||||
|
|
||||||
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
|
|
||||||
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
|
|
||||||
|
|
||||||
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
|
|
||||||
|
|
||||||
export LC_AL=C
|
export LC_AL=C
|
||||||
|
|
||||||
export PATH=$PATH:$TOOLS_BIN
|
|
||||||
|
|
||||||
# srilm
|
# srilm
|
||||||
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
||||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
Loading…
Reference in new issue