parent
37d9c08da5
commit
eb52896c4a
@ -1 +0,0 @@
|
||||
# NGram Train
|
@ -1,20 +0,0 @@
|
||||
# This contains the locations of binaries built that are required for running the examples.
|
||||
|
||||
SPEECHX_ROOT=$PWD/../../../
|
||||
MAIN_ROOT=$SPEECHX_ROOT/../
|
||||
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
|
||||
|
||||
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
|
||||
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
|
||||
|
||||
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
|
||||
|
||||
export LC_AL=C
|
||||
|
||||
export PATH=$PATH:$TOOLS_BIN
|
||||
|
||||
# srilm
|
||||
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
||||
export SRILM=${MAIN_ROOT}/tools/srilm
|
||||
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
@ -1,61 +0,0 @@
|
||||
#!/bin/bash
|
||||
set -eo pipefail
|
||||
|
||||
. path.sh
|
||||
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
corpus=aishell
|
||||
|
||||
unit=data/vocab.txt # vocab
|
||||
lexicon= # aishell/resource_aishell/lexicon.txt
|
||||
text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
|
||||
|
||||
. parse_options.sh
|
||||
|
||||
data=$PWD/data
|
||||
mkdir -p $data
|
||||
|
||||
if [ ! -f $unit ]; then
|
||||
echo "$0: No such file $unit"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
if [ ! which ngram-count ]; then
|
||||
pushd $MAIN_ROOT/tools
|
||||
make srilm.done
|
||||
popd
|
||||
fi
|
||||
|
||||
if [ ! which fstaddselfloops ]; then
|
||||
pushd $MAIN_ROOT/tools
|
||||
make kaldi.done
|
||||
popd
|
||||
fi
|
||||
|
||||
mkdir -p data/local/dict
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# 7.1 Prepare dict
|
||||
cp $unit data/local/dict/units.txt
|
||||
utils/fst/prepare_dict.py \
|
||||
--unit_file $unit \
|
||||
--in_lexicon ${lexicon} \
|
||||
--out_lexicon data/local/dict/lexicon.txt
|
||||
fi
|
||||
|
||||
lm=data/local/lm
|
||||
mkdir -p data/train
|
||||
mkdir -p $lm
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# 7.2 Train lm
|
||||
utils/manifest_key_value.py \
|
||||
--manifest_path data/manifest.train \
|
||||
--output_path data/train
|
||||
utils/filter_scp.pl data/train/text \
|
||||
$text > $lm/text
|
||||
|
||||
local/aishell_train_lms.sh
|
||||
fi
|
||||
|
||||
echo "build LM done."
|
||||
exit 0
|
@ -1 +0,0 @@
|
||||
../../../utils/
|
@ -0,0 +1,101 @@
|
||||
# ngram train for mandarin
|
||||
|
||||
Quick run:
|
||||
```
|
||||
bash run.sh --stage -1
|
||||
```
|
||||
|
||||
## input
|
||||
|
||||
input files:
|
||||
```
|
||||
data/
|
||||
├── lexicon.txt
|
||||
├── text
|
||||
└── vocab.txt
|
||||
```
|
||||
|
||||
```
|
||||
==> data/text <==
|
||||
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
|
||||
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
|
||||
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
|
||||
BAC009S0002W0125 各地 政府 便 纷纷 跟进
|
||||
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
|
||||
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
|
||||
BAC009S0002W0128 四十六 个 限 购 城市 当中
|
||||
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
|
||||
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
|
||||
BAC009S0002W0131 显示 出 了 极 强 的 威力
|
||||
|
||||
==> data/lexicon.txt <==
|
||||
SIL sil
|
||||
<SPOKEN_NOISE> sil
|
||||
啊 aa a1
|
||||
啊 aa a2
|
||||
啊 aa a4
|
||||
啊 aa a5
|
||||
啊啊啊 aa a2 aa a2 aa a2
|
||||
啊啊啊 aa a5 aa a5 aa a5
|
||||
坐地 z uo4 d i4
|
||||
坐实 z uo4 sh ix2
|
||||
坐视 z uo4 sh ix4
|
||||
坐稳 z uo4 uu un3
|
||||
坐拥 z uo4 ii iong1
|
||||
坐诊 z uo4 zh en3
|
||||
坐庄 z uo4 zh uang1
|
||||
坐姿 z uo4 z iy1
|
||||
|
||||
==> data/vocab.txt <==
|
||||
<blank>
|
||||
<unk>
|
||||
A
|
||||
B
|
||||
C
|
||||
D
|
||||
E
|
||||
龙
|
||||
龚
|
||||
龛
|
||||
<eos>
|
||||
```
|
||||
|
||||
## output
|
||||
|
||||
```
|
||||
data/
|
||||
├── local
|
||||
│ ├── dict
|
||||
│ │ ├── lexicon.txt
|
||||
│ │ └── units.txt
|
||||
│ └── lm
|
||||
│ ├── heldout
|
||||
│ ├── lm.arpa
|
||||
│ ├── text
|
||||
│ ├── text.no_oov
|
||||
│ ├── train
|
||||
│ ├── unigram.counts
|
||||
│ ├── word.counts
|
||||
│ └── wordlist
|
||||
```
|
||||
|
||||
```
|
||||
/workspace/srilm/bin/i686-m64/ngram-count
|
||||
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
|
||||
Ignoring words 矽, which contains oov unit
|
||||
Ignoring words 傩, which contains oov unit
|
||||
Ignoring words 堀, which contains oov unit
|
||||
Ignoring words 莼, which contains oov unit
|
||||
Ignoring words 菰, which contains oov unit
|
||||
Ignoring words 摭, which contains oov unit
|
||||
Ignoring words 帙, which contains oov unit
|
||||
Ignoring words 迨, which contains oov unit
|
||||
Ignoring words 孥, which contains oov unit
|
||||
Ignoring words 瑗, which contains oov unit
|
||||
...
|
||||
...
|
||||
...
|
||||
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
|
||||
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
|
||||
build LM done.
|
||||
```
|
@ -0,0 +1,12 @@
|
||||
# This contains the locations of binarys build required for running the examples.
|
||||
|
||||
MAIN_ROOT=`realpath $PWD/../../../../`
|
||||
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
|
||||
|
||||
export LC_AL=C
|
||||
|
||||
# srilm
|
||||
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
||||
export SRILM=${MAIN_ROOT}/tools/srilm
|
||||
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
|
@ -0,0 +1,62 @@
|
||||
#!/bin/bash
|
||||
set -eo pipefail
|
||||
|
||||
. path.sh
|
||||
|
||||
stage=0
|
||||
stop_stage=100
|
||||
corpus=aishell
|
||||
|
||||
unit=data/vocab.txt # line: char/spm_pice, vocab file
|
||||
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
|
||||
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
|
||||
|
||||
. utils/parse_options.sh
|
||||
|
||||
data=$PWD/data
|
||||
mkdir -p $data
|
||||
|
||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
||||
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
|
||||
pushd $data
|
||||
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
|
||||
tar xvzf speech.ngram.zh.tar.gz
|
||||
popd
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ ! -f $unit ]; then
|
||||
echo "$0: No such file $unit"
|
||||
exit 1;
|
||||
fi
|
||||
|
||||
if ! which ngram-count; then
|
||||
pushd $MAIN_ROOT/tools
|
||||
make srilm.done
|
||||
popd
|
||||
fi
|
||||
|
||||
mkdir -p data/local/dict
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
# 7.1 Prepare dict
|
||||
# line: char/spm_pices
|
||||
cp $unit data/local/dict/units.txt
|
||||
|
||||
# line: word ph0 ... phn -> line: word char0 ... charn
|
||||
utils/fst/prepare_dict.py \
|
||||
--unit_file $unit \
|
||||
--in_lexicon ${lexicon} \
|
||||
--out_lexicon data/local/dict/lexicon.txt
|
||||
fi
|
||||
|
||||
lm=data/local/lm
|
||||
mkdir -p $lm
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# 7.2 Train lm
|
||||
cp $text $lm/text
|
||||
local/aishell_train_lms.sh
|
||||
fi
|
||||
|
||||
echo "build LM done."
|
||||
exit 0
|
@ -0,0 +1 @@
|
||||
../../../../utils/
|
@ -0,0 +1 @@
|
||||
data
|
@ -0,0 +1,18 @@
|
||||
```
|
||||
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
|
||||
Lexicon and Token FSTs compiling succeeded
|
||||
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
|
||||
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
|
||||
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
|
||||
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
|
||||
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
|
||||
Checking how stochastic G is (the first of these numbers should be small):
|
||||
fstisstochastic data/lang_test/G.fst
|
||||
0 -1.14386
|
||||
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
|
||||
fstminimizeencoded
|
||||
fstdeterminizestar --use-log=true
|
||||
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
|
||||
Composing decoding graph TLG.fst succeeded
|
||||
Aishell build TLG done.
|
||||
```
|
@ -1,18 +1,10 @@
|
||||
# This contains the locations of binaries built that are required for running the examples.
|
||||
|
||||
SPEECHX_ROOT=$PWD/../../../
|
||||
MAIN_ROOT=$SPEECHX_ROOT/../
|
||||
SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
|
||||
|
||||
SPEECHX_TOOLS=$SPEECHX_ROOT/tools
|
||||
TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
|
||||
|
||||
[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
|
||||
MAIN_ROOT=`realpath $PWD/../../../../`
|
||||
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
|
||||
|
||||
export LC_AL=C
|
||||
|
||||
export PATH=$PATH:$TOOLS_BIN
|
||||
|
||||
# srilm
|
||||
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
|
||||
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
|
Loading…
Reference in new issue