commit 919c8d0607 (pull/1992/head)
liangym, 3 years ago, committed by lym0302

Merge branch 'PaddlePaddle:develop' into update_engine

@@ -93,6 +93,7 @@
 function parseResult(data) {
     var data = JSON.parse(data)
+    console.log('result json:', data)
     var result = data.result
     console.log(result)
     $("#resultPanel").html(result)

@@ -0,0 +1,15 @@
+FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2
+LABEL maintainer="paddlesl@baidu.com"
+RUN git clone --depth 1 https://github.com/PaddlePaddle/PaddleSpeech.git /home/PaddleSpeech
+RUN pip3 uninstall mccabe -y ; exit 0;
+RUN pip3 install multiprocess==0.70.12 importlib-metadata==4.2.0 dill==0.3.4
+RUN cd /home/PaddleSpeech/audio
+RUN python setup.py bdist_wheel
+RUN cd /home/PaddleSpeech
+RUN python setup.py bdist_wheel
+RUN pip install audio/dist/*.whl dist/*.whl
+WORKDIR /home/PaddleSpeech/

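For orientation, the new Dockerfile builds the audio and main PaddleSpeech wheels inside a PaddlePaddle 2.2.2 base image. Note that each RUN starts in the image's default working directory, so the bare `RUN cd ...` lines do not carry over to the following `RUN python setup.py bdist_wheel`; combining the cd with the build command (or setting WORKDIR earlier) is normally needed for the wheels to be built from the intended directories. A minimal build-and-run sketch, with an illustrative image tag:

    # build the image from the directory containing this Dockerfile
    docker build -t paddlespeech:dev .
    # start an interactive shell; WORKDIR drops you into /home/PaddleSpeech/
    docker run -it --rm paddlespeech:dev /bin/bash
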
@@ -1,36 +0,0 @@
-[
-    {
-        "type": "speed",
-        "params": {
-            "min_speed_rate": 0.9,
-            "max_speed_rate": 1.1,
-            "num_rates": 3
-        },
-        "prob": 0.0
-    },
-    {
-        "type": "shift",
-        "params": {
-            "min_shift_ms": -5,
-            "max_shift_ms": 5
-        },
-        "prob": 1.0
-    },
-    {
-        "type": "specaug",
-        "params": {
-            "W": 0,
-            "warp_mode": "PIL",
-            "F": 10,
-            "n_freq_masks": 2,
-            "T": 50,
-            "n_time_masks": 2,
-            "p": 1.0,
-            "adaptive_number_ratio": 0,
-            "adaptive_size_ratio": 0,
-            "max_n_time_masks": 20,
-            "replace_with_zero": true
-        },
-        "prob": 1.0
-    }
-]

@@ -15,50 +15,53 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
+num_rnn_layers: 5
 rnn_layer_size: 1024
-use_gru: True
-share_rnn_weights: False
+rnn_direction: bidirect # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
+use_gru: False
 blank_id: 0
+ctc_grad_norm_type: instance
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 80
+n_epoch: 50
 accum_grad: 1
-lr: 2.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5

@@ -15,28 +15,26 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 64 # one gpu
-mean_std_filepath: data/mean_std.json
-unit_type: char
 vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear #linear, mfcc, fbank
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #
@@ -54,12 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 65
+n_epoch: 30
 accum_grad: 1
 lr: 5.0e-4
 lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 3.0
+dist_sampler: False
 log_interval: 100
 checkpoint:
   kbest_n: 50

@@ -0,0 +1,25 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 161
+    n_shift: 160
+    win_length: 400
+    dither: 0.1
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugument
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false

@@ -2,9 +2,9 @@ decode_batch_size: 128
 error_rate_type: cer
 decoding_method: ctc_beam_search
 lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-alpha: 1.9
-beta: 5.0
-beam_size: 300
+alpha: 2.2
+beta: 4.3
+beam_size: 500
 cutoff_prob: 0.99
 cutoff_top_n: 40
 num_proc_bsearch: 10

@@ -33,12 +33,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --stride_ms=10 \
-    --window_ms=20 \
+    --window_ms=25 \
     --sample_rate=16000 \
-    --use_dB_normalization=True \
+    --use_dB_normalization=False \
     --num_samples=2000 \
     --num_workers=${num_workers} \
     --output_path="data/mean_std.json"

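This hunk moves the CMVN statistics from 161-bin linear spectrograms to 161-dimensional fbank features, matching the fbank_kaldi settings in the new conf/preprocess.yaml. If the stats ever need to be regenerated outside the recipe, the call reduces to roughly the following sketch (assumes path.sh has been sourced so ${MAIN_ROOT} is set; the worker count is arbitrary):

    # recompute data/mean_std.json with the new fbank front end
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
        --manifest_path="data/manifest.train.raw" \
        --spectrum_type="fbank" --feat_dim=161 --delta_delta=false \
        --stride_ms=10 --window_ms=25 --sample_rate=16000 \
        --use_dB_normalization=False --num_samples=2000 \
        --num_workers=4 --output_path="data/mean_std.json"
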
@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi
@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
-model_type=$4
 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}
 if [ $? -ne 0 ]; then
     echo "Failed in export!"

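With model_type gone, export.sh takes exactly three positional arguments: config path, checkpoint prefix, and the jit output path. A typical call, using the avg_10 checkpoint naming assumed elsewhere in this PR:

    # export the averaged checkpoint to a static-graph (jit) model
    ./local/export.sh conf/deepspeech2.yaml \
        exp/deepspeech2/checkpoints/avg_10 \
        exp/deepspeech2/checkpoints/avg_10.jit
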
@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
@@ -13,7 +13,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
 # download language model
 bash local/download_lm_ch.sh
@@ -23,7 +22,7 @@ fi
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # format the reference test file
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_ref data/manifest.test.raw \
         --trans_ref data/manifest.test.text
@@ -32,8 +31,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --config ${config_path} \
         --decode_cfg ${decode_config_path} \
         --result_file ${ckpt_prefix}.rsl \
-        --checkpoint_path ${ckpt_prefix} \
-        --model_type ${model_type}
+        --checkpoint_path ${ckpt_prefix}
     if [ $? -ne 0 ]; then
         echo "Failed in evaluation!"
@@ -41,25 +39,25 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     fi
     # format the hyp file
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_hyp ${ckpt_prefix}.rsl \
         --trans_hyp ${ckpt_prefix}.rsl.text
-    python utils/compute-wer.py --char=1 --v=1 \
+    python3 utils/compute-wer.py --char=1 --v=1 \
         data/manifest.test.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
 fi
 if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_ref data/manifest.test.raw \
         --trans_ref_sclite data/manifest.test.text.sclite
-    python utils/format_rsl.py \
+    python3 utils/format_rsl.py \
         --origin_hyp ${ckpt_prefix}.rsl \
         --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
     mkdir -p ${ckpt_prefix}_sclite
     sclite -i wsj -r data/manifest.test.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
 fi
 exit 0

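The evaluation entry point drops model_type as well; run.sh now invokes it with three arguments, for example (checkpoint path illustrative):

    # CER evaluation with the CTC beam-search decoder and the Mandarin LM
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/deepspeech2.yaml \
        conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10
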
@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
@@ -11,7 +11,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 jit_model_export_path=$3
-model_type=$4
 # download language model
 bash local/download_lm_ch.sh > /dev/null 2>&1
@@ -24,8 +23,7 @@ python3 -u ${BIN_DIR}/test_export.py \
     --config ${config_path} \
     --decode_cfg ${decode_config_path} \
     --result_file ${jit_model_export_path}.rsl \
-    --export_path ${jit_model_export_path} \
-    --model_type ${model_type}
+    --export_path ${jit_model_export_path}
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"

@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 5 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
     exit -1
 fi
@@ -11,8 +11,7 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
-audio_file=$5
+audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@@ -37,7 +36,6 @@ python3 -u ${BIN_DIR}/test_wav.py \
     --decode_cfg ${decode_config_path} \
     --result_file ${ckpt_prefix}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --model_type ${model_type} \
     --audio_file ${audio_file}
 if [ $? -ne 0 ]; then

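After this change the single-file decoding script takes the audio path as its fourth argument; the demo wav that the script itself downloads can be reused directly (checkpoint path illustrative):

    # decode one utterance with the averaged checkpoint
    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh conf/deepspeech2.yaml \
        conf/tuning/decode.yaml exp/deepspeech2/checkpoints/avg_10 \
        data/demo_01_03.wav
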
@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi
@@ -10,7 +10,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
-model_type=$3
 mkdir -p exp
@@ -25,14 +24,12 @@ python3 -u ${BIN_DIR}/train.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --output exp/${ckpt_name} \
-    --model_type ${model_type} \
     --seed ${seed}
 else
     python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --output exp/${ckpt_name} \
-    --model_type ${model_type} \
     --seed ${seed}
 fi

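Training now needs only the config path and a checkpoint name; whether paddle.distributed.launch is used follows from how many GPUs are visible. For example (checkpoint name illustrative):

    # multi-GPU training of the unified DeepSpeech2 model
    CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh conf/deepspeech2.yaml deepspeech2
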
@@ -7,8 +7,7 @@ stage=0
 stop_stage=100
-conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
+conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=1
-model_type=offline # offline or online
+avg_num=10
 audio_file=data/demo_01_03.wav
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -25,7 +24,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -35,21 +34,21 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}|| exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit|| exit -1
 fi
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi

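Because run.sh sources utils/parse_options.sh, the remaining variables (gpus, stage, stop_stage, conf_path, decode_conf_path, avg_num, audio_file) can still be overridden on the command line, while model_type is no longer accepted. A sketch of a full run under that assumption:

    # data prep through single-wav test, averaging the 10 best checkpoints
    bash run.sh --stage 0 --stop_stage 6 \
        --conf_path conf/deepspeech2.yaml \
        --decode_conf_path conf/tuning/decode.yaml \
        --avg_num 10
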
@@ -1,36 +0,0 @@
-[
-    {
-        "type": "speed",
-        "params": {
-            "min_speed_rate": 0.9,
-            "max_speed_rate": 1.1,
-            "num_rates": 3
-        },
-        "prob": 0.0
-    },
-    {
-        "type": "shift",
-        "params": {
-            "min_shift_ms": -5,
-            "max_shift_ms": 5
-        },
-        "prob": 1.0
-    },
-    {
-        "type": "specaug",
-        "params": {
-            "W": 0,
-            "warp_mode": "PIL",
-            "F": 10,
-            "n_freq_masks": 2,
-            "T": 50,
-            "n_time_masks": 2,
-            "p": 1.0,
-            "adaptive_number_ratio": 0,
-            "adaptive_size_ratio": 0,
-            "max_n_time_masks": 20,
-            "replace_with_zero": true
-        },
-        "prob": 1.0
-    }
-]

@@ -15,51 +15,51 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 20
-mean_std_filepath: data/mean_std.json
-unit_type: char
-vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 161
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
+rnn_direction: bidirect
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
-share_rnn_weights: True
 blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
+n_epoch: 15
 accum_grad: 1
-lr: 1.0e-3
-lr_decay: 0.83
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+dist_sampler: False
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5

@@ -15,39 +15,36 @@ max_output_input_ratio: .inf
 ###########################################
 # Dataloader                              #
 ###########################################
-batch_size: 15
-mean_std_filepath: data/mean_std.json
-unit_type: char
-vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
-feat_dim:
-target_sample_rate: 16000
-max_freq: None
-n_fft: None
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 161
 stride_ms: 10.0
-window_ms: 20.0
-delta_delta: False
-dither: 1.0
-use_dB_normalization: True
-target_dB: -20
-random_seed: 0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #
 ############################################
 num_conv_layers: 2
-num_rnn_layers: 3
-rnn_layer_size: 2048
+num_rnn_layers: 5
+rnn_layer_size: 1024
 rnn_direction: forward
-num_fc_layers: 2
-fc_layers_size_list: 512, 256
+num_fc_layers: 0
+fc_layers_size_list: -1
 use_gru: False
 blank_id: 0
@@ -55,13 +52,13 @@ blank_id: 0
 ###########################################
 # Training                                #
 ###########################################
-n_epoch: 50
-accum_grad: 4
-lr: 1.0e-3
-lr_decay: 0.83
+n_epoch: 65
+accum_grad: 1
+lr: 5.0e-4
+lr_decay: 0.93
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
-log_interval: 100
+log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5

@@ -0,0 +1,25 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 161
+    n_shift: 160
+    win_length: 400
+    dither: 0.1
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugument
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false

@@ -49,12 +49,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --spectrum_type="linear" \
+    --spectrum_type="fbank" \
+    --feat_dim=161 \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10 \
-    --window_ms=20 \
-    --use_dB_normalization=True \
+    --window_ms=25 \
+    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"

@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi
@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
-model_type=$4
 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}
 if [ $? -ne 0 ]; then
     echo "Failed in export!"

@@ -1,9 +1,11 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
+stage=0
+stop_stage=100
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
@@ -11,7 +13,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
 # download language model
 bash local/download_lm_en.sh
@@ -19,17 +20,43 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
-python3 -u ${BIN_DIR}/test.py \
---ngpu ${ngpu} \
---config ${config_path} \
---decode_cfg ${decode_config_path} \
---result_file ${ckpt_prefix}.rsl \
---checkpoint_path ${ckpt_prefix} \
---model_type ${model_type}
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # format the reference test file
+    python3 utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref data/manifest.test-clean.text
+    python3 -u ${BIN_DIR}/test.py \
+        --ngpu ${ngpu} \
+        --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
+        --result_file ${ckpt_prefix}.rsl \
+        --checkpoint_path ${ckpt_prefix}
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+    python3 utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp ${ckpt_prefix}.rsl.text
+    python3 utils/compute-wer.py --char=1 --v=1 \
+        data/manifest.test-clean.text ${ckpt_prefix}.rsl.text > ${ckpt_prefix}.error
+fi
+if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+    python3 utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref_sclite data/manifest.test.text-clean.sclite
+    python3 utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.rsl \
+        --trans_hyp_sclite ${ckpt_prefix}.rsl.text.sclite
+    mkdir -p ${ckpt_prefix}_sclite
+    sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${ckpt_prefix}.rsl.text.sclite -e utf-8 -o all -O ${ckpt_prefix}_sclite -c NOASCII
 fi

@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 5 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
     exit -1
 fi
@@ -11,8 +11,7 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
-audio_file=$5
+audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@@ -37,7 +36,6 @@ python3 -u ${BIN_DIR}/test_wav.py \
     --decode_cfg ${decode_config_path} \
     --result_file ${ckpt_prefix}.rsl \
     --checkpoint_path ${ckpt_prefix} \
-    --model_type ${model_type} \
     --audio_file ${audio_file}
 if [ $? -ne 0 ]; then

@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi
@@ -10,7 +10,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
-model_type=$3
 mkdir -p exp
@@ -25,14 +24,12 @@ python3 -u ${BIN_DIR}/train.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --output exp/${ckpt_name} \
-    --model_type ${model_type} \
     --seed ${seed}
 else
     python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --output exp/${ckpt_name} \
-    --model_type ${model_type} \
     --seed ${seed}
 fi

@@ -2,13 +2,12 @@
 set -e
 source path.sh
-gpus=0,1,2,3,4,5,6,7
+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
-avg_num=30
-model_type=offline
+avg_num=5
 audio_file=data/demo_002_en.wav
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -24,7 +23,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -34,15 +33,20 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}|| exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # test export ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit|| exit -1
+fi
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi

@@ -1,36 +0,0 @@
-[
-    {
-        "type": "speed",
-        "params": {
-            "min_speed_rate": 0.9,
-            "max_speed_rate": 1.1,
-            "num_rates": 3
-        },
-        "prob": 0.0
-    },
-    {
-        "type": "shift",
-        "params": {
-            "min_shift_ms": -5,
-            "max_shift_ms": 5
-        },
-        "prob": 1.0
-    },
-    {
-        "type": "specaug",
-        "params": {
-            "W": 5,
-            "warp_mode": "PIL",
-            "F": 30,
-            "n_freq_masks": 2,
-            "T": 40,
-            "n_time_masks": 2,
-            "p": 1.0,
-            "adaptive_number_ratio": 0,
-            "adaptive_size_ratio": 0,
-            "max_n_time_masks": 20,
-            "replace_with_zero": true
-        },
-        "prob": 1.0
-    }
-]

@@ -16,28 +16,26 @@ max_output_input_ratio: 10.0
 ###########################################
 # Dataloader                              #
 ###########################################
-mean_std_filepath: data/mean_std.json
-unit_type: char
-vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 2
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
 batch_size: 4
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #
@@ -45,8 +43,10 @@ batch_size: 4
 num_conv_layers: 2
 num_rnn_layers: 3
 rnn_layer_size: 2048
+rnn_direction: bidirect # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
 use_gru: False
-share_rnn_weights: True
 blank_id: 0
@@ -59,6 +59,7 @@ lr: 1.0e-5
 lr_decay: 0.8
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
+dist_sampler: False
 log_interval: 1
 checkpoint:
   kbest_n: 3

@@ -16,29 +16,27 @@ max_output_input_ratio: 10.0
 ###########################################
 # Dataloader                              #
 ###########################################
-mean_std_filepath: data/mean_std.json
-unit_type: char
-vocab_filepath: data/lang_char/vocab.txt
-augmentation_config: conf/augmentation.json
-random_seed: 0
-spm_model_prefix:
-spectrum_type: linear
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
 feat_dim: 161
-delta_delta: False
 stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
-target_sample_rate: 16000
-use_dB_normalization: True
-target_dB: -20
-dither: 1.0
-keep_transcription_text: False
-sortagrad: True
-shuffle_method: batch_shuffle
-num_workers: 0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
 batch_size: 4
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 8
+subsampling_factor: 1
+num_encs: 1
 ############################################
 # Network Architecture                     #
 ############################################
@@ -61,6 +59,7 @@ lr: 1.0e-5
 lr_decay: 1.0
 weight_decay: 1.0e-6
 global_grad_clip: 5.0
+dist_sampler: False
 log_interval: 1
 checkpoint:
   kbest_n: 3

@@ -0,0 +1,25 @@
+process:
+  # extract kaldi fbank from PCM
+  - type: fbank_kaldi
+    fs: 16000
+    n_mels: 161
+    n_shift: 160
+    win_length: 400
+    dither: 0.1
+  - type: cmvn_json
+    cmvn_path: data/mean_std.json
+  # these three processes are a.k.a. SpecAugument
+  - type: time_warp
+    max_time_warp: 5
+    inplace: true
+    mode: PIL
+  - type: freq_mask
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: time_mask
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false

@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: $0 config_path ckpt_prefix jit_model_path model_type"
+if [ $# != 3 ];then
+    echo "usage: $0 config_path ckpt_prefix jit_model_path"
     exit -1
 fi
@@ -11,14 +11,12 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_path_prefix=$2
 jit_model_export_path=$3
-model_type=$4
 python3 -u ${BIN_DIR}/export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
---export_path ${jit_model_export_path} \
---model_type ${model_type}
+--export_path ${jit_model_export_path}
 if [ $? -ne 0 ]; then
     echo "Failed in export!"

@@ -1,7 +1,7 @@
 #!/bin/bash
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
@@ -11,7 +11,6 @@ echo "using $ngpu gpus..."
 config_path=$1
 decode_config_path=$2
 ckpt_prefix=$3
-model_type=$4
 # download language model
 bash local/download_lm_en.sh
@@ -24,8 +23,7 @@ python3 -u ${BIN_DIR}/test.py \
     --config ${config_path} \
     --decode_cfg ${decode_config_path} \
     --result_file ${ckpt_prefix}.rsl \
-    --checkpoint_path ${ckpt_prefix} \
-    --model_type ${model_type}
+    --checkpoint_path ${ckpt_prefix}
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"

@@ -15,14 +15,13 @@ if [ ${seed} != 0 ]; then
     echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi
-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
+if [ $# != 2 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
     exit -1
 fi
 config_path=$1
 ckpt_name=$2
-model_type=$3
 mkdir -p exp
@@ -31,7 +30,6 @@ python3 -u ${BIN_DIR}/train.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --output exp/${ckpt_name} \
-    --model_type ${model_type} \
     --profiler-options "${profiler_options}" \
     --seed ${seed}
 else
@@ -39,7 +37,6 @@ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
     --ngpu ${ngpu} \
     --config ${config_path} \
     --output exp/${ckpt_name} \
-    --model_type ${model_type} \
     --profiler-options "${profiler_options}" \
     --seed ${seed}
 fi

@@ -8,8 +8,6 @@ stop_stage=100
 conf_path=conf/deepspeech2.yaml
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
-model_type=offline
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 avg_ckpt=avg_${avg_num}
@@ -23,7 +21,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -33,10 +31,10 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}|| exit -1
 fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
 fi

@@ -138,6 +138,7 @@ class ASRExecutor(BaseExecutor):
             tag = model_type + '-' + lang + '-' + sample_rate_str
             self.task_resource.set_task_model(tag, version=None)
             self.res_path = self.task_resource.res_dir
+
             self.cfg_path = os.path.join(
                 self.res_path, self.task_resource.res_dict['cfg_path'])
             self.ckpt_path = os.path.join(
@@ -158,15 +159,18 @@ class ASRExecutor(BaseExecutor):
         self.config.merge_from_file(self.cfg_path)
         with UpdateConfig(self.config):
-            if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-                from paddlespeech.s2t.io.collator import SpeechCollator
-                self.vocab = self.config.vocab_filepath
+            if self.config.spm_model_prefix:
+                self.config.spm_model_prefix = os.path.join(
+                    self.res_path, self.config.spm_model_prefix)
+            self.text_feature = TextFeaturizer(
+                unit_type=self.config.unit_type,
+                vocab=self.config.vocab_filepath,
+                spm_model_prefix=self.config.spm_model_prefix)
+            if "deepspeech2" in model_type:
                 self.config.decode.lang_model_path = os.path.join(
                     MODEL_HOME, 'language_model',
                     self.config.decode.lang_model_path)
-                self.collate_fn_test = SpeechCollator.from_config(self.config)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type, vocab=self.vocab)
                 lm_url = self.task_resource.res_dict['lm_url']
                 lm_md5 = self.task_resource.res_dict['lm_md5']
                 self.download_lm(
@@ -174,12 +178,6 @@ class ASRExecutor(BaseExecutor):
                     os.path.dirname(self.config.decode.lang_model_path), lm_md5)
             elif "conformer" in model_type or "transformer" in model_type:
-                self.config.spm_model_prefix = os.path.join(
-                    self.res_path, self.config.spm_model_prefix)
-                self.text_feature = TextFeaturizer(
-                    unit_type=self.config.unit_type,
-                    vocab=self.config.vocab_filepath,
-                    spm_model_prefix=self.config.spm_model_prefix)
                 self.config.decode.decoding_method = decode_method
             else:
@@ -222,19 +220,7 @@ class ASRExecutor(BaseExecutor):
         logger.info("Preprocess audio_file:" + audio_file)
         # Get the object for feature extraction
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
-            audio, _ = self.collate_fn_test.process_utterance(
-                audio_file=audio_file, transcript=" ")
-            audio_len = audio.shape[0]
-            audio = paddle.to_tensor(audio, dtype='float32')
-            audio_len = paddle.to_tensor(audio_len)
-            audio = paddle.unsqueeze(audio, axis=0)
-            # vocab_list = collate_fn_test.vocab_list
-            self._inputs["audio"] = audio
-            self._inputs["audio_len"] = audio_len
-            logger.info(f"audio feat shape: {audio.shape}")
-        elif "conformer" in model_type or "transformer" in model_type:
+        if "deepspeech2" in model_type or "conformer" in model_type or "transformer" in model_type:
             logger.info("get the preprocess conf")
             preprocess_conf = self.config.preprocess_config
             preprocess_args = {"train": False}
@@ -242,7 +228,6 @@ class ASRExecutor(BaseExecutor):
         logger.info("read the audio file")
         audio, audio_sample_rate = soundfile.read(
             audio_file, dtype="int16", always_2d=True)
-
         if self.change_format:
             if audio.shape[1] >= 2:
                 audio = audio.mean(axis=1, dtype=np.int16)
@@ -285,7 +270,7 @@ class ASRExecutor(BaseExecutor):
         cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
-        if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+        if "deepspeech2" in model_type:
             decode_batch_size = audio.shape[0]
             self.model.decoder.init_decoder(
                 decode_batch_size, self.text_feature.vocab_list,

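With both DeepSpeech2 variants now routed through the same TextFeaturizer/preprocess path in ASRExecutor, CLI usage is unchanged. A quick sanity check against the refreshed streaming AISHELL model might look like this (the wav path is an assumption about your local files):

    # transcribe a 16 kHz Mandarin recording with the updated online model
    paddlespeech asr --model deepspeech2online_aishell \
        --lang zh --sample_rate 16000 --input data/demo_01_03.wav
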
@@ -23,7 +23,7 @@ model_alias = {
     # ---------------------------------
     "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "deepspeech2online":
-    ["paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline"],
+    ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
     "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
     "transformer": ["paddlespeech.s2t.models.u2:U2Model"],

@@ -136,9 +136,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_wenetspeech-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'e393d4d274af0f6967db24fc146e8074',
+            'd1be86a3e786042ab64f05161b5fae62',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
@@ -152,13 +152,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
@@ -168,9 +168,9 @@ asr_dynamic_pretrained_models = {
     "deepspeech2online_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '98b87b171b7240b7cae6e07d8d0bc9be',
+            'df5ddeac8b679a470176649ac4b78726',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
@@ -188,13 +188,13 @@ asr_dynamic_pretrained_models = {
     "deepspeech2offline_librispeech-en-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz',
             'md5':
-            'f5666c81ad015c8de03aac2bc92e5762',
+            'ed9e2b008a65268b3484020281ab048c',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_5',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
             'lm_md5':
@@ -207,17 +207,17 @@ asr_static_pretrained_models = {
     "deepspeech2offline_aishell-zh-16k": {
         '1.0': {
             'url':
-            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
+            'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz',
             'md5':
-            '932c3593d62fe5c741b59b31318aa314',
+            '4d26066c6f19f52087425dc722ae5b13',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
-            'exp/deepspeech2/checkpoints/avg_1',
+            'exp/deepspeech2/checkpoints/avg_10',
             'model':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdmodel',
             'params':
-            'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
+            'exp/deepspeech2/checkpoints/avg_10.jit.pdiparams',
             'lm_url':
             'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
             'lm_md5':
@@ -830,7 +830,7 @@ vector_dynamic_pretrained_models = {
             'cfg_path':
             'conf/model.yaml',  # the yaml config path
             'ckpt_path':
             'model/model',  # the format is ${dir}/{model_name},
             # so the first 'model' is dir, the second 'model' is the name
             # this means we have a model stored as model/model.pdparams
         },

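The resource table now points at the 1.0.1 DeepSpeech2 archives with new md5 sums and avg_10/avg_5 checkpoint paths. To fetch and verify one of them by hand (download location is up to you):

    # download the refreshed offline AISHELL model and check its integrity
    wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
    md5sum asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz   # expect 4d26066c6f19f52087425dc722ae5b13
    tar -xzf asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz
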
@@ -32,11 +32,9 @@ def main(config, args):
 if __name__ == "__main__":
     parser = default_argument_parser()
     # save jit model to
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help="offline/online")
     parser.add_argument(
         '--nxpu',
         type=int,
@@ -44,7 +42,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args)
     # https://yaml.org/type/float.html

@@ -32,9 +32,7 @@
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     # save asr result to
     parser.add_argument(
         "--result_file", type=str, help="path of save the asr result")
     parser.add_argument(
@@ -45,7 +43,6 @@ if __name__ == "__main__":
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))
     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)

@@ -38,8 +38,6 @@ if __name__ == "__main__":
     #load jit model from
     parser.add_argument(
         "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,
@@ -50,7 +48,6 @@ if __name__ == "__main__":
         "--enable-auto-log", action="store_true", help="use auto log")
     args = parser.parse_args()
     print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))
     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)

@@ -23,7 +23,6 @@ from yacs.config import CfgNode
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.io.collator import SpeechCollator
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils import mp_tools
 from paddlespeech.s2t.utils.checkpoint import Checkpoint
@@ -113,12 +112,7 @@ class DeepSpeech2Tester_hub():
         config.input_dim = self.collate_fn_test.feature_size
         config.output_dim = self.collate_fn_test.vocab_size
-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config)
-        else:
-            raise Exception("wrong model type")
+        model = DeepSpeech2Model.from_config(config)
         self.model = model
@@ -172,8 +166,6 @@ def main(config, args):
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument("--audio_file", type=str, help='audio file path')
     # save asr result to
     parser.add_argument(
@@ -184,7 +176,6 @@ if __name__ == "__main__":
         print("Please input the audio file path")
         sys.exit(-1)
     check(args.audio_file)
-    print("model_type:{}".format(args.model_type))
     # https://yaml.org/type/float.html
     config = CfgNode(new_allowed=True)

@@ -31,8 +31,6 @@ def main(config, args):
 if __name__ == "__main__":
     parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
     parser.add_argument(
         '--nxpu',
         type=int,
@@ -40,7 +38,6 @@ if __name__ == "__main__":
         choices=[0, 1],
         help="if nxpu == 0 and ngpu == 0, use cpu.")
     args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
     print_arguments(args, globals())
     # https://yaml.org/type/float.html

@ -23,16 +23,12 @@ import paddle
from paddle import distributed as dist from paddle import distributed as dist
from paddle import inference from paddle import inference
from paddle.io import DataLoader from paddle.io import DataLoader
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline
from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline
from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
from paddlespeech.s2t.training.reporter import report from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.timer import Timer from paddlespeech.s2t.training.timer import Timer
@ -136,18 +132,13 @@ class DeepSpeech2Trainer(Trainer):
config = self.config.clone() config = self.config.clone()
with UpdateConfig(config): with UpdateConfig(config):
if self.train: if self.train:
config.input_dim = self.train_loader.collate_fn.feature_size config.input_dim = self.train_loader.feat_dim
config.output_dim = self.train_loader.collate_fn.vocab_size config.output_dim = self.train_loader.vocab_size
else: else:
config.input_dim = self.test_loader.collate_fn.feature_size config.input_dim = self.test_loader.feat_dim
config.output_dim = self.test_loader.collate_fn.vocab_size config.output_dim = self.test_loader.vocab_size
if self.args.model_type == 'offline': model = DeepSpeech2Model.from_config(config)
model = DeepSpeech2Model.from_config(config)
elif self.args.model_type == 'online':
model = DeepSpeech2ModelOnline.from_config(config)
else:
raise Exception("wrong model type")
if self.parallel: if self.parallel:
model = paddle.DataParallel(model) model = paddle.DataParallel(model)
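Because the removed and added columns are fused in the hunk above, the new (right-hand) body of setup_model is hard to read; pieced together from those lines it amounts to the following (names exactly as in the diff):

    with UpdateConfig(config):
        if self.train:
            config.input_dim = self.train_loader.feat_dim      # BatchDataLoader now exposes feat_dim
            config.output_dim = self.train_loader.vocab_size   # and vocab_size directly
        else:
            config.input_dim = self.test_loader.feat_dim
            config.output_dim = self.test_loader.vocab_size

    model = DeepSpeech2Model.from_config(config)               # single ds2 model, no offline/online switch

    if self.parallel:
        model = paddle.DataParallel(model)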
@ -175,76 +166,81 @@ class DeepSpeech2Trainer(Trainer):
config = self.config.clone() config = self.config.clone()
config.defrost() config.defrost()
if self.train: if self.train:
# train # train/valid dataset, return token ids
config.manifest = config.train_manifest self.train_loader = BatchDataLoader(
train_dataset = ManifestDataset.from_config(config) json_file=config.train_manifest,
if self.parallel: train_mode=True,
batch_sampler = SortagradDistributedBatchSampler( sortagrad=config.sortagrad,
train_dataset, batch_size=config.batch_size,
batch_size=config.batch_size, maxlen_in=config.maxlen_in,
num_replicas=None, maxlen_out=config.maxlen_out,
rank=None, minibatches=config.minibatches,
shuffle=True, mini_batch_size=self.args.ngpu,
drop_last=True, batch_count=config.batch_count,
sortagrad=config.sortagrad, batch_bins=config.batch_bins,
shuffle_method=config.shuffle_method) batch_frames_in=config.batch_frames_in,
else: batch_frames_out=config.batch_frames_out,
batch_sampler = SortagradBatchSampler( batch_frames_inout=config.batch_frames_inout,
train_dataset, preprocess_conf=config.preprocess_config,
shuffle=True, n_iter_processes=config.num_workers,
batch_size=config.batch_size, subsampling_factor=1,
drop_last=True, num_encs=1,
sortagrad=config.sortagrad, dist_sampler=config.get('dist_sampler', False),
shuffle_method=config.shuffle_method) shortest_first=False)
config.keep_transcription_text = False self.valid_loader = BatchDataLoader(
collate_fn_train = SpeechCollator.from_config(config) json_file=config.dev_manifest,
self.train_loader = DataLoader( train_mode=False,
train_dataset, sortagrad=False,
batch_sampler=batch_sampler, batch_size=config.batch_size,
collate_fn=collate_fn_train, maxlen_in=float('inf'),
num_workers=config.num_workers) maxlen_out=float('inf'),
minibatches=0,
# dev mini_batch_size=self.args.ngpu,
config.manifest = config.dev_manifest batch_count='auto',
dev_dataset = ManifestDataset.from_config(config) batch_bins=0,
batch_frames_in=0,
config.augmentation_config = "" batch_frames_out=0,
config.keep_transcription_text = False batch_frames_inout=0,
collate_fn_dev = SpeechCollator.from_config(config) preprocess_conf=config.preprocess_config,
self.valid_loader = DataLoader( n_iter_processes=config.num_workers,
dev_dataset, subsampling_factor=1,
batch_size=int(config.batch_size), num_encs=1,
shuffle=False, dist_sampler=config.get('dist_sampler', False),
drop_last=False, shortest_first=False)
collate_fn=collate_fn_dev, logger.info("Setup train/valid Dataloader!")
num_workers=config.num_workers)
logger.info("Setup train/valid Dataloader!")
else: else:
# test
config.manifest = config.test_manifest
test_dataset = ManifestDataset.from_config(config)
config.augmentation_config = ""
config.keep_transcription_text = True
collate_fn_test = SpeechCollator.from_config(config)
decode_batch_size = config.get('decode', dict()).get( decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1) 'decode_batch_size', 1)
self.test_loader = DataLoader( # test dataset, return raw text
test_dataset, self.test_loader = BatchDataLoader(
json_file=config.test_manifest,
train_mode=False,
sortagrad=False,
batch_size=decode_batch_size, batch_size=decode_batch_size,
shuffle=False, maxlen_in=float('inf'),
drop_last=False, maxlen_out=float('inf'),
collate_fn=collate_fn_test, minibatches=0,
num_workers=config.num_workers) mini_batch_size=1,
logger.info("Setup test Dataloader!") batch_count='auto',
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
preprocess_conf=config.preprocess_config,
n_iter_processes=1,
subsampling_factor=1,
num_encs=1)
logger.info("Setup test/align Dataloader!")
class DeepSpeech2Tester(DeepSpeech2Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer):
def __init__(self, config, args): def __init__(self, config, args):
super().__init__(config, args) super().__init__(config, args)
self._text_featurizer = TextFeaturizer( self._text_featurizer = TextFeaturizer(
unit_type=config.unit_type, vocab=None) unit_type=config.unit_type,
vocab=config.vocab_filepath)
self.vocab_list = self._text_featurizer.vocab_list
def ordid2token(self, texts, texts_len): def ordid2token(self, texts, texts_len):
""" ord() id to chr() chr """ """ ord() id to chr() chr """
@ -252,7 +248,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
for text, n in zip(texts, texts_len): for text, n in zip(texts, texts_len):
n = n.numpy().item() n = n.numpy().item()
ids = text[:n] ids = text[:n]
trans.append(''.join([chr(i) for i in ids])) trans.append(self._text_featurizer.defeaturize(ids.numpy().tolist()))
return trans return trans
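The old path relied on transcripts being stored as raw character codes, so chr() round-tripped them; the new loader yields vocabulary ids, which the TextFeaturizer built from vocab_filepath maps back to tokens. A toy, purely illustrative comparison (the ids and vocab below are made up, not the real vocabulary):

    vocab = ['<blank>', 'h', 'e', 'l', 'o']        # hypothetical char vocabulary
    ids = [1, 2, 3, 3, 4]                          # hypothetical token ids from the new dataloader
    old_style = ''.join(chr(i) for i in ids)       # raw code points: '\x01\x02\x03\x03\x04'
    new_style = ''.join(vocab[i] for i in ids)     # roughly what defeaturize() does for char units: 'hello'
    print(repr(old_style), new_style)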
def compute_metrics(self, def compute_metrics(self,
@ -307,8 +303,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
# Initialized the decoder in model # Initialized the decoder in model
decode_cfg = self.config.decode decode_cfg = self.config.decode
vocab_list = self.test_loader.collate_fn.vocab_list vocab_list = self.vocab_list
decode_batch_size = self.test_loader.batch_size decode_batch_size = decode_cfg.decode_batch_size
self.model.decoder.init_decoder( self.model.decoder.init_decoder(
decode_batch_size, vocab_list, decode_cfg.decoding_method, decode_batch_size, vocab_list, decode_cfg.decoding_method,
decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
@ -338,17 +334,9 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
@paddle.no_grad() @paddle.no_grad()
def export(self): def export(self):
if self.args.model_type == 'offline': infer_model = DeepSpeech2InferModel.from_pretrained(
infer_model = DeepSpeech2InferModel.from_pretrained( self.test_loader, self.config, self.args.checkpoint_path)
self.test_loader, self.config, self.args.checkpoint_path)
elif self.args.model_type == 'online':
infer_model = DeepSpeech2InferModelOnline.from_pretrained(
self.test_loader, self.config, self.args.checkpoint_path)
else:
raise Exception("wrong model type")
infer_model.eval() infer_model.eval()
feat_dim = self.test_loader.collate_fn.feature_size
static_model = infer_model.export() static_model = infer_model.export()
logger.info(f"Export code: {static_model.forward.code}") logger.info(f"Export code: {static_model.forward.code}")
paddle.jit.save(static_model, self.args.export_path) paddle.jit.save(static_model, self.args.export_path)
@ -376,10 +364,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
# Initialized the decoder in model # Initialized the decoder in model
decode_cfg = self.config.decode decode_cfg = self.config.decode
vocab_list = self.test_loader.collate_fn.vocab_list vocab_list = self.vocab_list
if self.args.model_type == "online": if self.config.rnn_direction == "forward":
decode_batch_size = 1 decode_batch_size = 1
elif self.args.model_type == "offline": elif self.config.rnn_direction == "bidirect":
decode_batch_size = self.test_loader.batch_size decode_batch_size = self.test_loader.batch_size
else: else:
raise Exception("wrong model type") raise Exception("wrong model type")
@ -412,11 +400,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
self.model.decoder.del_decoder() self.model.decoder.del_decoder()
def compute_result_transcripts(self, audio, audio_len): def compute_result_transcripts(self, audio, audio_len):
if self.args.model_type == "online": if self.config.rnn_direction == "forward":
output_probs, output_lens, trans_batch = self.static_forward_online( output_probs, output_lens, trans_batch = self.static_forward_online(
audio, audio_len, decoder_chunk_size=1) audio, audio_len, decoder_chunk_size=1)
result_transcripts = [trans[-1] for trans in trans_batch] result_transcripts = [trans[-1] for trans in trans_batch]
elif self.args.model_type == "offline": elif self.config.rnn_direction == "bidirect":
output_probs, output_lens = self.static_forward_offline(audio, output_probs, output_lens = self.static_forward_offline(audio,
audio_len) audio_len)
batch_size = output_probs.shape[0] batch_size = output_probs.shape[0]

@ -11,161 +11,23 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle import nn import paddle
from paddle.nn import functional as F
from paddlespeech.s2t.modules.activation import brelu from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['ConvStack', "conv_output_size"] class Conv2dSubsampling4Pure(Conv2dSubsampling4):
def __init__(self, idim: int, odim: int, dropout_rate: float):
super().__init__(idim, odim, dropout_rate, None)
self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
self.receptive_field_length = 2 * (
3 - 1) + 3 # stride_1 * (kernel_size_2 - 1) + kernel_size_1
def forward(self, x: paddle.Tensor,
def conv_output_size(I, F, P, S): x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters x = x.unsqueeze(1) # (b, c=1, t, f)
# Output size after Conv:
# By noting I the length of the input volume size,
# F the length of the filter,
# P the amount of zero padding,
# S the stride,
# then the output size O of the feature map along that dimension is given by:
# O = (I - F + Pstart + Pend) // S + 1
# When Pstart == Pend == P, we can replace Pstart + Pend by 2P.
# When Pstart == Pend == 0
# O = (I - F - S) // S
# https://iq.opengenus.org/output-size-of-convolution/
# Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
# Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1
return (I - F + 2 * P - S) // S
# receptive field calculator
# https://fomoro.com/research/article/receptive-field-calculator
# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
# https://distill.pub/2019/computing-receptive-fields/
# Rl-1 = Sl * Rl + (Kl - Sl)
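A small worked check of the output-size formula quoted in the comments above (O = (I - F + 2P) // S + 1, the standard closed form; the removed helper's own return value differs slightly from it), using the kernel/stride/padding of the ConvStack defined below:

    def out_size(I, F, P, S):
        # standard convolution output size with symmetric padding
        return (I - F + 2 * P) // S + 1

    feat_size = 161                          # linear-spectrogram bins, as in the YAML config
    h1 = out_size(feat_size, 41, 20, 2)      # conv_in: kernel (41, 11), stride (2, 3), pad (20, 5) -> 81
    h2 = out_size(h1, 21, 10, 2)             # stacked conv: kernel (21, 11), stride (2, 1), pad (10, 5) -> 41
    print(h1, h2, 32 * h2)                   # 81 41 1312 == ConvStack.output_height for 2 conv layers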
class ConvBn(nn.Layer):
"""Convolution layer with batch normalization.
:param kernel_size: The x dimension of a filter kernel. Or input a tuple for
two image dimension.
:type kernel_size: int|tuple|list
:param num_channels_in: Number of input channels.
:type num_channels_in: int
:param num_channels_out: Number of output channels.
:type num_channels_out: int
:param stride: The x dimension of the stride. Or input a tuple for two
image dimension.
:type stride: int|tuple|list
:param padding: The x dimension of the padding. Or input a tuple for two
image dimension.
:type padding: int|tuple|list
:param act: Activation type, relu|brelu
:type act: string
:return: Batch norm layer after convolution layer.
:rtype: Variable
"""
def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
padding, act):
super().__init__()
assert len(kernel_size) == 2
assert len(stride) == 2
assert len(padding) == 2
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.conv = nn.Conv2D(
num_channels_in,
num_channels_out,
kernel_size=kernel_size,
stride=stride,
padding=padding,
weight_attr=None,
bias_attr=False,
data_format='NCHW')
self.bn = nn.BatchNorm2D(
num_channels_out,
weight_attr=None,
bias_attr=None,
data_format='NCHW')
self.act = F.relu if act == 'relu' else brelu
def forward(self, x, x_len):
"""
x(Tensor): audio, shape [B, C, D, T]
"""
x = self.conv(x) x = self.conv(x)
x = self.bn(x) #b, c, t, f = paddle.shape(x) #not work under jit
x = self.act(x) x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
x_len = ((x_len - 1) // 2 - 1) // 2
x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
) // self.stride[1] + 1
# reset padding part to 0
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T]
# TODO(Hui Zhang): not support bool multiply
# masks = masks.type_as(x)
masks = masks.astype(x.dtype)
x = x.multiply(masks)
return x, x_len
class ConvStack(nn.Layer):
"""Convolution group with stacked convolution layers.
:param feat_size: audio feature dim.
:type feat_size: int
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
"""
def __init__(self, feat_size, num_stacks):
super().__init__()
self.feat_size = feat_size # D
self.num_stacks = num_stacks
self.conv_in = ConvBn(
num_channels_in=1,
num_channels_out=32,
kernel_size=(41, 11), #[D, T]
stride=(2, 3),
padding=(20, 5),
act='brelu')
out_channel = 32
convs = [
ConvBn(
num_channels_in=32,
num_channels_out=out_channel,
kernel_size=(21, 11),
stride=(2, 1),
padding=(10, 5),
act='brelu') for i in range(num_stacks - 1)
]
self.conv_stack = nn.LayerList(convs)
# conv output feat_dim
output_height = (feat_size - 1) // 2 + 1
for i in range(self.num_stacks - 1):
output_height = (output_height - 1) // 2 + 1
self.output_height = out_channel * output_height
def forward(self, x, x_len):
"""
x: shape [B, C, D, T]
x_len : shape [B]
"""
x, x_len = self.conv_in(x, x_len)
for i, conv in enumerate(self.conv_stack):
x, x_len = conv(x, x_len)
return x, x_len return x, x_len
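The shapes produced by Conv2dSubsampling4Pure can be checked with plain arithmetic; the numbers below assume feat_dim 161 and odim 32, which is how CRNNEncoder constructs it further down:

    idim, odim = 161, 32
    freq_after = ((idim - 1) // 2 - 1) // 2        # two stride-2 convs over the frequency axis -> 39
    output_dim = freq_after * odim                 # 39 * 32 = 1248 features fed to the first RNN layer
    t_in = 100
    t_out = ((t_in - 1) // 2 - 1) // 2             # the same ~4x reduction applies to x_len -> 24
    receptive_field = 2 * (3 - 1) + 3              # stride_1 * (kernel_size_2 - 1) + kernel_size_1 = 7
    print(output_dim, t_out, receptive_field)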

@ -13,15 +13,14 @@
# limitations under the License. # limitations under the License.
"""Deepspeech2 ASR Model""" """Deepspeech2 ASR Model"""
import paddle import paddle
import paddle.nn.functional as F
from paddle import nn from paddle import nn
from paddlespeech.s2t.models.ds2.conv import ConvStack from paddlespeech.s2t.models.ds2.conv import Conv2dSubsampling4Pure
from paddlespeech.s2t.models.ds2.rnn import RNNStack
from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog() logger = Log(__name__).getlog()
__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] __all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
@ -32,72 +31,197 @@ class CRNNEncoder(nn.Layer):
feat_size, feat_size,
dict_size, dict_size,
num_conv_layers=2, num_conv_layers=2,
num_rnn_layers=3, num_rnn_layers=4,
rnn_size=1024, rnn_size=1024,
use_gru=False, rnn_direction='forward',
share_rnn_weights=True): num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
super().__init__() super().__init__()
self.rnn_size = rnn_size self.rnn_size = rnn_size
self.feat_size = feat_size # 161 for linear self.feat_size = feat_size # 161 for linear
self.dict_size = dict_size self.dict_size = dict_size
self.num_rnn_layers = num_rnn_layers
self.conv = ConvStack(feat_size, num_conv_layers) self.num_fc_layers = num_fc_layers
self.rnn_direction = rnn_direction
i_size = self.conv.output_height # H after conv stack self.fc_layers_size_list = fc_layers_size_list
self.rnn = RNNStack( self.use_gru = use_gru
i_size=i_size, self.conv = Conv2dSubsampling4Pure(feat_size, 32, dropout_rate=0.0)
h_size=rnn_size,
num_stacks=num_rnn_layers, self.output_dim = self.conv.output_dim
use_gru=use_gru,
share_rnn_weights=share_rnn_weights) i_size = self.conv.output_dim
self.rnn = nn.LayerList()
self.layernorm_list = nn.LayerList()
self.fc_layers_list = nn.LayerList()
if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
layernorm_size = 2 * rnn_size
elif rnn_direction == 'forward':
layernorm_size = rnn_size
else:
raise Exception("Wrong rnn direction")
for i in range(0, num_rnn_layers):
if i == 0:
rnn_input_size = i_size
else:
rnn_input_size = layernorm_size
if use_gru is True:
self.rnn.append(
nn.GRU(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
else:
self.rnn.append(
nn.LSTM(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
self.layernorm_list.append(nn.LayerNorm(layernorm_size))
self.output_dim = layernorm_size
fc_input_size = layernorm_size
for i in range(self.num_fc_layers):
self.fc_layers_list.append(
nn.Linear(fc_input_size, fc_layers_size_list[i]))
fc_input_size = fc_layers_size_list[i]
self.output_dim = fc_layers_size_list[i]
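A pure-Python sketch of how CRNNEncoder.output_dim (the width handed to the CTC decoder) falls out of these knobs; the two calls mirror the offline-style YAML above (bidirect, no FC layers) and the streaming-style defaults (forward, fc_layers_size_list=[512, 256]):

    def encoder_output_dim(rnn_size, rnn_direction, fc_layers_size_list):
        # bidirectional RNNs concatenate both directions before the LayerNorm
        layernorm_size = 2 * rnn_size if rnn_direction in ('bidirect', 'bidirectional') else rnn_size
        out = layernorm_size
        for fc_size in fc_layers_size_list:   # num_fc_layers: 0 in the YAML above, so this loop may not run
            out = fc_size
        return out

    print(encoder_output_dim(1024, 'bidirect', []))          # 2048
    print(encoder_output_dim(1024, 'forward', [512, 256]))   # 256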
@property @property
def output_size(self): def output_size(self):
return self.rnn_size * 2 return self.output_dim
def forward(self, audio, audio_len): def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
"""Compute Encoder outputs """Compute Encoder outputs
Args: Args:
audio (Tensor): [B, Tmax, D] x (Tensor): [B, T, D]
text (Tensor): [B, Umax] x_lens (Tensor): [B]
audio_len (Tensor): [B] init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
text_len (Tensor): [B] init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
Returns: Return:
x (Tensor): encoder outputs, [B, T, D] x (Tensor): encoder outputs, [B, T, D]
x_lens (Tensor): encoder length, [B] x_lens (Tensor): encoder length, [B]
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
""" """
# [B, T, D] -> [B, D, T] if init_state_h_box is not None:
audio = audio.transpose([0, 2, 1]) init_state_list = None
# [B, D, T] -> [B, C=1, D, T]
x = audio.unsqueeze(1) if self.use_gru is True:
x_lens = audio_len init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_list = init_state_h_list
else:
init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_c_list = paddle.split(
init_state_c_box, self.num_rnn_layers, axis=0)
init_state_list = [(init_state_h_list[i], init_state_c_list[i])
for i in range(self.num_rnn_layers)]
else:
init_state_list = [None] * self.num_rnn_layers
# convolution group
x, x_lens = self.conv(x, x_lens) x, x_lens = self.conv(x, x_lens)
final_chunk_state_list = []
for i in range(0, self.num_rnn_layers):
x, final_state = self.rnn[i](x, init_state_list[i],
x_lens) #[B, T, D]
final_chunk_state_list.append(final_state)
x = self.layernorm_list[i](x)
for i in range(self.num_fc_layers):
x = self.fc_layers_list[i](x)
x = F.relu(x)
if self.use_gru is True:
final_chunk_state_h_box = paddle.concat(
final_chunk_state_list, axis=0)
final_chunk_state_c_box = init_state_c_box
else:
final_chunk_state_h_list = [
final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
]
final_chunk_state_c_list = [
final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
]
final_chunk_state_h_box = paddle.concat(
final_chunk_state_h_list, axis=0)
final_chunk_state_c_box = paddle.concat(
final_chunk_state_c_list, axis=0)
return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box
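The h/c "boxes" are just the per-layer states stacked along axis 0; a minimal standalone paddle sketch of the split/concat bookkeeping used above (shapes are illustrative):

    import paddle

    num_rnn_layers, batch_size, hidden_size = 5, 2, 1024
    h_box = paddle.zeros([num_rnn_layers, batch_size, hidden_size])   # forward-direction states
    h_list = paddle.split(h_box, num_rnn_layers, axis=0)              # one [1, B, H] state per RNN layer
    print(len(h_list), h_list[0].shape)                               # 5 [1, 2, 1024]
    # concatenating the per-layer final states rebuilds the box carried into the next chunk
    print(paddle.concat(h_list, axis=0).shape)                        # [5, 2, 1024]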
def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
"""Compute Encoder outputs
# convert data from convolution feature map to sequence of vectors Args:
#B, C, D, T = paddle.shape(x) # not work under jit x (Tensor): [B, T, D]
x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] x_lens (Tensor): [B]
#x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit decoder_chunk_size: The chunk size of decoder
x = x.reshape([0, 0, -1]) #[B, T, C*D] Returns:
eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
# remove padding part eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
x, x_lens = self.rnn(x, x_lens) #[B, T, D] final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
return x, x_lens final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
subsampling_rate = self.conv.subsampling_rate
receptive_field_length = self.conv.receptive_field_length
chunk_size = (decoder_chunk_size - 1
) * subsampling_rate + receptive_field_length
chunk_stride = subsampling_rate * decoder_chunk_size
max_len = x.shape[1]
assert (chunk_size <= max_len)
eouts_chunk_list = []
eouts_chunk_lens_list = []
if (max_len - chunk_size) % chunk_stride != 0:
padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
else:
padding_len = 0
padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
padded_x = paddle.concat([x, padding], axis=1)
num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
num_chunk = int(num_chunk)
chunk_state_h_box = None
chunk_state_c_box = None
final_state_h_box = None
final_state_c_box = None
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
x_chunk = padded_x[:, start:end, :]
x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
paddle.zeros_like(x_lens),
x_lens - i * chunk_stride)
x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
x_len_left, x_chunk_len_tmp)
eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)
eouts_chunk_list.append(eouts_chunk)
eouts_chunk_lens_list.append(eouts_chunk_lens)
final_state_h_box = chunk_state_h_box
final_state_c_box = chunk_state_c_box
return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
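The chunking arithmetic with concrete numbers, assuming the usual 4x subsampling rate of Conv2dSubsampling4 and the receptive field of 7 computed in conv.py, on an illustrative 100-frame utterance:

    subsampling_rate = 4
    receptive_field_length = 7
    decoder_chunk_size = 8
    chunk_size = (decoder_chunk_size - 1) * subsampling_rate + receptive_field_length   # 35 input frames
    chunk_stride = subsampling_rate * decoder_chunk_size                                # hop of 32 frames

    max_len = 100
    if (max_len - chunk_size) % chunk_stride != 0:
        padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride              # 31
    else:
        padding_len = 0
    num_chunk = (max_len + padding_len - chunk_size) // chunk_stride + 1                # 4 chunks
    print(chunk_size, chunk_stride, padding_len, num_chunk)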
class DeepSpeech2Model(nn.Layer): class DeepSpeech2Model(nn.Layer):
"""The DeepSpeech2 network structure. """The DeepSpeech2 network structure.
:param audio_data: Audio spectrogram data layer. :param audio: Audio spectrogram data layer.
:type audio_data: Variable :type audio: Variable
:param text_data: Transcription text data layer. :param text: Transcription text data layer.
:type text_data: Variable :type text: Variable
:param audio_len: Valid sequence length data layer. :param audio_len: Valid sequence length data layer.
:type audio_len: Variable :type audio_len: Variable
:param masks: Masks data layer to reset padding. :param feat_size: feature size for audio.
:type masks: Variable :type feat_size: int
:param dict_size: Dictionary size for tokenized transcription. :param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int :type dict_size: int
:param num_conv_layers: Number of stacking convolution layers. :param num_conv_layers: Number of stacking convolution layers.
@ -106,37 +230,41 @@ class DeepSpeech2Model(nn.Layer):
:type num_rnn_layers: int :type num_rnn_layers: int
:param rnn_size: RNN layer size (dimension of RNN cells). :param rnn_size: RNN layer size (dimension of RNN cells).
:type rnn_size: int :type rnn_size: int
:param num_fc_layers: Number of stacking FC layers.
:type num_fc_layers: int
:param fc_layers_size_list: The list of FC layer sizes.
:type fc_layers_size_list: [int,]
:param use_gru: Use gru if set True. Use simple rnn if set False. :param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool :type use_gru: bool
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward direction RNNs.
It is only available when use_gru=False.
:type share_weights: bool
:return: A tuple of an output unnormalized log probability layer ( :return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer. before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput :rtype: tuple of LayerOutput
""" """
def __init__(self, def __init__(
feat_size, self,
dict_size, feat_size,
num_conv_layers=2, dict_size,
num_rnn_layers=3, num_conv_layers=2,
rnn_size=1024, num_rnn_layers=4,
use_gru=False, rnn_size=1024,
share_rnn_weights=True, rnn_direction='forward',
blank_id=0, num_fc_layers=2,
ctc_grad_norm_type=None): fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0,
ctc_grad_norm_type=None, ):
super().__init__() super().__init__()
self.encoder = CRNNEncoder( self.encoder = CRNNEncoder(
feat_size=feat_size, feat_size=feat_size,
dict_size=dict_size, dict_size=dict_size,
num_conv_layers=num_conv_layers, num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers, num_rnn_layers=num_rnn_layers,
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
rnn_size=rnn_size, rnn_size=rnn_size,
use_gru=use_gru, use_gru=use_gru)
share_rnn_weights=share_rnn_weights)
assert (self.encoder.output_size == rnn_size * 2)
self.decoder = CTCDecoder( self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab odim=dict_size, # <blank> is in vocab
@ -151,7 +279,7 @@ class DeepSpeech2Model(nn.Layer):
"""Compute Model loss """Compute Model loss
Args: Args:
audio (Tensors): [B, T, D] audio (Tensor): [B, T, D]
audio_len (Tensor): [B] audio_len (Tensor): [B]
text (Tensor): [B, U] text (Tensor): [B, U]
text_len (Tensor): [B] text_len (Tensor): [B]
@ -159,22 +287,22 @@ class DeepSpeech2Model(nn.Layer):
Returns: Returns:
loss (Tensor): [1] loss (Tensor): [1]
""" """
eouts, eouts_len = self.encoder(audio, audio_len) eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
loss = self.decoder(eouts, eouts_len, text, text_len) loss = self.decoder(eouts, eouts_len, text, text_len)
return loss return loss
@paddle.no_grad() @paddle.no_grad()
def decode(self, audio, audio_len): def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8 # decoders only accept string encoded in utf-8
# Make sure the decoder has been initialized # Make sure the decoder has been initialized
eouts, eouts_len = self.encoder(audio, audio_len) eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
probs = self.decoder.softmax(eouts) probs = self.decoder.softmax(eouts)
batch_size = probs.shape[0] batch_size = probs.shape[0]
self.decoder.reset_decoder(batch_size=batch_size) self.decoder.reset_decoder(batch_size=batch_size)
self.decoder.next(probs, eouts_len) self.decoder.next(probs, eouts_len)
trans_best, trans_beam = self.decoder.decode() trans_best, trans_beam = self.decoder.decode()
return trans_best return trans_best
@classmethod @classmethod
@ -196,13 +324,15 @@ class DeepSpeech2Model(nn.Layer):
The model built from pretrained result. The model built from pretrained result.
""" """
model = cls( model = cls(
feat_size=dataloader.collate_fn.feature_size, feat_size=dataloader.feat_dim,
dict_size=dataloader.collate_fn.vocab_size, dict_size=dataloader.vocab_size,
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru, use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id, blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
infos = Checkpoint().load_parameters( infos = Checkpoint().load_parameters(
@ -229,8 +359,10 @@ class DeepSpeech2Model(nn.Layer):
num_conv_layers=config.num_conv_layers, num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers, num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size, rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru, use_gru=config.use_gru,
share_rnn_weights=config.share_rnn_weights,
blank_id=config.blank_id, blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
return model return model
@ -240,28 +372,46 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
def forward(self, audio, audio_len): def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box=None,
"""export model function chunk_state_c_box=None):
if self.encoder.rnn_direction == "forward":
Args: eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
audio (Tensor): [B, T, D] audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
audio_len (Tensor): [B] probs_chunk = self.decoder.softmax(eouts_chunk)
return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
Returns: elif self.encoder.rnn_direction == "bidirect":
probs: probs after softmax eouts, eouts_len, _, _ = self.encoder(audio_chunk, audio_chunk_lens)
""" probs = self.decoder.softmax(eouts)
eouts, eouts_len = self.encoder(audio, audio_len) return probs, eouts_len
probs = self.decoder.softmax(eouts) else:
return probs, eouts_len raise Exception("wrong model type")
def export(self): def export(self):
static_model = paddle.jit.to_static( if self.encoder.rnn_direction == "forward":
self, static_model = paddle.jit.to_static(
input_spec=[ self,
paddle.static.InputSpec( input_spec=[
shape=[None, None, self.encoder.feat_size], paddle.static.InputSpec(
dtype='float32'), # audio, [B,T,D] shape=[None, None,
paddle.static.InputSpec(shape=[None], self.encoder.feat_size], #[B, chunk_size, feat_dim]
dtype='int64'), # audio_length, [B] dtype='float32'),
]) paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32'),
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32')
])
elif self.encoder.rnn_direction == "bidirect":
static_model = paddle.jit.to_static(
self,
input_spec=[
paddle.static.InputSpec(
shape=[None, None, self.encoder.feat_size],
dtype='float32'), # audio, [B,T,D]
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
])
else:
raise Exception("wrong model type")
return static_model return static_model
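The export mechanics are the same for both branches: paddle.jit.to_static traces forward against the InputSpec list and paddle.jit.save writes the static graph. A self-contained toy stand-in (the Toy layer and the output path are inventions for illustration, not part of PaddleSpeech):

    import paddle
    from paddle.static import InputSpec

    class Toy(paddle.nn.Layer):
        def __init__(self):
            super().__init__()
            self.fc = paddle.nn.Linear(161, 8)          # 161 mimics feat_size; 8 is an arbitrary vocab size

        def forward(self, audio, audio_len):
            return paddle.nn.functional.softmax(self.fc(audio)), audio_len

    static_toy = paddle.jit.to_static(
        Toy(),
        input_spec=[
            InputSpec(shape=[None, None, 161], dtype='float32'),   # audio, [B, T, D]
            InputSpec(shape=[None], dtype='int64'),                # audio_len, [B]
        ])
    paddle.jit.save(static_toy, '/tmp/toy_ds2_export')             # same call pattern as export() above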

@ -1,315 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
from paddle import nn
from paddle.nn import functional as F
from paddle.nn import initializer as I
from paddlespeech.s2t.modules.activation import brelu
from paddlespeech.s2t.modules.mask import make_non_pad_mask
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['RNNStack']
class RNNCell(nn.RNNCellBase):
r"""
Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
computes the outputs and updates states.
The formula used is as follows:
.. math::
h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
y_{t} & = h_{t}
where :math:`act` is for :attr:`activation`.
"""
def __init__(self,
hidden_size: int,
activation="tanh",
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
bias_hh_attr=None,
name=None):
super().__init__()
std = 1.0 / math.sqrt(hidden_size)
self.weight_hh = self.create_parameter(
(hidden_size, hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std))
self.bias_ih = None
self.bias_hh = self.create_parameter(
(hidden_size, ),
bias_hh_attr,
is_bias=True,
default_initializer=I.Uniform(-std, std))
self.hidden_size = hidden_size
if activation not in ["tanh", "relu", "brelu"]:
raise ValueError(
"activation for SimpleRNNCell should be tanh or relu, "
"but get {}".format(activation))
self.activation = activation
self._activation_fn = paddle.tanh \
if activation == "tanh" \
else F.relu
if activation == 'brelu':
self._activation_fn = brelu
def forward(self, inputs, states=None):
if states is None:
states = self.get_initial_states(inputs, self.state_shape)
pre_h = states
i2h = inputs
if self.bias_ih is not None:
i2h += self.bias_ih
h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
if self.bias_hh is not None:
h2h += self.bias_hh
h = self._activation_fn(i2h + h2h)
return h, h
@property
def state_shape(self):
return (self.hidden_size, )
class GRUCell(nn.RNNCellBase):
r"""
Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
it computes the outputs and updates states.
The formula for GRU used is as follows:
.. math::
r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
\widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
y_{t} & = h_{t}
where :math:`\sigma` is the sigmoid function, and * is the elementwise
multiplication operator.
"""
def __init__(self,
input_size: int,
hidden_size: int,
weight_ih_attr=None,
weight_hh_attr=None,
bias_ih_attr=None,
bias_hh_attr=None,
name=None):
super().__init__()
std = 1.0 / math.sqrt(hidden_size)
self.weight_hh = self.create_parameter(
(3 * hidden_size, hidden_size),
weight_hh_attr,
default_initializer=I.Uniform(-std, std))
self.bias_ih = None
self.bias_hh = self.create_parameter(
(3 * hidden_size, ),
bias_hh_attr,
is_bias=True,
default_initializer=I.Uniform(-std, std))
self.hidden_size = hidden_size
self.input_size = input_size
self._gate_activation = F.sigmoid
self._activation = paddle.tanh
def forward(self, inputs, states=None):
if states is None:
states = self.get_initial_states(inputs, self.state_shape)
pre_hidden = states
x_gates = inputs
if self.bias_ih is not None:
x_gates = x_gates + self.bias_ih
h_gates = paddle.matmul(pre_hidden, self.weight_hh, transpose_y=True)
if self.bias_hh is not None:
h_gates = h_gates + self.bias_hh
x_r, x_z, x_c = paddle.split(x_gates, num_or_sections=3, axis=1)
h_r, h_z, h_c = paddle.split(h_gates, num_or_sections=3, axis=1)
r = self._gate_activation(x_r + h_r)
z = self._gate_activation(x_z + h_z)
c = self._activation(x_c + r * h_c) # apply reset gate after mm
h = (pre_hidden - c) * z + c
# https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
return h, h
@property
def state_shape(self):
r"""
The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
size would be automatically inserted into shape). The shape corresponds
to the shape of :math:`h_{t-1}`.
"""
return (self.hidden_size, )
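The update h = (pre_hidden - c) * z + c in GRUCell.forward is just an algebraic rewrite of the docstring formula h_t = z_t * h_{t-1} + (1 - z_t) * h~_t; a quick scalar sanity check:

    pre_hidden, c, z = 0.6, -0.2, 0.3
    assert abs(((pre_hidden - c) * z + c) - (z * pre_hidden + (1 - z) * c)) < 1e-12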
class BiRNNWithBN(nn.Layer):
"""Bidirectonal simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param size: Dimension of RNN cells.
:type size: int
:param share_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
:type share_weights: bool
:return: Bidirectional simple rnn layer.
:rtype: Variable
"""
def __init__(self, i_size: int, h_size: int, share_weights: bool):
super().__init__()
self.share_weights = share_weights
if self.share_weights:
#input-hidden weights shared between bi-directional rnn.
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
# batch norm is only performed on input-state projection
self.fw_bn = nn.BatchNorm1D(
h_size, bias_attr=None, data_format='NLC')
self.bw_fc = self.fw_fc
self.bw_bn = self.fw_bn
else:
self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False)
self.fw_bn = nn.BatchNorm1D(
h_size, bias_attr=None, data_format='NLC')
self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False)
self.bw_bn = nn.BatchNorm1D(
h_size, bias_attr=None, data_format='NLC')
self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu')
self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu')
self.fw_rnn = nn.RNN(
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
self.bw_rnn = nn.RNN(
self.bw_cell, is_reverse=True, time_major=False) #[B, T, D]
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
x = paddle.concat([fw_x, bw_x], axis=-1)
return x, x_len
class BiGRUWithBN(nn.Layer):
"""Bidirectonal gru layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
:param name: Name of the layer.
:type name: string
:param input: Input layer.
:type input: Variable
:param size: Dimension of GRU cells.
:type size: int
:param act: Activation type.
:type act: string
:return: Bidirectional GRU layer.
:rtype: Variable
"""
def __init__(self, i_size: int, h_size: int):
super().__init__()
hidden_size = h_size * 3
self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
self.fw_bn = nn.BatchNorm1D(
hidden_size, bias_attr=None, data_format='NLC')
self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False)
self.bw_bn = nn.BatchNorm1D(
hidden_size, bias_attr=None, data_format='NLC')
self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size)
self.fw_rnn = nn.RNN(
self.fw_cell, is_reverse=False, time_major=False) #[B, T, D]
self.bw_rnn = nn.RNN(
self.bw_cell, is_reverse=True, time_major=False) #[B, T, D]
def forward(self, x, x_len):
# x, shape [B, T, D]
fw_x = self.fw_bn(self.fw_fc(x))
bw_x = self.bw_bn(self.bw_fc(x))
fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len)
bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len)
x = paddle.concat([fw_x, bw_x], axis=-1)
return x, x_len
class RNNStack(nn.Layer):
"""RNN group with stacked bidirectional simple RNN or GRU layers.
:param input: Input layer.
:type input: Variable
:param size: Dimension of RNN cells in each layer.
:type size: int
:param num_stacks: Number of stacked rnn layers.
:type num_stacks: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward directional RNNs.
It is only available when use_gru=False.
:type share_weights: bool
:return: Output layer of the RNN group.
:rtype: Variable
"""
def __init__(self,
i_size: int,
h_size: int,
num_stacks: int,
use_gru: bool,
share_rnn_weights: bool):
super().__init__()
rnn_stacks = []
for i in range(num_stacks):
if use_gru:
# default: GRU using tanh
rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
else:
rnn_stacks.append(
BiRNNWithBN(
i_size=i_size,
h_size=h_size,
share_weights=share_rnn_weights))
i_size = h_size * 2
self.rnn_stacks = nn.LayerList(rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
x: shape [B, T, D]
x_len: shape [B]
"""
for i, rnn in enumerate(self.rnn_stacks):
x, x_len = rnn(x, x_len)
masks = make_non_pad_mask(x_len) #[B, T]
masks = masks.unsqueeze(-1) # [B, T, 1]
# TODO(Hui Zhang): not support bool multiply
masks = masks.astype(x.dtype)
x = x.multiply(masks)
return x, x_len

@ -1,31 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .deepspeech2 import DeepSpeech2InferModelOnline
from .deepspeech2 import DeepSpeech2ModelOnline
from paddlespeech.s2t.utils import dynamic_pip_install
import sys
try:
import paddlespeech_ctcdecoders
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
if sys.platform != "win32":
dynamic_pip_install.install(package_name)
except Exception:
raise RuntimeError(
"Can not install package paddlespeech_ctcdecoders on your system. \
The DeepSpeech2 model is not supported for your system")
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']

@ -1,33 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
class Conv2dSubsampling4Online(Conv2dSubsampling4):
def __init__(self, idim: int, odim: int, dropout_rate: float):
super().__init__(idim, odim, dropout_rate, None)
self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
self.receptive_field_length = 2 * (
3 - 1) + 3 # stride_1 * (kernel_size_2 - 1) + kernel_size_1
def forward(self, x: paddle.Tensor,
x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
x = x.unsqueeze(1) # (b, c=1, t, f)
x = self.conv(x)
#b, c, t, f = paddle.shape(x) #not work under jit
x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
x_len = ((x_len - 1) // 2 - 1) // 2
return x, x_len

@ -1,397 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Online Model"""
import paddle
import paddle.nn.functional as F
from paddle import nn
from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
class CRNNEncoder(nn.Layer):
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False):
super().__init__()
self.rnn_size = rnn_size
self.feat_size = feat_size # 161 for linear
self.dict_size = dict_size
self.num_rnn_layers = num_rnn_layers
self.num_fc_layers = num_fc_layers
self.rnn_direction = rnn_direction
self.fc_layers_size_list = fc_layers_size_list
self.use_gru = use_gru
self.conv = Conv2dSubsampling4Online(feat_size, 32, dropout_rate=0.0)
self.output_dim = self.conv.output_dim
i_size = self.conv.output_dim
self.rnn = nn.LayerList()
self.layernorm_list = nn.LayerList()
self.fc_layers_list = nn.LayerList()
if rnn_direction == 'bidirect' or rnn_direction == 'bidirectional':
layernorm_size = 2 * rnn_size
elif rnn_direction == 'forward':
layernorm_size = rnn_size
else:
raise Exception("Wrong rnn direction")
for i in range(0, num_rnn_layers):
if i == 0:
rnn_input_size = i_size
else:
rnn_input_size = layernorm_size
if use_gru is True:
self.rnn.append(
nn.GRU(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
else:
self.rnn.append(
nn.LSTM(
input_size=rnn_input_size,
hidden_size=rnn_size,
num_layers=1,
direction=rnn_direction))
self.layernorm_list.append(nn.LayerNorm(layernorm_size))
self.output_dim = layernorm_size
fc_input_size = layernorm_size
for i in range(self.num_fc_layers):
self.fc_layers_list.append(
nn.Linear(fc_input_size, fc_layers_size_list[i]))
fc_input_size = fc_layers_size_list[i]
self.output_dim = fc_layers_size_list[i]
@property
def output_size(self):
return self.output_dim
def forward(self, x, x_lens, init_state_h_box=None, init_state_c_box=None):
"""Compute Encoder outputs
Args:
x (Tensor): [B, T, D]
x_lens (Tensor): [B]
init_state_h_box(Tensor): init_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
init_state_c_box(Tensor): init_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
Return:
x (Tensor): encoder outputs, [B, T, D]
x_lens (Tensor): encoder length, [B]
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
if init_state_h_box is not None:
init_state_list = None
if self.use_gru is True:
init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_list = init_state_h_list
else:
init_state_h_list = paddle.split(
init_state_h_box, self.num_rnn_layers, axis=0)
init_state_c_list = paddle.split(
init_state_c_box, self.num_rnn_layers, axis=0)
init_state_list = [(init_state_h_list[i], init_state_c_list[i])
for i in range(self.num_rnn_layers)]
else:
init_state_list = [None] * self.num_rnn_layers
x, x_lens = self.conv(x, x_lens)
final_chunk_state_list = []
for i in range(0, self.num_rnn_layers):
x, final_state = self.rnn[i](x, init_state_list[i],
x_lens) #[B, T, D]
final_chunk_state_list.append(final_state)
x = self.layernorm_list[i](x)
for i in range(self.num_fc_layers):
x = self.fc_layers_list[i](x)
x = F.relu(x)
if self.use_gru is True:
final_chunk_state_h_box = paddle.concat(
final_chunk_state_list, axis=0)
final_chunk_state_c_box = init_state_c_box
else:
final_chunk_state_h_list = [
final_chunk_state_list[i][0] for i in range(self.num_rnn_layers)
]
final_chunk_state_c_list = [
final_chunk_state_list[i][1] for i in range(self.num_rnn_layers)
]
final_chunk_state_h_box = paddle.concat(
final_chunk_state_h_list, axis=0)
final_chunk_state_c_box = paddle.concat(
final_chunk_state_c_list, axis=0)
return x, x_lens, final_chunk_state_h_box, final_chunk_state_c_box
def forward_chunk_by_chunk(self, x, x_lens, decoder_chunk_size=8):
"""Compute Encoder outputs
Args:
x (Tensor): [B, T, D]
x_lens (Tensor): [B]
decoder_chunk_size: The chunk size of decoder
Returns:
eouts_list (List of Tensor): The list of encoder outputs in chunk_size: [B, chunk_size, D] * num_chunks
eouts_lens_list (List of Tensor): The list of encoder length in chunk_size: [B] * num_chunks
final_state_h_box(Tensor): final_states h for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
final_state_c_box(Tensor): final_states c for RNN layers: [num_rnn_layers * num_directions, batch_size, hidden_size]
"""
subsampling_rate = self.conv.subsampling_rate
receptive_field_length = self.conv.receptive_field_length
chunk_size = (decoder_chunk_size - 1
) * subsampling_rate + receptive_field_length
chunk_stride = subsampling_rate * decoder_chunk_size
max_len = x.shape[1]
assert (chunk_size <= max_len)
eouts_chunk_list = []
eouts_chunk_lens_list = []
if (max_len - chunk_size) % chunk_stride != 0:
padding_len = chunk_stride - (max_len - chunk_size) % chunk_stride
else:
padding_len = 0
padding = paddle.zeros((x.shape[0], padding_len, x.shape[2]))
padded_x = paddle.concat([x, padding], axis=1)
num_chunk = (max_len + padding_len - chunk_size) / chunk_stride + 1
num_chunk = int(num_chunk)
chunk_state_h_box = None
chunk_state_c_box = None
final_state_h_box = None
final_state_c_box = None
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
x_chunk = padded_x[:, start:end, :]
x_len_left = paddle.where(x_lens - i * chunk_stride < 0,
paddle.zeros_like(x_lens),
x_lens - i * chunk_stride)
x_chunk_len_tmp = paddle.ones_like(x_lens) * chunk_size
x_chunk_lens = paddle.where(x_len_left < x_chunk_len_tmp,
x_len_left, x_chunk_len_tmp)
eouts_chunk, eouts_chunk_lens, chunk_state_h_box, chunk_state_c_box = self.forward(
x_chunk, x_chunk_lens, chunk_state_h_box, chunk_state_c_box)
eouts_chunk_list.append(eouts_chunk)
eouts_chunk_lens_list.append(eouts_chunk_lens)
final_state_h_box = chunk_state_h_box
final_state_c_box = chunk_state_c_box
return eouts_chunk_list, eouts_chunk_lens_list, final_state_h_box, final_state_c_box
class DeepSpeech2ModelOnline(nn.Layer):
"""The DeepSpeech2 network structure for online.
:param audio: Audio spectrogram data layer.
:type audio: Variable
:param text: Transcription text data layer.
:type text: Variable
:param audio_len: Valid sequence length data layer.
:type audio_len: Variable
:param feat_size: feature size for audio.
:type feat_size: int
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (dimension of RNN cells).
:type rnn_size: int
:param num_fc_layers: Number of stacking FC layers.
:type num_fc_layers: int
:param fc_layers_size_list: The list of FC layer sizes.
:type fc_layers_size_list: [int,]
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
"""
def __init__(
self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=4,
rnn_size=1024,
rnn_direction='forward',
num_fc_layers=2,
fc_layers_size_list=[512, 256],
use_gru=False,
blank_id=0,
ctc_grad_norm_type=None, ):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_direction=rnn_direction,
num_fc_layers=num_fc_layers,
fc_layers_size_list=fc_layers_size_list,
rnn_size=rnn_size,
use_gru=use_gru)
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True, # sum / batch_size
grad_norm_type=ctc_grad_norm_type)
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
Args:
audio (Tensor): [B, T, D]
audio_len (Tensor): [B]
text (Tensor): [B, U]
text_len (Tensor): [B]
Returns:
loss (Tensor): [1]
"""
eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
loss = self.decoder(eouts, eouts_len, text, text_len)
return loss
@paddle.no_grad()
def decode(self, audio, audio_len):
# decoders only accept string encoded in utf-8
# Make sure the decoder has been initialized
eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder(
audio, audio_len, None, None)
probs = self.decoder.softmax(eouts)
batch_size = probs.shape[0]
self.decoder.reset_decoder(batch_size=batch_size)
self.decoder.next(probs, eouts_len)
trans_best, trans_beam = self.decoder.decode()
return trans_best
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
"""Build a DeepSpeech2Model model from a pretrained model.
Parameters
----------
dataloader: paddle.io.DataLoader
config: yacs.config.CfgNode
model configs
checkpoint_path: Path or str
the path of pretrained model checkpoint, without extension name
Returns
-------
DeepSpeech2ModelOnline
The model built from pretrained result.
"""
model = cls(
feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru,
blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
layer_tools.summary(model)
return model
@classmethod
def from_config(cls, config):
"""Build a DeepSpeec2ModelOnline from config
Parameters
config: yacs.config.CfgNode
config
Returns
-------
DeepSpeech2ModelOnline
The model built from config.
"""
model = cls(
feat_size=config.input_dim,
dict_size=config.output_dim,
num_conv_layers=config.num_conv_layers,
num_rnn_layers=config.num_rnn_layers,
rnn_size=config.rnn_layer_size,
rnn_direction=config.rnn_direction,
num_fc_layers=config.num_fc_layers,
fc_layers_size_list=config.fc_layers_size_list,
use_gru=config.use_gru,
blank_id=config.blank_id,
ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), )
return model
class DeepSpeech2InferModelOnline(DeepSpeech2ModelOnline):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box,
chunk_state_c_box):
eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
probs_chunk = self.decoder.softmax(eouts_chunk)
return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
def export(self):
static_model = paddle.jit.to_static(
self,
input_spec=[
paddle.static.InputSpec(
shape=[None, None,
self.encoder.feat_size], #[B, chunk_size, feat_dim]
dtype='float32'),
paddle.static.InputSpec(shape=[None],
dtype='int64'), # audio_length, [B]
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32'),
paddle.static.InputSpec(
shape=[None, None, None], dtype='float32')
])
return static_model

@ -25,7 +25,6 @@ from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.resource import CommonTaskResource from paddlespeech.resource import CommonTaskResource
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.speech import SpeechSegment
from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.tensor_utils import add_sos_eos from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
@ -66,10 +65,13 @@ class PaddleASRConnectionHanddler:
self.text_feature = self.asr_engine.executor.text_feature self.text_feature = self.asr_engine.executor.text_feature
if "deepspeech2" in self.model_type: if "deepspeech2" in self.model_type:
from paddlespeech.s2t.io.collator import SpeechCollator
self.am_predictor = self.asr_engine.executor.am_predictor self.am_predictor = self.asr_engine.executor.am_predictor
self.collate_fn_test = SpeechCollator.from_config(self.model_config) # extract feat, new only fbank in conformer model
self.preprocess_conf = self.model_config.preprocess_config
self.preprocess_args = {"train": False}
self.preprocessing = Transformation(self.preprocess_conf)
self.decoder = CTCDecoder( self.decoder = CTCDecoder(
odim=self.model_config.output_dim, # <blank> is in vocab odim=self.model_config.output_dim, # <blank> is in vocab
enc_n_units=self.model_config.rnn_layer_size * 2, enc_n_units=self.model_config.rnn_layer_size * 2,
@ -89,10 +91,8 @@ class PaddleASRConnectionHanddler:
cfg.num_proc_bsearch) cfg.num_proc_bsearch)
# frame window and frame shift, in samples unit # frame window and frame shift, in samples unit
self.win_length = int(self.model_config.window_ms / 1000 * self.win_length = self.preprocess_conf.process[0]['win_length']
self.sample_rate) self.n_shift = self.preprocess_conf.process[0]['n_shift']
self.n_shift = int(self.model_config.stride_ms / 1000 *
self.sample_rate)
elif "conformer" in self.model_type or "transformer" in self.model_type: elif "conformer" in self.model_type or "transformer" in self.model_type:
# acoustic model # acoustic model
@ -114,20 +114,15 @@ class PaddleASRConnectionHanddler:
raise ValueError(f"Not supported: {self.model_type}") raise ValueError(f"Not supported: {self.model_type}")
def extract_feat(self, samples): def extract_feat(self, samples):
# we compute the elapsed time of first char occuring # we compute the elapsed time of first char occuring
# and we record the start time at the first pcm sample arraving # and we record the start time at the first pcm sample arraving
if "deepspeech2online" in self.model_type: if "deepspeech2online" in self.model_type:
# self.reamined_wav stores all the samples, # self.reamined_wav stores all the samples,
# include the original remained_wav and this package samples # include the original remained_wav and this package samples
samples = np.frombuffer(samples, dtype=np.int16) samples = np.frombuffer(samples, dtype=np.int16)
assert samples.ndim == 1 assert samples.ndim == 1
# pcm16 -> pcm 32
# pcm2float will change the orignal samples,
# so we shoule do pcm2float before concatenate
samples = pcm2float(samples)
if self.remained_wav is None: if self.remained_wav is None:
self.remained_wav = samples self.remained_wav = samples
else: else:
@ -137,26 +132,11 @@ class PaddleASRConnectionHanddler:
f"The connection remain the audio samples: {self.remained_wav.shape}" f"The connection remain the audio samples: {self.remained_wav.shape}"
) )
# read audio # fbank
speech_segment = SpeechSegment.from_pcm( feat = self.preprocessing(self.remained_wav,
self.remained_wav, self.sample_rate, transcript=" ") **self.preprocess_args)
# audio augment feat = paddle.to_tensor(
self.collate_fn_test.augmentation.transform_audio(speech_segment) feat, dtype="float32").unsqueeze(axis=0)
# extract speech feature
spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize(
speech_segment, self.collate_fn_test.keep_transcription_text)
# CMVN spectrum
if self.collate_fn_test._normalizer:
spectrum = self.collate_fn_test._normalizer.apply(spectrum)
# spectrum augment
feat = self.collate_fn_test.augmentation.transform_feature(spectrum)
# audio_len is frame num
frame_num = feat.shape[0]
feat = paddle.to_tensor(feat, dtype='float32')
feat = paddle.unsqueeze(feat, axis=0)
if self.cached_feat is None: if self.cached_feat is None:
self.cached_feat = feat self.cached_feat = feat
@ -170,8 +150,11 @@ class PaddleASRConnectionHanddler:
if self.device is None: if self.device is None:
self.device = self.cached_feat.place self.device = self.cached_feat.place
self.num_frames += frame_num # cur frame step
self.remained_wav = self.remained_wav[self.n_shift * frame_num:] num_frames = feat.shape[1]
self.num_frames += num_frames
self.remained_wav = self.remained_wav[self.n_shift * num_frames:]
logger.info( logger.info(
f"process the audio feature success, the connection feat shape: {self.cached_feat.shape}" f"process the audio feature success, the connection feat shape: {self.cached_feat.shape}"
@ -190,7 +173,7 @@ class PaddleASRConnectionHanddler:
f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}" f"This package receive {samples.shape[0]} pcm data. Global samples:{self.num_samples}"
) )
# self.reamined_wav stores all the samples, # self.reamined_wav stores all the samples,
# include the original remained_wav and this package samples # include the original remained_wav and this package samples
if self.remained_wav is None: if self.remained_wav is None:
self.remained_wav = samples self.remained_wav = samples
@ -246,7 +229,7 @@ class PaddleASRConnectionHanddler:
def reset(self): def reset(self):
if "deepspeech2" in self.model_type: if "deepspeech2" in self.model_type:
# for deepspeech2 # for deepspeech2
# init state # init state
self.chunk_state_h_box = np.zeros( self.chunk_state_h_box = np.zeros(
(self.model_config.num_rnn_layers, 1, (self.model_config.num_rnn_layers, 1,
@ -275,7 +258,7 @@ class PaddleASRConnectionHanddler:
## conformer ## conformer
# cache for conformer online # cache for conformer online
self.subsampling_cache = None self.subsampling_cache = None
self.elayers_output_cache = None self.elayers_output_cache = None
self.conformer_cnn_cache = None self.conformer_cnn_cache = None
@ -359,7 +342,7 @@ class PaddleASRConnectionHanddler:
# update feat cache # update feat cache
self.cached_feat = self.cached_feat[:, end - cached_feature_num:, :] self.cached_feat = self.cached_feat[:, end - cached_feature_num:, :]
# return trans_best[0] # return trans_best[0]
elif "conformer" in self.model_type or "transformer" in self.model_type: elif "conformer" in self.model_type or "transformer" in self.model_type:
try: try:
logger.info( logger.info(
@ -565,7 +548,7 @@ class PaddleASRConnectionHanddler:
@paddle.no_grad() @paddle.no_grad()
def rescoring(self): def rescoring(self):
"""Second-Pass Decoding, """Second-Pass Decoding,
only for conformer and transformer model. only for conformer and transformer model.
""" """
if "deepspeech2" in self.model_type: if "deepspeech2" in self.model_type:
@ -652,11 +635,11 @@ class PaddleASRConnectionHanddler:
## asr results ## asr results
# hyps[0][0]: the sentence word-id in the vocab with a tuple # hyps[0][0]: the sentence word-id in the vocab with a tuple
# hyps[0][1]: the sentence decoding probability with all paths # hyps[0][1]: the sentence decoding probability with all paths
## timestamp ## timestamp
# hyps[0][2]: viterbi_blank ending probability # hyps[0][2]: viterbi_blank ending probability
# hyps[0][3]: viterbi_non_blank dending probability # hyps[0][3]: viterbi_non_blank dending probability
# hyps[0][4]: current_token_prob, # hyps[0][4]: current_token_prob,
# hyps[0][5]: times_viterbi_blank ending timestamp, # hyps[0][5]: times_viterbi_blank ending timestamp,
# hyps[0][6]: times_titerbi_non_blank encding timestamp. # hyps[0][6]: times_titerbi_non_blank encding timestamp.
self.hyps = [hyps[best_index][0]] self.hyps = [hyps[best_index][0]]
logger.info(f"best hyp ids: {self.hyps}") logger.info(f"best hyp ids: {self.hyps}")
@ -752,16 +735,19 @@ class ASRServerExecutor(ASRExecutor):
self.config = CfgNode(new_allowed=True) self.config = CfgNode(new_allowed=True)
self.config.merge_from_file(self.cfg_path) self.config.merge_from_file(self.cfg_path)
if self.config.spm_model_prefix:
self.config.spm_model_prefix = os.path.join(
self.res_path, self.config.spm_model_prefix)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath,
spm_model_prefix=self.config.spm_model_prefix)
self.vocab = self.config.vocab_filepath
with UpdateConfig(self.config): with UpdateConfig(self.config):
if "deepspeech2" in model_type: if "deepspeech2" in model_type:
from paddlespeech.s2t.io.collator import SpeechCollator
self.vocab = self.config.vocab_filepath
self.config.decode.lang_model_path = os.path.join( self.config.decode.lang_model_path = os.path.join(
MODEL_HOME, 'language_model', MODEL_HOME, 'language_model',
self.config.decode.lang_model_path) self.config.decode.lang_model_path)
self.collate_fn_test = SpeechCollator.from_config(self.config)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
lm_url = self.task_resource.res_dict['lm_url'] lm_url = self.task_resource.res_dict['lm_url']
lm_md5 = self.task_resource.res_dict['lm_md5'] lm_md5 = self.task_resource.res_dict['lm_md5']
@ -772,14 +758,6 @@ class ASRServerExecutor(ASRExecutor):
elif "conformer" in model_type or "transformer" in model_type: elif "conformer" in model_type or "transformer" in model_type:
logger.info("start to create the stream conformer asr engine") logger.info("start to create the stream conformer asr engine")
if self.config.spm_model_prefix:
self.config.spm_model_prefix = os.path.join(
self.res_path, self.config.spm_model_prefix)
self.vocab = self.config.vocab_filepath
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath,
spm_model_prefix=self.config.spm_model_prefix)
# update the decoding method # update the decoding method
if decode_method: if decode_method:
self.config.decode.decoding_method = decode_method self.config.decode.decoding_method = decode_method
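The refactor above hoists the TextFeaturizer construction out of the per-model branches; a hedged sketch of that shared setup follows, with placeholder paths standing in for the real resource directory and vocab:

    import os
    from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer

    res_path = "/path/to/model/resources"      # placeholder resource dir
    spm_model_prefix = ""                       # '' for char models, a sentencepiece prefix otherwise
    if spm_model_prefix:
        spm_model_prefix = os.path.join(res_path, spm_model_prefix)
    text_feature = TextFeaturizer(
        unit_type="char",                       # matches unit_type in the server config
        vocab="data/lang_char/vocab.txt",       # placeholder vocab path
        spm_model_prefix=spm_model_prefix)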

@ -54,6 +54,7 @@ class ASRServerExecutor(ASRExecutor):
self.max_len = 50 self.max_len = 50
sample_rate_str = '16k' if sample_rate == 16000 else '8k' sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str tag = model_type + '-' + lang + '-' + sample_rate_str
self.max_len = 50
self.task_resource.set_task_model(model_tag=tag) self.task_resource.set_task_model(model_tag=tag)
if cfg_path is None or am_model is None or am_params is None: if cfg_path is None or am_model is None or am_params is None:
self.res_path = self.task_resource.res_dir self.res_path = self.task_resource.res_dir
@ -80,22 +81,25 @@ class ASRServerExecutor(ASRExecutor):
self.config.merge_from_file(self.cfg_path) self.config.merge_from_file(self.cfg_path)
with UpdateConfig(self.config): with UpdateConfig(self.config):
if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: if "deepspeech2" in model_type:
from paddlespeech.s2t.io.collator import SpeechCollator
self.vocab = self.config.vocab_filepath self.vocab = self.config.vocab_filepath
if self.config.spm_model_prefix:
self.config.spm_model_prefix = os.path.join(
self.res_path, self.config.spm_model_prefix)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type,
vocab=self.vocab,
spm_model_prefix=self.config.spm_model_prefix)
self.config.decode.lang_model_path = os.path.join( self.config.decode.lang_model_path = os.path.join(
MODEL_HOME, 'language_model', MODEL_HOME, 'language_model',
self.config.decode.lang_model_path) self.config.decode.lang_model_path)
self.collate_fn_test = SpeechCollator.from_config(self.config)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
lm_url = self.task_resource.res_dict['lm_url'] lm_url = self.task_resource.res_dict['lm_url']
lm_md5 = self.task_resource.res_dict['lm_md5'] lm_md5 = self.task_resource.res_dict['lm_md5']
self.download_lm( self.download_lm(
lm_url, lm_url,
os.path.dirname(self.config.decode.lang_model_path), lm_md5) os.path.dirname(self.config.decode.lang_model_path), lm_md5)
elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: elif "conformer" in model_type or "transformer" in model_type:
raise Exception("wrong type") raise Exception("wrong type")
else: else:
raise Exception("wrong type") raise Exception("wrong type")
@ -125,7 +129,7 @@ class ASRServerExecutor(ASRExecutor):
cfg = self.config.decode cfg = self.config.decode
audio = self._inputs["audio"] audio = self._inputs["audio"]
audio_len = self._inputs["audio_len"] audio_len = self._inputs["audio_len"]
if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: if "deepspeech2" in model_type:
decode_batch_size = audio.shape[0] decode_batch_size = audio.shape[0]
# init once # init once
self.decoder.init_decoder( self.decoder.init_decoder(
@ -222,10 +226,9 @@ class PaddleASRConnectionHandler(ASRServerExecutor):
self.decoder = self.executor.decoder self.decoder = self.executor.decoder
self.am_predictor = self.executor.am_predictor self.am_predictor = self.executor.am_predictor
self.text_feature = self.executor.text_feature self.text_feature = self.executor.text_feature
self.collate_fn_test = self.executor.collate_fn_test
def run(self, audio_data): def run(self, audio_data):
"""engine run """engine run
Args: Args:
audio_data (bytes): base64.b64decode audio_data (bytes): base64.b64decode

@ -40,7 +40,7 @@ class TTSServerExecutor(TTSExecutor):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.task_resource = CommonTaskResource( self.task_resource = CommonTaskResource(
task='tts', model_format='static', inference_mode='online') task='tts', model_format='dynamic', inference_mode='online')
def get_model_info(self, def get_model_info(self,

@ -142,4 +142,3 @@ set(DEPS ${DEPS}
set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx) set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx)
add_subdirectory(speechx) add_subdirectory(speechx)
add_subdirectory(examples)

@ -71,7 +71,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
recognizer_test_main \ recognizer_test_main \
--wav_rspecifier=scp:$wav_scp \ --wav_rspecifier=scp:$wav_scp \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--streaming_chunk=30 \
--use_fbank=true \ --use_fbank=true \
--model_path=$model_dir/avg_10.jit.pdmodel \ --model_path=$model_dir/avg_10.jit.pdmodel \
--param_path=$model_dir/avg_10.jit.pdiparams \ --param_path=$model_dir/avg_10.jit.pdiparams \

@ -2,13 +2,5 @@
## Examples ## Examples
* `websocket` - Streaming ASR with websocket. * `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
* `aishell` - Streaming Decoding under aishell dataset, for local WER test.
* `aishell` - Streaming Decoding under aishell dataset, for local WER test.
## More
> The below is for developing and offline testing. Do not run it only if you know what it is.
* nnet
* feat
* decoder

@ -20,5 +20,5 @@ export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm export SRILM=${MAIN_ROOT}/tools/srilm
SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/websocket SPEECHX_BIN=$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN:${SRILM}/bin:${SRILM}/bin/i686-m64:$KALDI_DIR/lmbin:$KALDI_DIR/fstbin:$OPENFST_DIR/bin

@ -78,7 +78,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--streaming_chunk=0.36
echo "feature make have finished!!!" echo "feature make have finished!!!"
fi fi
@ -155,7 +154,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \ --model_path=$model_dir/avg_1.jit.pdmodel \
--streaming_chunk=30 \
--param_path=$model_dir/avg_1.jit.pdiparams \ --param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

@ -152,7 +152,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_5.jit.pdmodel \ --model_path=$model_dir/avg_5.jit.pdmodel \
--streaming_chunk=30 \
--use_fbank=true \ --use_fbank=true \
--param_path=$model_dir/avg_5.jit.pdiparams \ --param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \

@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_AL=C export LC_AL=C
SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket SPEECHX_BIN=$SPEECHX_BUILD/protocol/websocket:$SPEECHX_BUILD/frontend/audio
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN

@ -32,4 +32,4 @@ export GLOG_logtostderr=1
# websocket client # websocket client
websocket_client_main \ websocket_client_main \
--wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.36 --wav_rspecifier=scp:$data/$aishell_wav_scp --streaming_chunk=0.5

@ -4,7 +4,6 @@ set -e
. path.sh . path.sh
# 1. compile # 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then if [ ! -d ${SPEECHX_EXAMPLES} ]; then
pushd ${SPEECHX_ROOT} pushd ${SPEECHX_ROOT}
@ -19,19 +18,6 @@ ckpt_dir=$data/model
model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/ model_dir=$ckpt_dir/exp/deepspeech2_online/checkpoints/
vocb_dir=$ckpt_dir/data/lang_char/ vocb_dir=$ckpt_dir/data/lang_char/
# output
aishell_wav_scp=aishell_test.scp
if [ ! -d $data/test ]; then
pushd $data
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip aishell_test.zip
popd
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi
if [ ! -f $ckpt_dir/data/mean_std.json ]; then if [ ! -f $ckpt_dir/data/mean_std.json ]; then
mkdir -p $ckpt_dir mkdir -p $ckpt_dir
@ -62,7 +48,6 @@ fi
websocket_server_main \ websocket_server_main \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--model_path=$model_dir/avg_1.jit.pdmodel \ --model_path=$model_dir/avg_1.jit.pdmodel \
--streaming_chunk=0.1 \
--param_path=$model_dir/avg_1.jit.pdiparams \ --param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \ --word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \

@ -25,7 +25,6 @@ DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
// feature, or fbank"); // feature, or fbank");
DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_int32(num_bins, 161, "num bins of mel");
DEFINE_string(cmvn_file, "", "read cmvn"); DEFINE_string(cmvn_file, "", "read cmvn");
DEFINE_double(streaming_chunk, 0.1, "streaming feature chunk size");
// feature sliding window // feature sliding window
DEFINE_int32(receptive_field_length, DEFINE_int32(receptive_field_length,
7, 7,
@ -62,7 +61,6 @@ namespace ppspeech {
FeaturePipelineOptions InitFeaturePipelineOptions() { FeaturePipelineOptions InitFeaturePipelineOptions() {
FeaturePipelineOptions opts; FeaturePipelineOptions opts;
opts.cmvn_file = FLAGS_cmvn_file; opts.cmvn_file = FLAGS_cmvn_file;
opts.linear_spectrogram_opts.streaming_chunk = FLAGS_streaming_chunk;
kaldi::FrameExtractionOptions frame_opts; kaldi::FrameExtractionOptions frame_opts;
frame_opts.dither = 0.0; frame_opts.dither = 0.0;
frame_opts.frame_shift_ms = 10; frame_opts.frame_shift_ms = 10;
@ -71,8 +69,8 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts.to_float32 = false; opts.to_float32 = false;
frame_opts.window_type = "povey"; frame_opts.window_type = "povey";
frame_opts.frame_length_ms = 25; frame_opts.frame_length_ms = 25;
opts.fbank_opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins;
opts.fbank_opts.fbank_opts.frame_opts = frame_opts; opts.fbank_opts.frame_opts = frame_opts;
} else { } else {
opts.to_float32 = true; opts.to_float32 = true;
frame_opts.remove_dc_offset = false; frame_opts.remove_dc_offset = false;

@ -19,6 +19,7 @@
DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(wav_rspecifier, "", "test feature rspecifier");
DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier");
DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
DEFINE_int32(sample_rate, 16000, "sample rate"); DEFINE_int32(sample_rate, 16000, "sample rate");
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
@ -96,4 +97,4 @@ int main(int argc, char* argv[]) {
KALDI_LOG << " cost:" << elapsed << " s"; KALDI_LOG << " cost:" << elapsed << " s";
KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s"; KALDI_LOG << "total wav duration is: " << tot_wav_duration << " s";
KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration; KALDI_LOG << "the RTF is: " << elapsed / tot_wav_duration;
} }

@ -30,8 +30,9 @@ class AudioCache : public FrontendInterface {
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves); virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
// the audio dim is 1, one sample // the audio dim is 1, one sample, which is useless,
virtual size_t Dim() const { return 1; } // so we return size_(cache samples) instead.
virtual size_t Dim() const { return size_; }
virtual void SetFinished() { virtual void SetFinished() {
std::lock_guard<std::mutex> lock(mutex_); std::lock_guard<std::mutex> lock(mutex_);

@ -49,12 +49,11 @@ int main(int argc, char* argv[]) {
std::unique_ptr<ppspeech::FrontendInterface> data_source( std::unique_ptr<ppspeech::FrontendInterface> data_source(
new ppspeech::AudioCache(3600 * 1600, false)); new ppspeech::AudioCache(3600 * 1600, false));
ppspeech::FbankOptions opt; kaldi::FbankOptions opt;
opt.fbank_opts.frame_opts.frame_length_ms = 25; opt.frame_opts.frame_length_ms = 25;
opt.fbank_opts.frame_opts.frame_shift_ms = 10; opt.frame_opts.frame_shift_ms = 10;
opt.streaming_chunk = FLAGS_streaming_chunk; opt.mel_opts.num_bins = FLAGS_num_bins;
opt.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; opt.frame_opts.dither = 0.0;
opt.fbank_opts.frame_opts.dither = 0.0;
std::unique_ptr<ppspeech::FrontendInterface> fbank( std::unique_ptr<ppspeech::FrontendInterface> fbank(
new ppspeech::Fbank(opt, std::move(data_source))); new ppspeech::Fbank(opt, std::move(data_source)));

@ -49,7 +49,6 @@ int main(int argc, char* argv[]) {
ppspeech::LinearSpectrogramOptions opt; ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20; opt.frame_opts.frame_length_ms = 20;
opt.frame_opts.frame_shift_ms = 10; opt.frame_opts.frame_shift_ms = 10;
opt.streaming_chunk = FLAGS_streaming_chunk;
opt.frame_opts.dither = 0.0; opt.frame_opts.dither = 0.0;
opt.frame_opts.remove_dc_offset = false; opt.frame_opts.remove_dc_offset = false;
opt.frame_opts.window_type = "hanning"; opt.frame_opts.window_type = "hanning";

@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "frontend/audio/fbank.h" #include "frontend/audio/fbank.h"
#include "kaldi/base/kaldi-math.h" #include "kaldi/base/kaldi-math.h"
#include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-common.h"
@ -29,95 +28,33 @@ using kaldi::VectorBase;
using kaldi::Matrix; using kaldi::Matrix;
using std::vector; using std::vector;
// todo refactor later:(SmileGoat) FbankComputer::FbankComputer(const Options& opts)
Fbank::Fbank(const FbankOptions& opts,
std::unique_ptr<FrontendInterface> base_extractor)
: opts_(opts), : opts_(opts),
computer_(opts.fbank_opts), computer_(opts) {}
window_function_(opts.fbank_opts.frame_opts) {
base_extractor_ = std::move(base_extractor);
chunk_sample_size_ = static_cast<int32>(
opts.streaming_chunk * opts.fbank_opts.frame_opts.samp_freq);
}
void Fbank::Accept(const VectorBase<BaseFloat>& inputs) { int32 FbankComputer::Dim() const {
base_extractor_->Accept(inputs); return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
} }
bool Fbank::Read(Vector<BaseFloat>* feats) { bool FbankComputer::NeedRawLogEnergy() {
Vector<BaseFloat> wav(chunk_sample_size_); return opts_.use_energy && opts_.raw_energy;
bool flag = base_extractor_->Read(&wav);
if (flag == false || wav.Dim() == 0) return false;
// append remaned waves
int32 wav_len = wav.Dim();
int32 left_len = remained_wav_.Dim();
Vector<BaseFloat> waves(left_len + wav_len);
waves.Range(0, left_len).CopyFromVec(remained_wav_);
waves.Range(left_len, wav_len).CopyFromVec(wav);
// compute speech feature
Compute(waves, feats);
// cache remaned waves
kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
int32 frame_shift = frame_opts.WindowShift();
int32 left_samples = waves.Dim() - frame_shift * num_frames;
remained_wav_.Resize(left_samples);
remained_wav_.CopyFromVec(
waves.Range(frame_shift * num_frames, left_samples));
return true;
} }
// Compute spectrogram feat // Compute feat
bool Fbank::Compute(const Vector<BaseFloat>& waves, Vector<BaseFloat>* feats) { bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
const kaldi::FrameExtractionOptions& frame_opts = RealFft(window, true);
computer_.GetFrameOptions(); kaldi::ComputePowerSpectrum(window);
int32 num_samples = waves.Dim(); const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
int32 frame_length = frame_opts.WindowSize(); SubVector<BaseFloat> power_spectrum(*window, 0, window->Dim() / 2 + 1);
int32 sample_rate = frame_opts.samp_freq; if (!opts_.use_power) {
if (num_samples < frame_length) { power_spectrum.ApplyPow(0.5);
return true;
}
int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
feats->Resize(num_frames * Dim());
Vector<BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
for (int32 frame = 0; frame < num_frames; frame++) {
BaseFloat raw_log_energy = 0.0;
kaldi::ExtractWindow(0,
waves,
frame,
frame_opts,
window_function_,
&window,
need_raw_log_energy ? &raw_log_energy : NULL);
Vector<BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
// note: this online feature-extraction code does not support VTLN.
RealFft(&window, true);
kaldi::ComputePowerSpectrum(&window);
const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
SubVector<BaseFloat> power_spectrum(window, 0, window.Dim() / 2 + 1);
if (!opts_.fbank_opts.use_power) {
power_spectrum.ApplyPow(0.5);
}
int32 mel_offset =
((opts_.fbank_opts.use_energy && !opts_.fbank_opts.htk_compat) ? 1
: 0);
SubVector<BaseFloat> mel_energies(
this_feature, mel_offset, opts_.fbank_opts.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog();
SubVector<BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
output_row.CopyFromVec(this_feature);
} }
int32 mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0);
SubVector<BaseFloat> mel_energies(
*feat, mel_offset, opts_.mel_opts.num_bins);
mel_bank.Compute(power_spectrum, &mel_energies);
mel_energies.ApplyFloor(1e-07);
mel_energies.ApplyLog();
return true; return true;
} }

@ -15,6 +15,7 @@
#pragma once #pragma once
#include "base/common.h" #include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h" #include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-fbank.h" #include "kaldi/feat/feature-fbank.h"
#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h"
@ -22,56 +23,28 @@
namespace ppspeech { namespace ppspeech {
struct FbankOptions { class FbankComputer {
kaldi::FbankOptions fbank_opts;
kaldi::BaseFloat streaming_chunk; // second
FbankOptions() : streaming_chunk(0.1), fbank_opts() {}
void Register(kaldi::OptionsItf* opts) {
opts->Register("streaming-chunk",
&streaming_chunk,
"streaming chunk size, default: 0.1 sec");
fbank_opts.Register(opts);
}
};
class Fbank : public FrontendInterface {
public: public:
explicit Fbank(const FbankOptions& opts, typedef kaldi::FbankOptions Options;
std::unique_ptr<FrontendInterface> base_extractor); explicit FbankComputer(const Options& opts);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature kaldi::FrameExtractionOptions& GetFrameOptions() {
virtual size_t Dim() const { return computer_.Dim(); } return opts_.frame_opts;
}
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); } bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
kaldi::Vector<kaldi::BaseFloat>* feat);
int32 Dim() const;
virtual void Reset() { bool NeedRawLogEnergy();
base_extractor_->Reset();
remained_wav_.Resize(0);
}
private: private:
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, Options opts_;
kaldi::Vector<kaldi::BaseFloat>* feats);
FbankOptions opts_;
std::unique_ptr<FrontendInterface> base_extractor_;
kaldi::FeatureWindowFunction window_function_;
kaldi::FbankComputer computer_; kaldi::FbankComputer computer_;
// features_ is the Mfcc or Plp or Fbank features that we have already DISALLOW_COPY_AND_ASSIGN(FbankComputer);
// computed.
kaldi::Vector<kaldi::BaseFloat> features_;
kaldi::Vector<kaldi::BaseFloat> remained_wav_;
kaldi::int32 chunk_sample_size_;
DISALLOW_COPY_AND_ASSIGN(Fbank);
}; };
typedef StreamingFeatureTpl<FbankComputer> Fbank;
} // namespace ppspeech } // namespace ppspeech

@ -0,0 +1,54 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "frontend_itf.h"
#include "kaldi/feat/feature-window.h"
namespace ppspeech {
template <class F>
class StreamingFeatureTpl : public FrontendInterface {
public:
typedef typename F::Options Options;
StreamingFeatureTpl(const Options& opts,
std::unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
// the dim_ is the dim of single frame feature
virtual size_t Dim() const { return computer_.Dim(); }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
remained_wav_.Resize(0);
}
private:
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
kaldi::Vector<kaldi::BaseFloat>* feats);
Options opts_;
std::unique_ptr<FrontendInterface> base_extractor_;
kaldi::FeatureWindowFunction window_function_;
kaldi::Vector<kaldi::BaseFloat> remained_wav_;
F computer_;
};
} // namespace ppspeech
#include "frontend/audio/feature_common_inl.h"

@ -0,0 +1,95 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
namespace ppspeech {
template <class F>
StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts,
std::unique_ptr<FrontendInterface> base_extractor):
opts_(opts),
computer_(opts),
window_function_(opts.frame_opts) {
base_extractor_ = std::move(base_extractor);
}
template <class F>
void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
base_extractor_->Accept(waves);
}
template <class F>
bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
kaldi::Vector<kaldi::BaseFloat> wav(base_extractor_->Dim());
bool flag = base_extractor_->Read(&wav);
if (flag == false || wav.Dim() == 0) return false;
// append remaned waves
int32 wav_len = wav.Dim();
int32 left_len = remained_wav_.Dim();
kaldi::Vector<kaldi::BaseFloat> waves(left_len + wav_len);
waves.Range(0, left_len).CopyFromVec(remained_wav_);
waves.Range(left_len, wav_len).CopyFromVec(wav);
// compute speech feature
Compute(waves, feats);
// cache remaned waves
kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions();
int32 num_frames = kaldi::NumFrames(waves.Dim(), frame_opts);
int32 frame_shift = frame_opts.WindowShift();
int32 left_samples = waves.Dim() - frame_shift * num_frames;
remained_wav_.Resize(left_samples);
remained_wav_.CopyFromVec(
waves.Range(frame_shift * num_frames, left_samples));
return true;
}
// Compute feat
template <class F>
bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
kaldi::Vector<kaldi::BaseFloat>* feats) {
const kaldi::FrameExtractionOptions& frame_opts =
computer_.GetFrameOptions();
int32 num_samples = waves.Dim();
int32 frame_length = frame_opts.WindowSize();
int32 sample_rate = frame_opts.samp_freq;
if (num_samples < frame_length) {
return true;
}
int32 num_frames = kaldi::NumFrames(num_samples, frame_opts);
feats->Resize(num_frames * Dim());
kaldi::Vector<kaldi::BaseFloat> window;
bool need_raw_log_energy = computer_.NeedRawLogEnergy();
for (int32 frame = 0; frame < num_frames; frame++) {
kaldi::BaseFloat raw_log_energy = 0.0;
kaldi::ExtractWindow(0,
waves,
frame,
frame_opts,
window_function_,
&window,
need_raw_log_energy ? &raw_log_energy : NULL);
kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
computer_.Compute(&window, &this_feature);
kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
output_row.CopyFromVec(this_feature);
}
return true;
}
} // namespace ppspeech

@ -32,7 +32,7 @@ struct FeaturePipelineOptions {
bool to_float32; // true, only for linear feature bool to_float32; // true, only for linear feature
bool use_fbank; bool use_fbank;
LinearSpectrogramOptions linear_spectrogram_opts; LinearSpectrogramOptions linear_spectrogram_opts;
FbankOptions fbank_opts; kaldi::FbankOptions fbank_opts;
FeatureCacheOptions feature_cache_opts; FeatureCacheOptions feature_cache_opts;
AssemblerOptions assembler_opts; AssemblerOptions assembler_opts;

@ -28,81 +28,32 @@ using kaldi::VectorBase;
using kaldi::Matrix; using kaldi::Matrix;
using std::vector; using std::vector;
LinearSpectrogram::LinearSpectrogram( LinearSpectrogramComputer::LinearSpectrogramComputer(
const LinearSpectrogramOptions& opts, const Options& opts)
std::unique_ptr<FrontendInterface> base_extractor) : opts_(opts) {
: opts_(opts), feature_window_funtion_(opts.frame_opts) { kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
base_extractor_ = std::move(base_extractor);
int32 window_size = opts.frame_opts.WindowSize(); int32 window_size = opts.frame_opts.WindowSize();
int32 window_shift = opts.frame_opts.WindowShift(); frame_length_ = window_size;
dim_ = window_size / 2 + 1; dim_ = window_size / 2 + 1;
chunk_sample_size_ = BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq); feature_window_function.window);
hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window, int32 sample_rate = opts.frame_opts.samp_freq;
feature_window_funtion_.window); scale_ = 2.0 / (hanning_window_energy * sample_rate);
}
void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
base_extractor_->Accept(inputs);
}
bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
Vector<BaseFloat> input_feats(chunk_sample_size_);
bool flag = base_extractor_->Read(&input_feats);
if (flag == false || input_feats.Dim() == 0) return false;
int32 feat_len = input_feats.Dim();
int32 left_len = remained_wav_.Dim();
Vector<BaseFloat> waves(feat_len + left_len);
waves.Range(0, left_len).CopyFromVec(remained_wav_);
waves.Range(left_len, feat_len).CopyFromVec(input_feats);
Compute(waves, feats);
int32 frame_shift = opts_.frame_opts.WindowShift();
int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
int32 left_samples = waves.Dim() - frame_shift * num_frames;
remained_wav_.Resize(left_samples);
remained_wav_.CopyFromVec(
waves.Range(frame_shift * num_frames, left_samples));
return true;
} }
// Compute spectrogram feat // Compute spectrogram feat
bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves, bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
Vector<BaseFloat>* feats) { Vector<BaseFloat>* feat) {
int32 num_samples = waves.Dim(); window->Resize(frame_length_, kaldi::kCopyData);
int32 frame_length = opts_.frame_opts.WindowSize(); RealFft(window, true);
int32 sample_rate = opts_.frame_opts.samp_freq; kaldi::ComputePowerSpectrum(window);
BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate); SubVector<BaseFloat> power_spectrum(*window, 0, dim_);
power_spectrum.Scale(scale_);
if (num_samples < frame_length) { power_spectrum(0) = power_spectrum(0) / 2;
return true; power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
} power_spectrum.Add(1e-14);
power_spectrum.ApplyLog();
int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts); feat->CopyFromVec(power_spectrum);
feats->Resize(num_frames * dim_);
Vector<BaseFloat> window;
for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
kaldi::ExtractWindow(0,
waves,
frame_idx,
opts_.frame_opts,
feature_window_funtion_,
&window,
NULL);
SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
window.Resize(frame_length, kaldi::kCopyData);
RealFft(&window, true);
kaldi::ComputePowerSpectrum(&window);
SubVector<BaseFloat> power_spectrum(window, 0, dim_);
power_spectrum.Scale(scale);
power_spectrum(0) = power_spectrum(0) / 2;
power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
power_spectrum.Add(1e-14);
power_spectrum.ApplyLog();
output_row.CopyFromVec(power_spectrum);
}
return true; return true;
} }

@ -16,6 +16,7 @@
#pragma once #pragma once
#include "base/common.h" #include "base/common.h"
#include "frontend/audio/feature_common.h"
#include "frontend/audio/frontend_itf.h" #include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-window.h" #include "kaldi/feat/feature-window.h"
@ -23,47 +24,34 @@ namespace ppspeech {
struct LinearSpectrogramOptions { struct LinearSpectrogramOptions {
kaldi::FrameExtractionOptions frame_opts; kaldi::FrameExtractionOptions frame_opts;
kaldi::BaseFloat streaming_chunk; // second LinearSpectrogramOptions() : frame_opts() {}
LinearSpectrogramOptions() : streaming_chunk(0.1), frame_opts() {}
void Register(kaldi::OptionsItf* opts) {
opts->Register("streaming-chunk",
&streaming_chunk,
"streaming chunk size, default: 0.1 sec");
frame_opts.Register(opts);
}
}; };
class LinearSpectrogram : public FrontendInterface { class LinearSpectrogramComputer {
public: public:
explicit LinearSpectrogram( typedef LinearSpectrogramOptions Options;
const LinearSpectrogramOptions& opts, explicit LinearSpectrogramComputer(const Options& opts);
std::unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs); kaldi::FrameExtractionOptions& GetFrameOptions() {
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats); return opts_.frame_opts;
// the dim_ is the dim of single frame feature
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
virtual void Reset() {
base_extractor_->Reset();
remained_wav_.Resize(0);
} }
private: bool Compute(kaldi::Vector<kaldi::BaseFloat>* window,
bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, kaldi::Vector<kaldi::BaseFloat>* feat);
kaldi::Vector<kaldi::BaseFloat>* feats);
size_t dim_; int32 Dim() const { return dim_; }
kaldi::FeatureWindowFunction feature_window_funtion_;
kaldi::BaseFloat hanning_window_energy_; bool NeedRawLogEnergy() { return false; }
LinearSpectrogramOptions opts_;
std::unique_ptr<FrontendInterface> base_extractor_; private:
kaldi::Vector<kaldi::BaseFloat> remained_wav_; kaldi::BaseFloat scale_;
int chunk_sample_size_; Options opts_;
DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram); int32 frame_length_;
int32 dim_;
DISALLOW_COPY_AND_ASSIGN(LinearSpectrogramComputer);
}; };
typedef StreamingFeatureTpl<LinearSpectrogramComputer> LinearSpectrogram;
} // namespace ppspeech } // namespace ppspeech

@ -1,5 +1,4 @@
add_library(utils add_library(utils
file_utils.cc file_utils.cc
simdjson.cpp )
)

File diff suppressed because it is too large

File diff suppressed because it is too large