diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml
index bdfa4219..1dc8581e 100644
--- a/examples/aishell/asr0/conf/deepspeech2.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2.yaml
@@ -1,68 +1,64 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.0
-  max_input_len: 27.0 # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+min_input_len: 0.0
+max_input_len: 27.0 # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 64 # one gpu
-  mean_std_filepath: data/mean_std.json
-  unit_type: char
-  vocab_filepath: data/lang_char/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear
-  feat_dim:
-  delta_delta: False
-  stride_ms: 10.0
-  window_ms: 20.0
-  n_fft: None
-  max_freq: None
-  target_sample_rate: 16000
-  use_dB_normalization: True
-  target_dB: -20
-  dither: 1.0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+#              Dataloader                 #
+###########################################
+batch_size: 64 # one gpu
+mean_std_filepath: data/mean_std.json
+unit_type: char
+vocab_filepath: data/lang_char/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear
+feat_dim:
+delta_delta: False
+stride_ms: 10.0
+window_ms: 20.0
+n_fft: None
+max_freq: None
+target_sample_rate: 16000
+use_dB_normalization: True
+target_dB: -20
+dither: 1.0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 3
-  rnn_layer_size: 1024
-  use_gru: True
-  share_rnn_weights: False
-  blank_id: 0
-  ctc_grad_norm_type: instance
+############################################
+#            Network Architecture          #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 3
+rnn_layer_size: 1024
+use_gru: True
+share_rnn_weights: False
+blank_id: 0
+ctc_grad_norm_type: instance
 
-training:
-  n_epoch: 80
-  accum_grad: 1
-  lr: 2e-3
-  lr_decay: 0.83
-  weight_decay: 1e-06
-  global_grad_clip: 3.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-  alpha: 1.9
-  beta: 5.0
-  beam_size: 300
-  cutoff_prob: 0.99
-  cutoff_top_n: 40
-  num_proc_bsearch: 10
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 80
+accum_grad: 1
+lr: 2e-3
+lr_decay: 0.83
+weight_decay: 1e-06
+global_grad_clip: 3.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml
index 2f63f4de..c49973a2 100644
--- a/examples/aishell/asr0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml
@@ -1,70 +1,68 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.0
-  max_input_len: 27.0 # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+min_input_len: 0.0
+max_input_len: 27.0 # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 64 # one gpu
-  mean_std_filepath: data/mean_std.json
-  unit_type: char
-  vocab_filepath: data/lang_char/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear #linear, mfcc, fbank
-  feat_dim:
-  delta_delta: False
-  stride_ms: 10.0
-  window_ms: 20.0
-  n_fft: None
-  max_freq: None
-  target_sample_rate: 16000
-  use_dB_normalization: True
-  target_dB: -20
-  dither: 1.0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0
+###########################################
+#              Dataloader                 #
+###########################################
+batch_size: 64 # one gpu
+mean_std_filepath: data/mean_std.json
+unit_type: char
+vocab_filepath: data/lang_char/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear #linear, mfcc, fbank
+feat_dim:
+delta_delta: False
+stride_ms: 10.0
+window_ms: 20.0
+n_fft: None
+max_freq: None
+target_sample_rate: 16000
+use_dB_normalization: True
+target_dB: -20
+dither: 1.0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 0
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 5
-  rnn_layer_size: 1024
-  rnn_direction: forward # [forward, bidirect]
-  num_fc_layers: 0
-  fc_layers_size_list: -1,
-  use_gru: False
-  blank_id: 0
+############################################
+#            Network Architecture          #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 5
+rnn_layer_size: 1024
+rnn_direction: forward # [forward, bidirect]
+num_fc_layers: 0
+fc_layers_size_list: -1,
+use_gru: False
+blank_id: 0
 
-training:
-  n_epoch: 65
-  accum_grad: 1
-  lr: 5e-4
-  lr_decay: 0.93
-  weight_decay: 1e-06
-  global_grad_clip: 3.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 65
+accum_grad: 1
+lr: 5e-4
+lr_decay: 0.93
+weight_decay: 1e-06
+global_grad_clip: 3.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
+
-decoding:
-  batch_size: 32
-  error_rate_type: cer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
-  alpha: 2.2 #1.9
-  beta: 4.3
-  beam_size: 300
-  cutoff_prob: 0.99
-  cutoff_top_n: 40
-  num_proc_bsearch: 10
diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
new file mode 100644
index 00000000..9de06711
--- /dev/null
+++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 32
+error_rate_type: cer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
+alpha: 2.2 #1.9
+beta: 4.3
+beam_size: 300
+cutoff_prob: 0.99
+cutoff_top_n: 40
+num_proc_bsearch: 10
diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml
new file mode 100644
index 00000000..5778e656
--- /dev/null
+++ b/examples/aishell/asr0/conf/tuning/decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
+alpha: 1.9
+beta: 5.0
+beam_size: 300
+cutoff_prob: 0.99
+cutoff_top_n: 40
+num_proc_bsearch: 10
diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh
index 8cbff235..463593ef 100755
--- a/examples/aishell/asr0/local/test.sh
+++ b/examples/aishell/asr0/local/test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
 
 # download language model
 bash local/download_lm_ch.sh
@@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh
index 4f5e5c8b..7a4b87f8 100755
--- a/examples/aishell/asr0/local/test_export.sh
+++ b/examples/aishell/asr0/local/test_export.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-jit_model_export_path=$2
-model_type=$3
+decode_config_path=$2
+jit_model_export_path=$3
+model_type=$4
 
 # download language model
 bash local/download_lm_ch.sh > /dev/null 2>&1
@@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test_export.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${jit_model_export_path}.rsl \
 --export_path ${jit_model_export_path} \
 --model_type ${model_type}
diff --git a/examples/aishell/asr0/local/test_hub_ori b/examples/aishell/asr0/local/test_hub_ori
new file mode 100755
index 00000000..ee1fb805
--- /dev/null
+++ b/examples/aishell/asr0/local/test_hub_ori
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+model_type=$3
+audio_file=$4
+
+mkdir -p data
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+if [ ! -f ${audio_file} ]; then
+    echo "Please input the right audio_file path"
+    exit 1
+fi
+
+# download language model
+bash local/download_lm_ch.sh
+if [ $? -ne 0 ]; then
+    exit 1
+fi
+
+python3 -u ${BIN_DIR}/test_hub.py \
+--nproc ${ngpu} \
+--config ${config_path} \
+--result_file ${ckpt_prefix}.rsl \
+--checkpoint_path ${ckpt_prefix} \
+--model_type ${model_type} \
+--audio_file ${audio_file}
+
+if [ $? -ne 0 ]; then
+    echo "Failed in evaluation!"
+    exit 1
+fi
+
+
+exit 0
diff --git a/examples/aishell/asr0/local/test_wav.sh b/examples/aishell/asr0/local/test_wav.sh
index 4a6d92fb..62b005a6 100755
--- a/examples/aishell/asr0/local/test_wav.sh
+++ b/examples/aishell/asr0/local/test_wav.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 5 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
     exit -1
 fi
 
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
-audio_file=$4
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
+audio_file=$5
 
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@@ -33,6 +34,7 @@ fi
 python3 -u ${BIN_DIR}/test_wav.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type} \
diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh
index 270b88fc..15685f21 100755
--- a/examples/aishell/asr0/run.sh
+++ b/examples/aishell/asr0/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=1
 model_type=offline # offline or online
 audio_file=data/demo_01_03.wav
@@ -34,7 +35,7 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@@ -44,11 +45,11 @@ fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
 fi
 
 # Optionally, you can add LM and test it with runtime.
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
index 31e9be13..68e852ba 100644
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -54,8 +54,9 @@ test_manifest: data/manifest.test
 ###########################################
 vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
 unit_type: 'char'
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -74,7 +75,7 @@ subsampling_factor: 1
 num_encs: 1
 
 ###########################################
-#               training                  #
+#               Training                  #
 ###########################################
 n_epoch: 240
 accum_grad: 2
@@ -82,7 +83,7 @@ global_grad_clip: 5.0
 optim: adam
 optim_conf:
   lr: 0.002
-  weight_decay: 1e-6
+  weight_decay: 1.0e-6
 scheduler: warmuplr
 scheduler_conf:
   warmup_steps: 25000
diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
index d9e3daec..0a931e95 100644
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@@ -49,8 +49,9 @@ test_manifest: data/manifest.test
 #              Dataloader                 #
 ###########################################
 vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
 unit_type: 'char'
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -69,7 +70,7 @@ subsampling_factor: 1
 num_encs: 1
 
 ###########################################
-#               training                  #
+#               Training                  #
 ###########################################
 n_epoch: 240
 accum_grad: 2
diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
index e6684ec8..9d294653 100644
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@@ -46,6 +46,7 @@ test_manifest: data/manifest.test
 ###########################################
 unit_type: 'char'
 vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -59,13 +60,13 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
 
 ###########################################
-#               training                  #
+#               Training                  #
 ###########################################
 n_epoch: 240
 accum_grad: 2
@@ -73,7 +74,7 @@ global_grad_clip: 5.0
 optim: adam
 optim_conf:
   lr: 0.002
-  weight_decay: 1e-6
+  weight_decay: 1.0e-6
 scheduler: warmuplr
 scheduler_conf:
   warmup_steps: 25000
diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh
index 95472e10..14d91d68 100755
--- a/examples/aishell/asr1/local/align.sh
+++ b/examples/aishell/asr1/local/align.sh
@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
---decode_config ${decode_config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
 --opts decode.decode_batch_size ${batch_size}
diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh
index cab7c34e..65b884e5 100755
--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
@@ -30,14 +30,14 @@ for type in attention ctc_greedy_search; do
         # stream decoding only supports batchsize=1
         batch_size=1
     else
-        batch_size=1
+        batch_size=64
    fi
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
@@ -57,7 +57,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh
index 661013b1..d029f2fd 100755
--- a/examples/aishell/asr1/local/test_wav.sh
+++ b/examples/aishell/asr1/local/test_wav.sh
@@ -43,7 +43,7 @@ for type in attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml
index 69959c68..19e783a6 100644
--- a/examples/callcenter/asr1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml
@@ -1,120 +1,98 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.05
-  max_output_input_ratio: 10.0
-
-
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 8000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
-
-
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-    causal: true
-    use_dynamic_chunk: true
-    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-    use_dynamic_left_chunk: false
-
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
-
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+spm_model_prefix: ''
+preprocess_config: conf/preprocess.yaml
+batch_size: 32
+raw_wav: True # use raw_wav or kaldi feature
+spectrum_type: fbank #linear, mfcc, fbank
+feat_dim: 80
+delta_delta: False
+dither: 1.0
+target_sample_rate: 8000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 25.0
+use_dB_normalization: True
+target_dB: -20
+random_seed: 0
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2
 
-training:
-  n_epoch: 240
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.001
-    weight_decay: 1e-6
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+############################################
+#            Network Architecture          #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
+  causal: true
+  use_dynamic_chunk: true
+  cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+  use_dynamic_left_chunk: false
 
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: true # simulate streaming inference. Defaults to False.
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 240
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.001
+  weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml
index 80c15abb..f6fcb949 100644
--- a/examples/callcenter/asr1/conf/conformer.yaml
+++ b/examples/callcenter/asr1/conf/conformer.yaml
@@ -1,117 +1,92 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test
-  min_input_len: 0.5
-  max_input_len: 20.0 # second
-  min_output_len: 0.0
-  max_output_len: 400.0
-  min_output_input_ratio: 0.0
-  max_output_input_ratio: .inf
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
 
-collator:
-  vocab_filepath: data/lang_char/vocab.txt
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/preprocess.yaml
-  batch_size: 32
-  raw_wav: True # use raw_wav or kaldi feature
-  spectrum_type: fbank #linear, mfcc, fbank
-  feat_dim: 80
-  delta_delta: False
-  dither: 1.0
-  target_sample_rate: 8000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 25.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+spm_model_prefix: ''
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 64
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
 
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: conformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: True
-    use_cnn_module: True
-    cnn_module_kernel: 15
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
+############################################
+#            Network Architecture          #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: True
+  use_cnn_module: True
+  cnn_module_kernel: 15
+  activation_type: 'swish'
+  pos_enc_layer_type: 'rel_pos'
+  selfattention_layer_type: 'rel_selfattn'
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-
-training:
-  n_epoch: 100 # 50 will be lowest
-  accum_grad: 4
-  global_grad_clip: 5.0
-  optim: adam
-  optim_conf:
-    lr: 0.002
-    weight_decay: 1e-6
-  scheduler: warmuplr
-  scheduler_conf:
-    warmup_steps: 25000
-    lr_decay: 1.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-
-
-
-decoding:
-  batch_size: 128
-  error_rate_type: cer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 100 # 50 is the practical minimum
+accum_grad: 4
+global_grad_clip: 5.0
+optim: adam
+optim_conf:
+  lr: 0.002
+  weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml
index f7f4c58d..877e7d5a 100644
--- a/examples/callcenter/asr1/conf/preprocess.yaml
+++ b/examples/callcenter/asr1/conf/preprocess.yaml
@@ -1,7 +1,7 @@
 process:
   # extract kaldi fbank from PCM
   - type: fbank_kaldi
-    fs: 16000
+    fs: 8000
     n_mels: 80
     n_shift: 160
     win_length: 400
diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 00000000..49a6a114
--- /dev/null
+++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: true # simulate streaming inference. Defaults to False.
\ No newline at end of file
diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml
new file mode 100644
index 00000000..d2e0b72d
--- /dev/null
+++ b/examples/callcenter/asr1/conf/tuning/decode.yaml
@@ -0,0 +1,13 @@
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
+
+
diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh
index 681c77ed..1397ae57 100755
--- a/examples/callcenter/asr1/local/align.sh
+++ b/examples/callcenter/asr1/local/align.sh
@@ -1,7 +1,7 @@
 #! /usr/bin/env bash
 
-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
 
@@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3
 
 ckpt_name=$(basename ${ckpt_prefix})
 
@@ -25,9 +26,10 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
---opts decoding.batch_size ${batch_size}
+--opts decode.decode_batch_size ${batch_size}
 
 if [ $? -ne 0 ]; then
     echo "Failed in ctc alignment!"
diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh
index fc43c5a2..b7ff722a 100755
--- a/examples/callcenter/asr1/local/test.sh
+++ b/examples/callcenter/asr1/local/test.sh
@@ -1,7 +1,7 @@
 #! /usr/bin/env bash
 
-if [ $# != 2 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix"
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
     exit -1
 fi
 
@@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
+decode_config_path=$2
+ckpt_prefix=$3
+
 
 ckpt_name=$(basename ${ckpt_prefix})
 
@@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-   --opts decoding.decoding_method ${type} \
-   --opts decoding.batch_size ${batch_size}
+   --opts decode.decoding_method ${type} \
+   --opts decode.decode_batch_size ${batch_size}
 
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
@@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
-   --opts decoding.decoding_method ${type} \
-   --opts decoding.batch_size ${batch_size}
+   --opts decode.decoding_method ${type} \
+   --opts decode.decode_batch_size ${batch_size}
 
    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
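
With this change, the callcenter test and align scripts take the decode config as a separate second argument. A minimal invocation sketch (the checkpoint path follows from the run.sh defaults below, conf_path=conf/conformer.yaml and avg_num=20; adjust to your own experiment):

    CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
    CUDA_VISIBLE_DEVICES=0 ./local/align.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_20
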
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh
index e9be3d03..86730ce1 100644
--- a/examples/callcenter/asr1/run.sh
+++ b/examples/callcenter/asr1/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/conformer.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@@ -31,12 +32,12 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # ctc alignment of test data
-    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml
index f3574e15..0b0a1550 100644
--- a/examples/librispeech/asr0/conf/deepspeech2.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2.yaml
@@ -1,68 +1,65 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev-clean
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.0
-  max_input_len: 30.0 # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev-clean
+test_manifest: data/manifest.test-clean
+min_input_len: 0.0
+max_input_len: 30.0 # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 20
-  mean_std_filepath: data/mean_std.json
-  unit_type: char
-  vocab_filepath: data/lang_char/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 20.0
-  delta_delta: False
-  dither: 1.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 2
+###########################################
+#              Dataloader                 #
+###########################################
+batch_size: 20
+mean_std_filepath: data/mean_std.json
+unit_type: char
+vocab_filepath: data/lang_char/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear
+feat_dim:
+target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 20.0
+delta_delta: False
+dither: 1.0
+use_dB_normalization: True
+target_dB: -20
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 2
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 3
-  rnn_layer_size: 2048
-  use_gru: False
-  share_rnn_weights: True
-  blank_id: 0
+############################################
+#            Network Architecture          #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 3
+rnn_layer_size: 2048
+use_gru: False
+share_rnn_weights: True
+blank_id: 0
 
-training:
-  n_epoch: 50
-  accum_grad: 1
-  lr: 1e-3
-  lr_decay: 0.83
-  weight_decay: 1e-06
-  global_grad_clip: 5.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-decoding:
-  batch_size: 128
-  error_rate_type: wer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 1.9
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
-  cutoff_top_n: 40
-  num_proc_bsearch: 8
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 50
+accum_grad: 1
+lr: 1e-3
+lr_decay: 0.83
+weight_decay: 1e-06
+global_grad_clip: 5.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
index 0d16bc57..8bd5a672 100644
--- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml
@@ -1,70 +1,67 @@
 # https://yaml.org/type/float.html
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev-clean
-  test_manifest: data/manifest.test-clean
-  min_input_len: 0.0
-  max_input_len: 30.0 # second
-  min_output_len: 0.0
-  max_output_len: .inf
-  min_output_input_ratio: 0.00
-  max_output_input_ratio: .inf
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev-clean
+test_manifest: data/manifest.test-clean
+min_input_len: 0.0
+max_input_len: 30.0 # second
+min_output_len: 0.0
+max_output_len: .inf
+min_output_input_ratio: 0.00
+max_output_input_ratio: .inf
 
-collator:
-  batch_size: 15
-  mean_std_filepath: data/mean_std.json
-  unit_type: char
-  vocab_filepath: data/lang_char/vocab.txt
-  augmentation_config: conf/augmentation.json
-  random_seed: 0
-  spm_model_prefix:
-  spectrum_type: linear
-  target_sample_rate: 16000
-  max_freq: None
-  n_fft: None
-  stride_ms: 10.0
-  window_ms: 20.0
-  delta_delta: False
-  dither: 1.0
-  use_dB_normalization: True
-  target_dB: -20
-  random_seed: 0
-  keep_transcription_text: False
-  sortagrad: True
-  shuffle_method: batch_shuffle
-  num_workers: 0
+###########################################
+#              Dataloader                 #
+###########################################
+batch_size: 15
+mean_std_filepath: data/mean_std.json
+unit_type: char
+vocab_filepath: data/lang_char/vocab.txt
+augmentation_config: conf/augmentation.json
+random_seed: 0
+spm_model_prefix:
+spectrum_type: linear
+feat_dim:
+target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 20.0
+delta_delta: False
+dither: 1.0
+use_dB_normalization: True
+target_dB: -20
+keep_transcription_text: False
+sortagrad: True
+shuffle_method: batch_shuffle
+num_workers: 0
 
-model:
-  num_conv_layers: 2
-  num_rnn_layers: 3
-  rnn_layer_size: 2048
-  rnn_direction: forward
-  num_fc_layers: 2
-  fc_layers_size_list: 512, 256
-  use_gru: False
-  blank_id: 0
+############################################
+#            Network Architecture          #
+############################################
+num_conv_layers: 2
+num_rnn_layers: 3
+rnn_layer_size: 2048
+rnn_direction: forward
+num_fc_layers: 2
+fc_layers_size_list: 512, 256
+use_gru: False
+blank_id: 0
 
-training:
-  n_epoch: 50
-  accum_grad: 4
-  lr: 1e-3
-  lr_decay: 0.83
-  weight_decay: 1e-06
-  global_grad_clip: 5.0
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
-
-decoding:
-  batch_size: 128
-  error_rate_type: wer
-  decoding_method: ctc_beam_search
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 1.9
-  beta: 0.3
-  beam_size: 500
-  cutoff_prob: 1.0
-  cutoff_top_n: 40
-  num_proc_bsearch: 8
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 50
+accum_grad: 4
+lr: 1e-3
+lr_decay: 0.83
+weight_decay: 1e-06
+global_grad_clip: 5.0
+log_interval: 100
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
new file mode 100644
index 00000000..e07026ba
--- /dev/null
+++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 128
+error_rate_type: wer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+alpha: 1.9
+beta: 0.3
+beam_size: 500
+cutoff_prob: 1.0
+cutoff_top_n: 40
+num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml
new file mode 100644
index 00000000..e07026ba
--- /dev/null
+++ b/examples/librispeech/asr0/conf/tuning/decode.yaml
@@ -0,0 +1,10 @@
+decode_batch_size: 128
+error_rate_type: wer
+decoding_method: ctc_beam_search
+lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
+alpha: 1.9
+beta: 0.3
+beam_size: 500
+cutoff_prob: 1.0
+cutoff_top_n: 40
+num_proc_bsearch: 8
\ No newline at end of file
diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh
index a627ef72..ea40046b 100755
--- a/examples/librispeech/asr0/local/test.sh
+++ b/examples/librispeech/asr0/local/test.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type"
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
     exit -1
 fi
 
@@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
 
 # download language model
 bash local/download_lm_en.sh
@@ -21,6 +22,7 @@ fi
 python3 -u ${BIN_DIR}/test.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type}
diff --git a/examples/librispeech/asr0/local/test_wav.sh b/examples/librispeech/asr0/local/test_wav.sh
index e8337da7..25cfc45e 100755
--- a/examples/librispeech/asr0/local/test_wav.sh
+++ b/examples/librispeech/asr0/local/test_wav.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-if [ $# != 4 ];then
-    echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file"
+if [ $# != 5 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
     exit -1
 fi
 
@@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 echo "using $ngpu gpus..."
 
 config_path=$1
-ckpt_prefix=$2
-model_type=$3
-audio_file=$4
+decode_config_path=$2
+ckpt_prefix=$3
+model_type=$4
+audio_file=$5
 
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
@@ -33,6 +34,7 @@ fi
 python3 -u ${BIN_DIR}/test_wav.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
 --model_type ${model_type} \
diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh
index 5d811b65..ca2c2b9d 100755
--- a/examples/librispeech/asr0/run.sh
+++ b/examples/librispeech/asr0/run.sh
@@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 model_type=offline
 audio_file=data/demo_002_en.wav
@@ -33,7 +34,7 @@ fi
 
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@@ -43,5 +44,5 @@ fi
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
-    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
 fi
diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml
index ace61d36..72b9cb7b 100644
--- a/examples/librispeech/asr1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml
@@ -57,7 +57,7 @@ vocab_filepath: data/lang_char/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
 mean_std_filepath: ""
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -70,8 +70,7 @@ batch_count: auto
 batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
-batch_frames_inout: 0
-augmentation_config: conf/preprocess.yaml
+batch_frames_inout: 0
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
@@ -85,10 +84,11 @@ global_grad_clip: 5.0
 optim: adam
 optim_conf:
   lr: 0.001
-  weight_decay: 1e-06
+  weight_decay: 1.0e-06
 scheduler: warmuplr
 scheduler_conf:
   warmup_steps: 25000
+  lr_decay: 1.0
 log_interval: 100
 checkpoint:
   kbest_n: 50
diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml
index d6d84eb1..19ade8ad 100644
--- a/examples/librispeech/asr1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml
@@ -50,7 +50,7 @@ vocab_filepath: data/lang_char/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
 mean_std_filepath: ""
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -64,7 +64,6 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-augmentation_config: conf/preprocess.yaml
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
@@ -79,7 +78,7 @@ global_grad_clip: 5.0
 optim: adam
 optim_conf:
   lr: 0.001
-  weight_decay: 1e-06
+  weight_decay: 1.0e-06
 scheduler: warmuplr
 scheduler_conf:
   warmup_steps: 25000
diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml
index bb028e69..4f7b759b 100644
--- a/examples/librispeech/asr1/conf/conformer.yaml
+++ b/examples/librispeech/asr1/conf/conformer.yaml
@@ -55,7 +55,7 @@ vocab_filepath: data/lang_char/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
 mean_std_filepath: ""
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -69,7 +69,6 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-augmentation_config: conf/preprocess.yaml
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
@@ -84,7 +83,7 @@ global_grad_clip: 3.0
 optim: adam
 optim_conf:
   lr: 0.004
-  weight_decay: 1e-06
+  weight_decay: 1.0e-06
 scheduler: warmuplr
 scheduler_conf:
   warmup_steps: 25000
diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml
index f81234f1..740ce78f 100644
--- a/examples/librispeech/asr1/conf/transformer.yaml
+++ b/examples/librispeech/asr1/conf/transformer.yaml
@@ -49,7 +49,7 @@ vocab_filepath: data/lang_char/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
 mean_std_filepath: ""
-augmentation_config: conf/preprocess.yaml
+preprocess_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@@ -63,7 +63,6 @@ batch_bins: 0
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
-augmentation_config: conf/preprocess.yaml
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
@@ -78,7 +77,7 @@ global_grad_clip: 5.0
 optim: adam
 optim_conf:
   lr: 0.004
-  weight_decay: 1e-06
+  weight_decay: 1.0e-06
 scheduler: warmuplr
 scheduler_conf:
   warmup_steps: 25000
diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh
index 95472e10..14d91d68 100755
--- a/examples/librispeech/asr1/local/align.sh
+++ b/examples/librispeech/asr1/local/align.sh
@@ -21,7 +21,7 @@ mkdir -p ${output_dir}
 python3 -u ${BIN_DIR}/alignment.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
---decode_config ${decode_config_path} \
+--decode_cfg ${decode_config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
 --opts decode.decode_batch_size ${batch_size}
diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh
index ddb6c6b6..51ced18b 100755
--- a/examples/librispeech/asr1/local/test.sh
+++ b/examples/librispeech/asr1/local/test.sh
@@ -53,7 +53,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
@@ -78,7 +78,7 @@ for type in ctc_greedy_search; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
@@ -99,7 +99,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh
index 60eaadbf..e70fc83c 100755
--- a/examples/librispeech/asr1/local/test_wav.sh
+++ b/examples/librispeech/asr1/local/test_wav.sh
@@ -50,7 +50,7 @@ for type in attention_rescoring; do
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
-   --decode_config ${decode_config_path} \
+   --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml
new file mode 100644
index 00000000..384ed197
--- /dev/null
+++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml
@@ -0,0 +1,11 @@
+decode_batch_size: 1
+error_rate_type: wer
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+beam_size: 10
+ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+                        # <0: for decoding, use full chunk.
+                        # >0: for decoding, use fixed chunk size as set.
+                        # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: False # simulate streaming inference. Defaults to False.
diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml
index a16563a5..32d95b41 100644
--- a/examples/librispeech/asr2/conf/transformer.yaml
+++ b/examples/librispeech/asr2/conf/transformer.yaml
@@ -1,73 +1,80 @@
 # https://yaml.org/type/float.html
-# network architecture
-model:
-  cmvn_file:
-  cmvn_file_type: "json"
-  # encoder related
-  encoder: transformer
-  encoder_conf:
-    output_size: 256 # dimension of attention
-    attention_heads: 4
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
-    normalize_before: true
+############################################
+#            Network Architecture          #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: transformer
+encoder_conf:
+  output_size: 256 # dimension of attention
+  attention_heads: 4
+  linear_units: 2048 # the number of units of position-wise feed forward
+  num_blocks: 12 # the number of encoder blocks
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  attention_dropout_rate: 0.0
+  input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+  normalize_before: true
 
-  # decoder related
-  decoder: transformer
-  decoder_conf:
-    attention_heads: 4
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
+# decoder related
+decoder: transformer
+decoder_conf:
+  attention_heads: 4
+  linear_units: 2048
+  num_blocks: 6
+  dropout_rate: 0.1
+  positional_dropout_rate: 0.1
+  self_attention_dropout_rate: 0.0
+  src_attention_dropout_rate: 0.0
 
-  # hybrid CTC/attention
-  model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
+# hybrid CTC/attention
+model_conf:
+  ctc_weight: 0.3
+  lsm_weight: 0.1 # label smoothing option
+  length_normalized_loss: false
 
-data:
-  train_manifest: data/manifest.train
-  dev_manifest: data/manifest.dev
-  test_manifest: data/manifest.test-clean
+###########################################
+#                Data                     #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
 
-collator:
-  vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
-  unit_type: spm
-  spm_model_prefix: data/lang_char/train_960_unigram5000
-  feat_dim: 83
-  stride_ms: 10.0
-  window_ms: 25.0
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
-  batch_size: 30
-  maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
-  maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
-  minibatches: 0 # for debug
-  batch_count: auto
-  batch_bins: 0
-  batch_frames_in: 0
-  batch_frames_out: 0
-  batch_frames_inout: 0
-  augmentation_config: conf/preprocess.yaml
-  num_workers: 0
-  subsampling_factor: 1
-  num_encs: 1
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/train_960_unigram5000_units.txt
+unit_type: spm
+spm_model_prefix: data/lang_char/train_960_unigram5000
+feat_dim: 83
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 30
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+preprocess_config: conf/preprocess.yaml
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
 
-training:
-  n_epoch: 120
-  accum_grad: 2
-  log_interval: 100
-  checkpoint:
-    kbest_n: 50
-    latest_n: 5
+###########################################
+#               Training                  #
+###########################################
+n_epoch: 120
+accum_grad: 2
+log_interval: 1
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
 
 optim: adam
 optim_conf:
@@ -79,23 +86,5 @@ scheduler_conf:
   warmup_steps: 25000
   lr_decay: 1.0
 
-decoding:
-  batch_size: 1
-  error_rate_type: wer
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
-  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
-  alpha: 2.5
-  beta: 0.3
-  beam_size: 10
-  cutoff_prob: 1.0
-  cutoff_top_n: 0
-  num_proc_bsearch: 8
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
-                          # <0: for decoding, use full chunk.
-                          # >0: for decoding, use fixed chunk size as set.
-                          # 0: used for training, it's prohibited here.
-  num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-  simulate_streaming: False # simulate streaming inference. Defaults to False.
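
With the decode options split out of the model config, per-run overrides go through --opts under the decode.* namespace (formerly decoding.*). A sketch of the pattern the asr2 scripts below rely on, with an illustrative checkpoint path and result file:

    python3 -u ${BIN_DIR}/test.py \
    --ngpu 1 \
    --dict-path data/lang_char/train_960_unigram5000_units.txt \
    --config conf/transformer.yaml \
    --decode_cfg conf/decode/decode_base.yaml \
    --checkpoint_path exp/transformer/checkpoints/avg_10 \
    --result-file exp/transformer/decode/data.json \
    --opts decode.decoding_method ctc_prefix_beam_search \
    --opts decode.decode_batch_size 1
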
diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index 626c3574..60a16f42 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -dict_path=$2 -ckpt_prefix=$3 +decode_config_path=$2 +dict_path=$3 +ckpt_prefix=$4 batch_size=1 output_dir=${ckpt_prefix} @@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \ --dict-path ${dict_path} \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index d210f2a8..bf6428d6 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -19,8 +19,9 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe} bpemodel=${bpeprefix}.model config_path=conf/transformer.yaml +decode_config_path=conf/decode/decode_base.yaml dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt ckpt_prefix= source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --checkpoint_path ${ckpt_prefix} \ --result-file ${decode_dir}/data.JOB.json \ - --opts decoding.decoding_method ${dmethd} \ - --opts decoding.batch_size ${batch_size} \ - --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} + --opts decode.decoding_method ${dmethd} \ + --opts decode.decode_batch_size ${batch_size} \ + --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh index 5b7596f2..56671233 100755 --- a/examples/librispeech/asr2/run.sh +++ b/examples/librispeech/asr2/run.sh @@ -9,12 +9,13 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=50 conf_path=conf/transformer.yaml -dict_path=lang_char/train_960_unigram5000_units.txt +decode_conf_path=conf/decode/decode_base.yaml +dict_path=data/lang_char/train_960_unigram5000_units.txt avg_num=10 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; avg_ckpt=avg_${avg_num} ckpt=$(basename ${conf_path} | awk -F'.
'{print $1}') echo "checkpoint name ${ckpt}" @@ -35,7 +37,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # attetion resocre decoder - ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -45,7 +47,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml index c2d69226..c2db2c7c 100644 --- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml @@ -1,67 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 4333 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 4333 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 80 
+accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.6 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml new file mode 100644 index 00000000..b5283a93 --- /dev/null +++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.6 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh index 1ccac1c3..89a63411 100755 --- a/examples/other/1xt2x/aishell/run.sh +++ b/examples/other/1xt2x/aishell/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=2 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml index be51a9b9..0c08fbc6 100644 --- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: .inf # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: .inf # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: 
.inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.4 - beta: 0.35 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml new file mode 100644 index 00000000..f52dde32 --- /dev/null +++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.4 +beta: 0.35 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh index b7f69f6b..82de56b0 100755 --- a/examples/other/1xt2x/baidu_en8k/run.sh +++ b/examples/other/1xt2x/baidu_en8k/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=0 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml index ad7fb2c1..a2a5649b 100644 --- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 1000.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 1000.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False 
+share_rnn_weights: True +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml new file mode 100644 index 00000000..f3b51def --- /dev/null +++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh index 8c667de2..8b614bbb 100755 --- a/examples/other/1xt2x/librispeech/run.sh +++ b/examples/other/1xt2x/librispeech/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=1 @@ -23,5 +24,5 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b4f9cdf9..b404cce8 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser @@ -44,6 +45,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index ad83a41d..4c20ffcd 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -233,11 +233,11 @@ class DeepSpeech2Model(nn.Layer): """ model = cls(feat_size=dataloader.collate_fn.feature_size, dict_size=len(dataloader.collate_fn.vocab_list), - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -250,7 +250,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 82e190d8..53a4e629 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -64,7 +64,7 @@ class DeepSpeech2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -98,7 +98,7 @@ class DeepSpeech2Trainer(Trainer): iteration_time = time.time() - start msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -126,7 +126,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -146,15 +146,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - #config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) + config.feat_size = self.train_loader.collate_fn.feature_size + #config.dict_size = self.train_loader.collate_fn.vocab_size + config.dict_size = 
len(self.train_loader.collate_fn.vocab_list) config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -163,17 +163,13 @@ class DeepSpeech2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.model = model @@ -184,59 +180,59 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" collate_fn_test = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test) @@ -274,7 +270,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def __init__(self, config, 
args): self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.unit_type, vocab=None) super().__init__(config, args) def ordid2token(self, texts, texts_len): @@ -293,7 +289,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer @@ -399,31 +395,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.export() except KeyboardInterrupt: exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 36f287b1..5fe04619 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -1,109 +1,96 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train.tiny +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +# augmentation_config: conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 
+keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 5 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index 78887d3c..128561f7 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -1,112 +1,100 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +# augmentation_config: conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise 
feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml new file mode 100644 index 00000000..ed081cf4 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index a9b18dd9..5c782e5b 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -1,7 +1,7 @@ #!
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" @@ -17,10 +18,11 @@ for type in fullsentence; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh index b85ba95a..1746c025 100755 --- a/examples/ted_en_zh/st0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index 609c5824..bea8d9ab 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -1,110 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train.tiny +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 5.0 # frame +max_input_len: 3000.0 # frame +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +# augmentation_config: 
conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 83 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 20 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 5 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index 10eccd1e..31f7245d 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -1,110 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 5.0 # frame +max_input_len: 3000.0 # frame +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# augmentation_config: conf/augmentation.json +batch_size: 10 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 83 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed 
forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 20 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 5 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d6104dbc --- /dev/null +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index a9b18dd9..5c782e5b 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -1,7 +1,7 @@ #!
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" @@ -17,10 +18,11 @@ for type in fullsentence; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f6362a8b..67309919 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data @@ -38,5 +39,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi \ No newline at end of file diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1c6059e4..4731395f 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 # second - max_input_len: 10.0 # second - min_output_len: 0.0 # tokens - max_output_len: 150.0 # tokens - min_output_input_ratio: 0.005 - max_output_input_ratio: 1000.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: "word" - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: "word" +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 128 # dimension of attention - attention_heads: 4 - linear_units: 1024 # the number of units of position-wise feed forward - num_blocks: 6 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 128 # dimension of attention + attention_heads: 4 + linear_units: 1024 # the number of units of position-wise feed forward + num_blocks: 6 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.5 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.5 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 50 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 1200 - lr_decay: 1.0 - log_interval: 10 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1200 + lr_decay: 1.0 +log_interval: 10 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/timit/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 08ee0e36..88192c58 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -7,8 +7,8 @@ stop_stage=50 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index a95b5f3a..0d84be9f 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=10 TIMIT_path=/path/to/TIMIT @@ -34,15 +35,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 7d841d47..a16a79d3 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - 
window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 0.8 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 0.8 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 393b6439..5458cfb3 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -1,72 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt 
+augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 4 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 4 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 1.0 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 1.0 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh index f39fb3fa..25f04624 100755 --- a/examples/tiny/asr0/run.sh +++ b/examples/tiny/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index ad27478d..cd072c14 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -1,120 +1,98 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: True - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" 
+cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: True + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
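Because decode options now sit under a `decode` node, the shell scripts in this patch override them as `--opts decode.decoding_method ${type} --opts decode.decode_batch_size ${batch_size}` instead of `decoding.*`. A sketch of what those flags reduce to on the Python side; note that yacs `merge_from_list` literal-evals string values, so "1" arrives as an int:

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.decode = CfgNode(new_allowed=True)
    config.decode.decoding_method = "attention"
    config.decode.decode_batch_size = 64

    # Equivalent of: --opts decode.decoding_method ctc_greedy_search \
    #                --opts decode.decode_batch_size 1
    config.merge_from_list([
        "decode.decoding_method", "ctc_greedy_search",
        "decode.decode_batch_size", "1",
    ])
    assert config.decode.decode_batch_size == 1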
- + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 298518fb..2570bb85 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -1,113 +1,91 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder 
input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 - +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
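The Python half of the patch follows the same flattening: every `config.collator.*` / `config.training.*` read becomes a top-level read, and code paths that can run without a decode file fall back to a default, as in the DataLoader hunk of `paddlespeech/s2t/exps/deepspeech2/model.py` further down. A sketch (the YAML path is illustrative):

    from yacs.config import CfgNode

    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/chunk_transformer.yaml")  # flat layout

    batch_size = config.batch_size   # was: config.collator.batch_size
    accum_grad = config.accum_grad   # was: config.training.accum_grad

    # CfgNode subclasses dict, so the patch's fallback works even when
    # no decode config was attached:
    decode_batch_size = config.get('decode', dict()).get('decode_batch_size', 1)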
+########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index 085581f2..eb8f0ab9 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,46 +1,4 @@ # https://yaml.org/type/float.html -########################################### -# Data # -########################################### -train_manifest: data/manifest.tiny -dev_manifest: data/manifest.tiny -test_manifest: data/manifest.tiny -min_input_len: 0.5 # second -max_input_len: 20.0 # second -min_output_len: 0.0 # tokens -max_output_len: 400.0 # tokens -min_output_input_ratio: 0.05 -max_output_input_ratio: 10.0 - - -########################################### -# Dataloader # -########################################### -mean_std_filepath: "" -vocab_filepath: data/lang_char/vocab.txt -unit_type: 'spm' -spm_model_prefix: 'data/lang_char/bpe_unigram_200' -augmentation_config: conf/preprocess.yaml -batch_size: 4 -raw_wav: True # use raw_wav or kaldi feature -spectrum_type: fbank #linear, mfcc, fbank -feat_dim: 80 -delta_delta: False -dither: 1.0 -target_sample_rate: 16000 -max_freq: None -n_fft: None -stride_ms: 10.0 -window_ms: 25.0 -use_dB_normalization: True -target_dB: -20 -random_seed: 0 -keep_transcription_text: False -sortagrad: True -shuffle_method: batch_shuffle -num_workers: 2 - - ############################################ # Network Architecture # ############################################ @@ -83,7 +41,41 @@ model_conf: ########################################### -# training # +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + + +########################################### +# Training # ########################################### n_epoch: 5 accum_grad: 4 @@ -91,7 +83,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.002 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index 95c7df50..4e3068d1 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,44 +1,4 @@ # https://yaml.org/type/float.html -########################################### -# Data # 
-########################################### -train_manifest: data/manifest.tiny -dev_manifest: data/manifest.tiny -test_manifest: data/manifest.tiny -min_input_len: 0.5 # second -max_input_len: 20.0 # second -min_output_len: 0.0 # tokens -max_output_len: 400.0 # tokens -min_output_input_ratio: 0.05 -max_output_input_ratio: 10.0 - -########################################### -# Dataloader # -########################################### -mean_std_filepath: data/mean_std.json -vocab_filepath: data/lang_char/vocab.txt -unit_type: 'spm' -spm_model_prefix: 'data/lang_char/bpe_unigram_200' -augmentation_config: conf/preprocess.yaml -batch_size: 4 -raw_wav: True # use raw_wav or kaldi feature -spectrum_type: fbank #linear, mfcc, fbank -feat_dim: 80 -delta_delta: False -dither: 1.0 -target_sample_rate: 16000 -max_freq: None -n_fft: None -stride_ms: 10.0 -window_ms: 25.0 -use_dB_normalization: True -target_dB: -20 -random_seed: 0 -keep_transcription_text: False -sortagrad: True -shuffle_method: batch_shuffle -num_workers: 2 - ############################################ # Network Architecture # ############################################ @@ -74,9 +34,41 @@ model_conf: lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + ########################################### -# training # +# Training # ########################################### n_epoch: 5 accum_grad: 1 @@ -84,7 +76,7 @@ global_grad_clip: 5.0 optim: adam optim_conf: lr: 0.002 - weight_decay: 1e-06 + weight_decay: 1.0e-06 scheduler: warmuplr scheduler_conf: warmup_steps: 25000 diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..c5b641da --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..a0984f9e --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 190bacff..79df969b 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
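Taken together, the updated entry points accept a training config, a decode config, and `--opts` overrides, merged in that order. A condensed, hypothetical `main` wiring that follows the merge order added in this patch; the real argument parser lives in `paddlespeech.s2t.training.cli`, and the flattening of repeated `--opts` groups here is an assumption:

    import argparse
    from yacs.config import CfgNode

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="training/model yaml")
    parser.add_argument("--decode_cfg", type=str, help="decode yaml")
    parser.add_argument("--opts", nargs="+", action="append", default=[])
    args = parser.parse_args()

    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        # Flatten repeated `--opts key value` groups before merging.
        config.merge_from_list([x for group in args.opts for x in group])
    config.freeze()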
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh index ec9c5a56..1651c034 100755 --- a/examples/tiny/asr1/run.sh +++ b/examples/tiny/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index a438236d..6c2bbca4 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,111 +1,92 @@ -# network architecture -model: - # encoder related - encoder: conformer - encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - cnn_module_norm: layer_norm - activation_type: swish - pos_enc_layer_type: rel_pos - selfattention_layer_type: rel_selfattn +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: 
data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.1 # second - max_input_len: 12.0 # second - min_output_len: 1.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 240 - accum_grad: 16 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 5000 - lr_decay: 1.0 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 16 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..6924bfa6 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index da159de7..65b884e5 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 5c779474..47464262 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -43,10 +44,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index d77f409f..9995bc63 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml - +decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -36,12 +36,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -51,5 +51,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 7ccb3a6c..88148323 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -80,13 +80,13 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -105,14 +105,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + 
decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -179,12 +179,16 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index 5c6eee3f..dea6d975 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -62,14 +62,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -114,12 +114,16 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index f52615fa..7ce921d6 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ 
-12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser @@ -44,6 +46,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index e073ebbf..7a1801d4 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser @@ -49,6 +51,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index cf2ca0d6..28756b05 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( @@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub(): audio = paddle.unsqueeze(audio, axis=0) vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, cfg.decoding) + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.input_dim = self.collate_fn_test.feature_size - config.model.output_dim = self.collate_fn_test.vocab_size + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = 
DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") @@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub(): self.checkpoint_dir = checkpoint_dir self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) def resume(self): """Resume from the checkpoint at checkpoints in the output @@ -190,6 +191,10 @@ if __name__ == "__main__": config = get_cfg_defaults(args.model_type) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py index 58dc05ff..d8eab50e 100644 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ b/paddlespeech/s2t/exps/deepspeech2/config.py @@ -23,17 +23,6 @@ from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline def get_cfg_defaults(model_type='offline'): _C = CfgNode() - _C.data = ManifestDataset.params() - _C.collator = SpeechCollator.params() - _C.training = DeepSpeech2Trainer.params() - _C.decoding = DeepSpeech2Tester.params() - if model_type == 'offline': - _C.model = DeepSpeech2Model.params() - else: - _C.model = DeepSpeech2ModelOnline.params() - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern config = _C.clone() config.set_new_allowed(True) return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index a0b69d64..fc214a8a 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -69,8 +69,8 @@ class DeepSpeech2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - batch_size = self.config.collator.batch_size - accum_grad = self.config.training.accum_grad + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad start = time.time() @@ -133,7 +133,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -154,16 +154,16 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.input_dim = self.train_loader.collate_fn.feature_size - config.model.output_dim = self.train_loader.collate_fn.vocab_size + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.input_dim = self.test_loader.collate_fn.feature_size - config.model.output_dim = self.test_loader.collate_fn.vocab_size + config.input_dim = self.test_loader.collate_fn.feature_size + config.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = 
DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -177,17 +177,13 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.optimizer = optimizer self.lr_scheduler = lr_scheduler @@ -198,66 +194,67 @@ class DeepSpeech2Trainer(Trainer): config.defrost() if self.train: # train - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) - config.collator.keep_transcription_text = False + config.keep_transcription_text = False collate_fn_train = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) # dev - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = False + config.augmentation_config = "" + config.keep_transcription_text = False collate_fn_dev = SpeechCollator.from_config(config) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size), + batch_size=int(config.batch_size), shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup train/valid Dataloader!") else: # test - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True + config.augmentation_config = "" + config.keep_transcription_text = True collate_fn_test = SpeechCollator.from_config(config) - + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, 
collate_fn=collate_fn_test, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup test Dataloader!") @@ -286,7 +283,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ @@ -304,17 +301,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, decode_cfg) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -327,29 +324,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("Current error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type) + error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): result_transcripts = self.model.decode( audio, audio_len, vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decoding_method=decode_cfg.decoding_method, + lang_model_path=decode_cfg.lang_model_path, + beam_alpha=decode_cfg.alpha, + beam_beta=decode_cfg.beta, + beam_size=decode_cfg.beam_size, + cutoff_prob=decode_cfg.cutoff_prob, + cutoff_top_n=decode_cfg.cutoff_top_n, + num_processes=decode_cfg.num_proc_bsearch) return result_transcripts @@ -358,7 +357,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -412,11 +410,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, + batch_size=self.config.decode.decode_batch_size, model_name="deepspeech2", model_precision="fp32").getlog() self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, 
num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -441,7 +438,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: self.autolog.report() - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): if self.args.model_type == "online": output_probs, output_lens = self.static_forward_online(audio, audio_len) @@ -454,13 +452,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, - vocab_list, cfg.decoding_method) + self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, + decode_cfg.lang_model_path, vocab_list, + decode_cfg.decoding_method) result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) + output_probs, output_lens, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the <space> with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) for sentence in result_transcripts ] @@ -531,12 +531,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): num_chunk = int(num_chunk) chunk_state_h_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) chunk_state_c_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) input_names = self.predictor.get_input_names() diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index 229f696d..5d768536 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -43,9 +43,9 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) - if args.decode_config: + if args.decode_cfg: decode_confs = CfgNode(new_allowed=True) - decode_confs.merge_from_file(args.decode_config) + decode_confs.merge_from_file(args.decode_cfg) config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 419594bf..d93954fe 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -47,9 +47,9 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) - if args.decode_config: + if args.decode_cfg: decode_confs = CfgNode(new_allowed=True) - decode_confs.merge_from_file(args.decode_config) + decode_confs.merge_from_file(args.decode_cfg) config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 766e4173..554d6ca5 100644 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -38,7 +38,7 @@ class U2Infer(): self.config = config self.audio_file = args.audio_file - self.preprocess_conf = config.augmentation_config + self.preprocess_conf =
config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) @@ -132,9 +132,9 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) - if args.decode_config: + if args.decode_cfg: decode_confs = CfgNode(new_allowed=True) - decode_confs.merge_from_file(args.decode_config) + decode_confs.merge_from_file(args.decode_cfg) config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py index 2b4f6fb2..44780d2e 100644 --- a/paddlespeech/s2t/exps/u2/config.py +++ b/paddlespeech/s2t/exps/u2/config.py @@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2 import U2Model _C = CfgNode(new_allowed=True) -ManifestDataset.params(_C) +# ManifestDataset.params(_C) -SpeechCollator.params(_C) +# SpeechCollator.params(_C) -U2Model.params(_C) +# U2Model.params(_C) -U2Trainer.params(_C) +# U2Trainer.params(_C) -_C.decode = U2Tester.params() +# _C.decode = U2Tester.params() def get_cfg_defaults(): diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 31610e15..f1683d70 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -264,7 +264,7 @@ class U2Trainer(Trainer): batch_frames_in=config.batch_frames_in, batch_frames_out=config.batch_frames_out, batch_frames_inout=config.batch_frames_inout, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) @@ -283,18 +283,20 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) logger.info("Setup train/valid Dataloader!") else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decode.decode_batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -304,7 +306,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -313,7 +315,7 @@ class U2Trainer(Trainer): json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decode.decode_batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -323,7 +325,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -557,7 +559,7 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 22a0a3c5..57d87316 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -44,77 +44,77 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() 
config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') + config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) @@ -122,7 +122,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -136,7 +136,7 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -156,7 +156,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -182,7 +182,7 @@ class U2Trainer(Trainer): def setup_updater(self): output_dir = self.output_dir - config = self.config.training + config = self.config updater = U2Updater( model=self.model, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py index 67bed349..422483b9 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -69,6 +69,10 @@ if __name__ == "__main__": config = CfgNode() config.set_new_allowed(True) config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 9b8274ad..887dd29e 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -80,7 +80,7 @@ class U2Trainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -122,7 +122,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, 
".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -157,7 +157,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -186,7 +186,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -235,10 +235,10 @@ class U2Trainer(Trainer): config = self.config.clone() # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -248,16 +248,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -268,16 +268,18 @@ class U2Trainer(Trainer): batch_frames_out=0, batch_frames_inout=0, preprocess_conf=None, - n_iter_processes=config.collator.num_workers, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -293,10 +295,10 @@ class U2Trainer(Trainer): num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -316,7 +318,7 @@ class U2Trainer(Trainer): config = self.config # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size @@ -392,9 +394,9 @@ class U2Tester(U2Trainer): def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = 
self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -413,10 +415,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -424,12 +426,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time for i, (utt, target, result, rec_tids) in enumerate( @@ -449,15 +451,16 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_cfg.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -468,7 +471,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -519,15 +522,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -539,7 +542,7 @@ class U2Tester(U2Trainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 93c2fee0..3ad5fc7d 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -14,12 +14,14 @@ """Evaluation for U2 model.""" import cProfile +from yacs.config import CfgNode + from 
paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,7 +37,7 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() @@ -45,6 +47,10 @@ if __name__ == "__main__": config = get_cfg_defaults() if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py index a48f9106..a314a1ca 100644 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ b/paddlespeech/s2t/exps/u2_st/config.py @@ -21,15 +21,15 @@ from paddlespeech.s2t.models.u2_st import U2STModel _C = CfgNode() -_C.data = ManifestDataset.params() +# _C.data = ManifestDataset.params() -_C.collator = SpeechCollator.params() +# _C.collator = SpeechCollator.params() -_C.model = U2STModel.params() +# _C.model = U2STModel.params() -_C.training = U2STTrainer.params() +# _C.training = U2STTrainer.params() -_C.decoding = U2STTester.params() +# _C.decoding = U2STTester.params() def get_cfg_defaults(): diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index a3b39df7..00f11599 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -78,7 +78,7 @@ class U2STTrainer(Trainer): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward utt, audio, audio_len, text, text_len = batch_data @@ -127,7 +127,7 @@ class U2STTrainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -168,7 +168,7 @@ class U2STTrainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_st_loss'] = total_loss / num_seen_utts @@ -197,7 +197,7 @@ class U2STTrainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -245,91 +245,93 @@ class U2STTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - 
config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - if config.model.model_conf.asr_weight > 0.: + if config.model_conf.asr_weight > 0.: Collator = TripletSpeechCollator TestCollator = SpeechCollator else: TestCollator = Collator = SpeechCollator collate_fn_train = Collator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = Collator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
- # config.data.min_input_len = 0.0 # second - # config.data.max_input_len = float('inf') # second - # config.data.min_output_len = 0.0 # tokens - # config.data.max_output_len = float('inf') # tokens - # config.data.min_output_input_ratio = 0.00 - # config.data.max_output_input_ratio = float('inf') + # config.min_input_len = 0.0 # second + # config.max_input_len = float('inf') # second + # config.min_output_len = 0.0 # tokens + # config.max_output_len = float('inf') # tokens + # config.min_output_input_ratio = 0.00 + # config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) logger.info("Setup train/valid/test/align Dataloader!") def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -342,7 +344,7 @@ class U2STTrainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -428,7 +430,7 @@ class U2STTester(U2STTrainer): def translate(self, audio, audio_len): """"E2E translation from extracted audio feature""" - cfg = self.config.decoding + decode_cfg = self.config.decode text_feature = self.test_loader.collate_fn.text_feature self.model.eval() @@ -436,12 +438,12 @@ class U2STTester(U2STTrainer): audio, audio_len, text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) return hyps def compute_translation_metrics(self, @@ -452,7 +454,7 @@ class U2STTester(U2STTrainer): texts_len, bleu_func, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode len_refs, num_ins = 0, 0 start_time = time.time() @@ -467,12 +469,12 @@ class U2STTester(U2STTrainer): audio, audio_len, text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - 
num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time for utt, target, result in zip(utts, refs, hyps): @@ -502,8 +504,8 @@ class U2STTester(U2STTrainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - cfg = self.config.decoding - bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + decode_cfg = self.config.decode + bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu stride_ms = self.test_loader.collate_fn.stride_ms hyps, refs = [], [] @@ -549,15 +551,15 @@ class U2STTester(U2STTrainer): "num_examples": num_ins, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -569,7 +571,7 @@ class U2STTester(U2STTrainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.collate_fn.feature_size input_spec = [ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 5f233549..27bf20eb 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -256,45 +256,43 @@ class SpeechCollator(SpeechCollatorBase): Returns: SpeechCollator: collator object. 
""" - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'spectrum_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') + config.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.collator.augmentation_config + aug_file = config.augmentation_config assert isinstance(aug_file, io.StringIO) speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - spectrum_type=config.collator.spectrum_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) return speech_collator diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index d64d7d3e..c76ccfce 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -54,17 +54,17 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. 
""" - assert 'manifest' in config.data - assert config.data.manifest + assert 'manifest' in config + assert config.manifest dataset = cls( - manifest_path=config.data.manifest, - max_input_len=config.data.max_input_len, - min_input_len=config.data.min_input_len, - max_output_len=config.data.max_output_len, - min_output_len=config.data.min_output_len, - max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, ) + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) return dataset def __init__(self, diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0dfaec29..0414d04f 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -221,12 +221,12 @@ class DeepSpeech2Model(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -240,7 +240,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index 85876bce..f08e30d0 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -353,14 +353,14 @@ class DeepSpeech2ModelOnline(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -374,7 +374,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2ModelOnline diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index d4299ea3..bb85732a 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -101,7 +101,7 @@ def 
default_argument_parser(parser=None): title='Test Options', description=None) test_group.add_argument( - "--decode_config", + "--decode_cfg", metavar="DECODE_CONFIG_FILE", help="decode config file.") diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index fcd0c235..c9d640ed 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) bs_item=(16) config_path=conf/benchmark/conformer.yaml +decode_config_path=conf/tuning/decode.yaml seed=0 output=exp/conformer profiler_options=None @@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" run_mode=mp ngpu=8 - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 sleep 60 log_name=speech_${model_item}_bs${bs_item}_${fp_item} # e.g. clas_MobileNetv1_mp_bs32_fp32_8 echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp ngpu=1 - CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) sleep 60 done done diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 5b83b15c..d5902c51 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -5,13 +5,14 @@ function _set_params(){ run_mode=${1:-"sp"} # sp for single-GPU | mp for multi-GPU config_path=${2:-"conf/conformer.yaml"} - output=${3:-"exp/conformer"} - seed=${4:-"0"} - ngpu=${5:-"1"} - profiler_options=${6:-"None"} - batch_size=${7:-"32"} - fp_item=${8:-"fp32"} - model_item=${9:-"conformer"} + decode_config_path=${3:-"conf/tuning/decode.yaml"} + output=${4:-"exp/conformer"} + seed=${5:-"0"} + ngpu=${6:-"1"} + profiler_options=${7:-"None"} + batch_size=${8:-"32"} + fp_item=${9:-"fp32"} + model_item=${10:-"conformer"} benchmark_max_step=0 run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR will be set by QA later # parameters required for log parsing @@ -35,6 +36,7 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" train_cmd="--config=${config_path} \ + --decode_cfg=${decode_config_path} \ --output=${output} \ --seed=${seed} \ --ngpu=${ngpu} \ @@ -68,7 +70,7 @@ function _train(){ } source ${BENCHMARK_ROOT}/scripts/run_model.sh # this script runs analysis.py to parse performance data from benchmark-compliant logs; for joint debugging it can be downloaded from the benchmark repo at https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; comment this line out if you only want to produce training logs without joint debugging, and re-enable it before submitting -_set_params $@ -# _train # uncomment if you only want to produce training logs without parsing +#_set_params $@ +#_train # uncomment if you only want to produce training logs without parsing _run # this function is defined in run_model.sh and calls _train when executed; comment it out if you only want to produce training logs without joint debugging, and re-enable it before submitting diff --git
a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt index b11872bd..cad8efa3 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index 875e3ccf..5c619506 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh index 76b22a38..1dce1b29 100644 --- a/tests/chains/ds2/lite_train_infer.sh +++ b/tests/chains/ds2/lite_train_infer.sh @@ -1,5 +1,5 @@ bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer -cd ../../examples/tiny/s0 +cd ../../../examples/tiny/asr0 source path.sh -bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer +bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer cd ../../../tests/chains diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh index 73a30283..4913ce42 100644 --- a/tests/chains/ds2/prepare.sh +++ b/tests/chains/ds2/prepare.sh @@ -34,7 +34,7 @@ MODE=$2 if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/tiny/s0 + cd ${curPath}/../../../examples/tiny/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then elif [ ${MODE} = "whole_train_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd 
${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then cd ${curPath} elif [ ${MODE} = "whole_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then cd ${curPath} else curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index c9307820..26917c67 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -324,6 +324,7 @@ else gsu=${gpu//,/ } nump=`echo $gsu | wc -w` cmd="${python} ${run_train} --ngpu=$nump" + export CUDA_VISIBLE_DEVICES=${gpu} else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi
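Taken together, this patch flattens the nested data/collator/model/training sections into top-level config keys and moves all decode options into a standalone file attached as config.decode via the new --decode_cfg flag. A minimal sketch of the resulting load-and-read flow, assuming yacs is installed; the YAML paths are illustrative, mirroring the merge pattern used by the entry points above rather than quoting any single file:

from yacs.config import CfgNode

# Flat training config: data, collator, model and training keys now live at top level.
config = CfgNode(new_allowed=True)
config.merge_from_file('conf/deepspeech2.yaml')  # illustrative path

# Decode options come from their own file and are attached under config.decode.
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file('conf/tuning/decode.yaml')  # illustrative path
config.decode = decode_confs
config.freeze()

# Consumers read flat keys directly (assuming the YAML defines them) ...
batch_size = config.batch_size
# ... and, as the dataloader setup above does, fall back to a default
# when no decode config was supplied:
decode_batch_size = config.get('decode', dict()).get('decode_batch_size', 1)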