diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index bdfa4219..1dc8581e 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -1,68 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 0 - ctc_grad_norm_type: instance +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 0 +ctc_grad_norm_type: instance -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 1.9 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 2f63f4de..c49973a2 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: 
data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear #linear, mfcc, fbank - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear #linear, mfcc, fbank +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 5 - rnn_layer_size: 1024 - rnn_direction: forward # [forward, bidirect] - num_fc_layers: 0 - fc_layers_size_list: -1, - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 5 +rnn_layer_size: 1024 +rnn_direction: forward # [forward, bidirect] +num_fc_layers: 0 +fc_layers_size_list: -1, +use_gru: False +blank_id: 0 -training: - n_epoch: 65 - accum_grad: 1 - lr: 5e-4 - lr_decay: 0.93 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 65 +accum_grad: 1 +lr: 5e-4 +lr_decay: 0.93 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.2 #1.9 - beta: 4.3 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..9de06711 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +chunk_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.2 #1.9 +beta: 4.3 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml new 
file mode 100644 index 00000000..5778e656 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 1.9 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh index 4f5e5c8b..7a4b87f8 100755 --- a/examples/aishell/asr0/local/test_export.sh +++ b/examples/aishell/asr0/local/test_export.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -jit_model_export_path=$2 -model_type=$3 +decode_config_path=$2 +jit_model_export_path=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh > /dev/null 2>&1 @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test_export.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${jit_model_export_path}.rsl \ --export_path ${jit_model_export_path} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_wav.sh b/examples/aishell/asr0/local/test_wav.sh index 4a6d92fb..62b005a6 100755 --- a/examples/aishell/asr0/local/test_wav.sh +++ b/examples/aishell/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 270b88fc..15685f21 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline # offline or online audio_file=data/demo_01_03.wav @@ -34,7 +35,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -44,11 +45,11 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 50eaef98..68e852ba 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -1,103 +1,95 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # 
the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +########################################### +# Data # +########################################### - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. 
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 9de28c12..775a4527 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -1,97 +1,89 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt 
+spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 8 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 7c5fa624..9d294653 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -1,95 +1,85 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - - -collator: - unit_type: 'char' - vocab_filepath: data/lang_char/vocab.txt - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam 
- optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Dataloader # +########################################### +unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..72ede927 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/conf/tuning/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..72ede927 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. 
Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index da159de7..65b884e5 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh index f85c1a47..d029f2fd 100755 --- a/examples/aishell/asr1/local/test_wav.sh +++ b/examples/aishell/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -42,10 +43,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index d07a4ed5..c54dae9c 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 audio_file=data/demo_01_03.wav @@ -32,18 +33,18 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi # Not supported at now!!! 
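Note: with the decode options split out of the training YAML into conf/tuning/decode.yaml, the test, align, and test_wav scripts above now take that decode config as an extra argument, and run.sh passes it through. A minimal sketch of invoking the updated aishell/asr1 test stage by hand; the checkpoint directory and the averaged-checkpoint name avg_20 are illustrative, not taken from this diff:

    # equivalent to stage 3 of examples/aishell/asr1/run.sh, run manually
    CUDA_VISIBLE_DEVICES=0 ./local/test.sh \
        conf/conformer.yaml \
        conf/tuning/decode.yaml \
        exp/conformer/checkpoints/avg_20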
diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml index 69959c68..19e783a6 100644 --- a/examples/callcenter/asr1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -1,120 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +batch_size: 32 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 8000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +############################################ +# Network Architecture # 
+############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml index 80c15abb..f6fcb949 100644 --- a/examples/callcenter/asr1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -1,117 +1,92 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.0 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: 
False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 100 # 50 will be lowest - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. 
- decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 100 # 50 will be lowest +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml index f7f4c58d..877e7d5a 100644 --- a/examples/callcenter/asr1/conf/preprocess.yaml +++ b/examples/callcenter/asr1/conf/preprocess.yaml @@ -1,7 +1,7 @@ process: # extract kaldi fbank from PCM - type: fbank_kaldi - fs: 16000 + fs: 8000 n_mels: 80 n_shift: 160 win_length: 400 diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..49a6a114 --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d2e0b72d --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/decode.yaml @@ -0,0 +1,13 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index 681c77ed..1397ae57 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 ckpt_name=$(basename ${ckpt_prefxi}) @@ -25,9 +26,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh index fc43c5a2..b7ff722a 100755 --- a/examples/callcenter/asr1/local/test.sh +++ b/examples/callcenter/asr1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 + ckpt_name=$(basename ${ckpt_prefxi}) @@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
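Note: the --opts override namespace also changes from decoding.* to decode.*, and the batch-size key becomes decode_batch_size, so any hand-rolled call to the test entry point has to follow the same renaming. A sketch under those assumptions, mirroring what local/test.sh does; BIN_DIR and the checkpoint path are illustrative:

    # attention decoding with an explicit batch-size override
    python3 -u ${BIN_DIR}/test.py \
        --ngpu 1 \
        --config conf/conformer.yaml \
        --decode_cfg conf/tuning/decode.yaml \
        --result_file exp/conformer/checkpoints/avg_20.attention.rsl \
        --checkpoint_path exp/conformer/checkpoints/avg_20 \
        --opts decode.decoding_method attention \
        --opts decode.decode_batch_size 64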
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh index e9be3d03..0c7ffc1e 100644 --- a/examples/callcenter/asr1/run.sh +++ b/examples/callcenter/asr1/run.sh @@ -4,8 +4,9 @@ source path.sh gpus=0,1,2,3 stage=0 -stop_stage=100 +stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,15 +32,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index f3574e15..0b0a1550 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -1,68 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 20 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 20 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 
+############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 1 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +lr: 1e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index 0d16bc57..8bd5a672 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 15 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 15 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: False +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 
4 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 4 +lr: 1e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..e07026ba --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..e07026ba --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/librispeech/asr0/local/test_wav.sh b/examples/librispeech/asr0/local/test_wav.sh index e8337da7..25cfc45e 100755 --- a/examples/librispeech/asr0/local/test_wav.sh +++ b/examples/librispeech/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 5d811b65..ca2c2b9d 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 model_type=offline audio_file=data/demo_002_en.wav @@ -33,7 +34,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -43,5 +44,5 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml index 662d559c..72b9cb7b 100644 --- a/examples/librispeech/asr1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -1,103 +1,99 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' 
# using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 8 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 120 - accum_grad: 8 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 
0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml index bc77ba41..19ade8ad 100644 --- a/examples/librispeech/asr1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -1,103 +1,89 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - 
minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 \ No newline at end of file diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index 5a570897..4f7b759b 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -1,104 +1,96 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' 
epochs - batch_size: 16 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 70 - accum_grad: 8 - global_grad_clip: 3.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 70 +accum_grad: 8 +global_grad_clip: 3.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml index b7f33e22..740ce78f 100644 --- a/examples/librispeech/asr1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -1,110 +1,88 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 32 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is 
automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 120 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..0760e721 --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. 
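The new conf/tuning/*.yaml files above carry only decode-time options. At run time they are merged into the (now flattened) main config as a `decode` sub-node, which is why the test scripts below take a separate decode_config_path argument and address overrides as decode.*. A minimal sketch of that loading pattern, mirroring the CfgNode code in the src_deepspeech2x/bin/test.py hunk near the end of this patch; the file paths here are illustrative, not taken from the patch:

from yacs.config import CfgNode

# Load the flattened main config, then attach the decode-only config
# as a `decode` sub-node (file paths here are illustrative).
config = CfgNode(new_allowed=True)
config.merge_from_file("conf/transformer.yaml")

decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file("conf/tuning/decode.yaml")
config.decode = decode_confs
config.freeze()

# Decode-time options are then read from the sub-node:
print(config.decode.decoding_method)  # e.g. attention
print(config.decode.beam_size)        # e.g. 10
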
diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index aa06132e..51ced18b 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -15,8 +15,8 @@ recog_set="test-clean" stage=0 stop_stage=100 -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -76,10 +78,11 @@ for type in ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
@@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh index ab6d685d..e70fc83c 100755 --- a/examples/librispeech/asr1/local/test_wav.sh +++ b/examples/librispeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -49,10 +50,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index f839e5af..116dae12 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -8,6 +8,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 audio_file=data/demo_002_en.wav @@ -34,17 +35,17 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml new file mode 100644 index 00000000..384ed197 --- /dev/null +++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 1 +error_rate_type: wer +decoding_method: attention # 
'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml index a16563a5..32d95b41 100644 --- a/examples/librispeech/asr2/conf/transformer.yaml +++ b/examples/librispeech/asr2/conf/transformer.yaml @@ -1,73 +1,80 @@ # https://yaml.org/type/float.html -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: 
auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/train_960_unigram5000_units.txt +unit_type: spm +spm_model_prefix: data/lang_char/train_960_unigram5000 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 30 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 2 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +log_interval: 1 +checkpoint: + kbest_n: 50 + latest_n: 5 optim: adam optim_conf: @@ -79,23 +86,5 @@ scheduler_conf: warmup_steps: 25000 lr_decay: 1.0 -decoding: - batch_size: 1 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index 626c3574..60a16f42 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -dict_path=$2 -ckpt_prefix=$3 +decode_config_path=$2 +dict_path=$3 +ckpt_prefix=$4 batch_size=1 output_dir=${ckpt_prefix} @@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \ --dict-path ${dict_path} \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" 
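The --opts renames in the align.sh hunk above and the test.sh hunk that follows are a direct consequence of the flattening: data/dataloader keys such as test_manifest are now top level (no data. prefix), and decode options live under the decode sub-node as decode.decoding_method, decode.decode_batch_size, and so on. A small illustration of the new override keys, assuming yacs is available; every value below is made up:

from yacs.config import CfgNode

# Toy config mirroring the flattened layout (all values made up).
config = CfgNode(new_allowed=True)
config.test_manifest = "data/manifest.test-clean"
config.decode = CfgNode({"decoding_method": "attention",
                         "decode_batch_size": 1})

# Old keys:  --opts data.test_manifest ...   decoding.batch_size ...
# New keys, as passed by these scripts:
config.merge_from_list(["test_manifest", "data/manifest.dev-clean",
                        "decode.decode_batch_size", "16"])
print(config.test_manifest)             # data/manifest.dev-clean
print(config.decode.decode_batch_size)  # 16
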
diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index d210f2a8..8cf3b52c 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -19,6 +19,7 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe} bpemodel=${bpeprefix}.model config_path=conf/transformer.yaml +decode_config_path=conf/decode/decode_base.yaml dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt ckpt_prefix= @@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --checkpoint_path ${ckpt_prefix} \ --result-file ${decode_dir}/data.JOB.json \ - --opts decoding.decoding_method ${dmethd} \ - --opts decoding.batch_size ${batch_size} \ - --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} + --opts decode.decoding_method ${dmethd} \ + --opts decode.decode_batch_size ${batch_size} \ + --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh index 5b7596f2..c9a794e3 100755 --- a/examples/librispeech/asr2/run.sh +++ b/examples/librispeech/asr2/run.sh @@ -9,7 +9,8 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=50 conf_path=conf/transformer.yaml -dict_path=lang_char/train_960_unigram5000_units.txt +decode_conf_path=conf/decode/decode_base.yaml +dict_path=data/lang_char/train_960_unigram5000_units.txt avg_num=10 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -35,7 +36,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # attetion resocre decoder - ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -45,7 +46,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml index c2d69226..c2db2c7c 100644 --- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml @@ -1,67 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - 
augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 4333 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 4333 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.6 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml new file mode 100644 index 00000000..b5283a93 --- /dev/null +++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.6 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 8cbff235..463593ef 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh index 1ccac1c3..89a63411 100755 --- a/examples/other/1xt2x/aishell/run.sh +++ b/examples/other/1xt2x/aishell/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=2 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml index be51a9b9..0c08fbc6 100644 --- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: .inf # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: .inf # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False 
+blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.4 - beta: 0.35 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml new file mode 100644 index 00000000..f52dde32 --- /dev/null +++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.4 +beta: 0.35 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh index b7f69f6b..82de56b0 100755 --- a/examples/other/1xt2x/baidu_en8k/run.sh +++ b/examples/other/1xt2x/baidu_en8k/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=0 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml index ad7fb2c1..a2a5649b 100644 --- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 1000.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 1000.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False 
+share_rnn_weights: True +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml new file mode 100644 index 00000000..f3b51def --- /dev/null +++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh index 8c667de2..8b614bbb 100755 --- a/examples/other/1xt2x/librispeech/run.sh +++ b/examples/other/1xt2x/librispeech/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=1 @@ -23,5 +24,5 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b4f9cdf9..88a13fdc 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -13,8 +13,8 @@ # limitations under the License. 
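The "# https://yaml.org/type/float.html" link that heads each of these configs is worth keeping in mind when editing them: a YAML 1.1 loader only resolves scientific notation as a float when the literal contains a dot and a signed exponent, so a value such as 1e-06 silently loads as a string. A minimal check, assuming PyYAML-style resolution (which yacs relies on when reading these files):

# Demonstrates the YAML 1.1 float gotcha referenced at the top of the configs:
# scientific notation needs a dot and a signed exponent to be parsed as float.
import yaml

print(type(yaml.safe_load("lr: 2e-3")["lr"]))       # str   (no dot)
print(type(yaml.safe_load("wd: 1e-06")["wd"]))      # str   (no dot)
print(type(yaml.safe_load("wd: 1.0e-06")["wd"]))    # float (dot + signed exponent)
print(type(yaml.safe_load("x: .inf")["x"]))         # float (infinity literal)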
"""Evaluation for DeepSpeech2 model.""" from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +41,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index ad83a41d..003b02e2 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -120,20 +120,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -233,11 +219,11 @@ class DeepSpeech2Model(nn.Layer): """ model = cls(feat_size=dataloader.collate_fn.feature_size, dict_size=len(dataloader.collate_fn.vocab_list), - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -250,7 +236,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 82e190d8..246fb107 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -44,27 +44,11 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, 
batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -98,7 +82,7 @@ class DeepSpeech2Trainer(Trainer): iteration_time = time.time() - start msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -126,7 +110,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -146,15 +130,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - #config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) + config.feat_size = self.train_loader.collate_fn.feature_size + #config.dict_size = self.train_loader.collate_fn.vocab_size + config.dict_size = len(self.train_loader.collate_fn.vocab_list) config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -163,17 +147,13 @@ class DeepSpeech2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.model = model @@ -184,59 +164,59 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler 
= SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" collate_fn_test = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test) @@ -250,31 +230,10 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.unit_type, vocab=None) super().__init__(config, args) def ordid2token(self, texts, texts_len): @@ -293,7 +252,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer @@ -399,31 +358,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.export() except KeyboardInterrupt: exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. 
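The flattened keys are consumed directly in setup_model, as the hunk a little earlier in test_model.py shows. A self-contained sketch of the same optimizer wiring, substituting paddle.nn.ClipGradByGlobalNorm for the repo's ClipGradByGlobalNormWithLog wrapper and a dummy linear layer for the real model:

# Optimizer/scheduler setup driven by the flattened config keys
# (lr, lr_decay, weight_decay, global_grad_clip).
import paddle
from yacs.config import CfgNode

config = CfgNode(dict(lr=2e-3, lr_decay=0.83, weight_decay=1e-6, global_grad_clip=3.0))
model = paddle.nn.Linear(161, 29)  # placeholder for the real acoustic model

grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
    learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
optimizer = paddle.optimizer.Adam(
    learning_rate=lr_scheduler,
    parameters=model.parameters(),
    weight_decay=paddle.regularizer.L2Decay(config.weight_decay),
    grad_clip=grad_clip)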
- """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml new file mode 100644 index 00000000..d3992cb9 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 8afb107b..d113fc94 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -1,114 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False 
+sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index 017230fe..a01ec1a6 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -1,114 +1,102 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of 
attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml new file mode 100644 index 00000000..ed081cf4 --- /dev/null +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. 
+ # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index 0796a06e..904f95c4 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,16 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh index b85ba95a..1746c025 100755 --- a/examples/ted_en_zh/st0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml b/examples/ted_en_zh/st1/conf/preprocess.yaml new file mode 100644 index 00000000..bc86d98c --- /dev/null +++ b/examples/ted_en_zh/st1/conf/preprocess.yaml @@ -0,0 +1,16 @@ +process: + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index a8918a23..515edee2 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -1,104 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 20 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can 
chose conv2d, conv2d6 and conv2d8 - normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -training: - n_epoch: 40 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 0. - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. 
+scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index 3787037f..a5f956fa 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -1,104 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 20 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +# decoder related 
+decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -training: - n_epoch: 40 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 0. - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml new file mode 100644 index 00000000..d6104dbc --- /dev/null +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -0,0 +1,12 @@ + +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index 0796a06e..904f95c4 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,16 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
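A note on the Training sections above: with optim: adam, scheduler: noam and warmup_steps: 25000, the lr: 2.5 value is a scale factor rather than a literal learning rate. Assuming the scheduler follows the standard Transformer (Noam) schedule, the effective peak learning rate with output_size 256 is only about 1e-3:

# Hedged sketch of the Noam schedule; the exact scheduler implementation in the
# repo may differ, but the scaling behaviour of lr: 2.5 is the point here.
def noam_lr(step, lr_scale=2.5, d_model=256, warmup=25000):
    step = max(step, 1)
    return lr_scale * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

print(noam_lr(1))        # tiny at the start of warmup
print(noam_lr(25000))    # ~9.9e-4 peak at the end of warmup
print(noam_lr(100000))   # then decays as step ** -0.5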
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f6362a8b..1808e37b 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data @@ -27,7 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ -n "${ckpt_path}" ]; then echo "Finetune from Pretrained Model" ${ckpt_path} ./local/download_pretrain.sh || exit -1 - fi + fi CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi @@ -38,5 +39,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 -fi \ No newline at end of file + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1c6059e4..4731395f 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 # second - max_input_len: 10.0 # second - min_output_len: 0.0 # tokens - max_output_len: 150.0 # tokens - min_output_input_ratio: 0.005 - max_output_input_ratio: 1000.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: "word" - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: "word" +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 
+batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 128 # dimension of attention - attention_heads: 4 - linear_units: 1024 # the number of units of position-wise feed forward - num_blocks: 6 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 128 # dimension of attention + attention_heads: 4 + linear_units: 1024 # the number of units of position-wise feed forward + num_blocks: 6 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.5 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.5 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 50 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 1200 - lr_decay: 1.0 - log_interval: 10 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1200 + lr_decay: 1.0 +log_interval: 10 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..805dd02f --- /dev/null +++ b/examples/timit/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 08ee0e36..88192c58 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -7,8 +7,8 @@ stop_stage=50 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
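The --opts overrides in the updated local/test.sh (decode.decoding_method and decode.decode_batch_size) are applied through config.merge_from_list on the Python side. A small sketch of how yacs handles such a flat KEY VALUE list, using placeholder defaults:

# yacs merge_from_list takes alternating key/value tokens and coerces each value
# to the type already present in the config (here "1" becomes the int 1).
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.decode = CfgNode(dict(decoding_method="attention", decode_batch_size=64))

config.merge_from_list([
    "decode.decoding_method", "ctc_prefix_beam_search",
    "decode.decode_batch_size", "1",
])
assert config.decode.decode_batch_size == 1
print(config.decode.decoding_method)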
@@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index a95b5f3a..0d84be9f 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=10 TIMIT_path=/path/to/TIMIT @@ -34,15 +35,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 7d841d47..a16a79d3 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - 
window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 0.8 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 0.8 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 393b6439..5458cfb3 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -1,72 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt 
+augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 4 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 4 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 1.0 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1e-5 +lr_decay: 1.0 +weight_decay: 1e-06 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml new file mode 100644 index 00000000..94c3dbde --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index a627ef72..ea40046b 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh index f39fb3fa..25f04624 100755 --- a/examples/tiny/asr0/run.sh +++ b/examples/tiny/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index ad27478d..8f785121 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -1,120 +1,97 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: True - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" 
+cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: True + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 298518fb..2570bb85 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -1,113 +1,91 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 
and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 - +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index eb850902..eb8f0ab9 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,116 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - 
src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
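The tiny/asr1 configs above follow the same migration as the asr0 ones: the former data, collator, model and training groups are flattened into top-level keys of a single training YAML, and everything that used to sit under decoding: moves into a standalone conf/tuning/decode.yaml (chunk_decode.yaml for the streaming variants). Below is a minimal sketch of how such a pair of flat files is combined, mirroring the yacs CfgNode(new_allowed=True) pattern that the Python changes later in this patch adopt; the file paths and the override value are illustrative only.

    from yacs.config import CfgNode

    # Flat training config: the former data/collator/model/training keys now all
    # live at the top level of one YAML file.
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/conformer.yaml")       # illustrative path

    # Decoding options live in their own file and are attached under config.decode,
    # which is what the new --decode_cfg flag does in the updated entry points.
    decode = CfgNode(new_allowed=True)
    decode.merge_from_file("conf/tuning/decode.yaml")   # illustrative path
    config.decode = decode

    # Command-line overrides use the renamed flat keys,
    # e.g. --opts decode.decoding_method ctc_greedy_search
    config.merge_from_list(["decode.decode_batch_size", "8"])
    config.freeze()

    print(config.encoder, config.decode.decoding_method, config.decode.decode_batch_size)

Because new_allowed=True, no per-recipe get_cfg_defaults() schema is required; whatever keys the YAML defines become the config, which is why the old config.py default builders can be dropped further down in this patch.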
diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index c641d1f5..4e3068d1 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: data/mean_std.json - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 
+stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 2 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 2 + latest_n: 1 -decoding: - batch_size: 8 #64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..c5b641da --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..a0984f9e --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. 
+simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index c65d611c..14d91d68 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 190bacff..79df969b 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
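With the decode options split out, local/test.sh and local/align.sh above gain the decode config as an extra positional argument, forward it as --decode_cfg, and pass per-run overrides with the renamed keys (decode.decoding_method, decode.decode_batch_size) rather than the old decoding.* names; the run.sh scripts define decode_conf_path=conf/tuning/decode.yaml and thread it through. Below is a small sketch for sanity-checking one of the new conf/tuning/*.yaml files before launching a recipe, assuming the same yacs loading used elsewhere in this patch; the path and the checked keys are only examples.

    from yacs.config import CfgNode

    # Load a decode config on its own, the same way the test scripts hand it to
    # --decode_cfg, and verify the keys the updated scripts rely on.
    decode = CfgNode(new_allowed=True)
    decode.merge_from_file("conf/tuning/decode.yaml")   # illustrative path

    for key in ("decode_batch_size", "error_rate_type", "decoding_method", "beam_size"):
        assert key in decode, f"decode config is missing '{key}'"

    print(decode.decoding_method, decode.decode_batch_size)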
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh index ec9c5a56..1651c034 100755 --- a/examples/tiny/asr1/run.sh +++ b/examples/tiny/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index a438236d..6c2bbca4 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,111 +1,92 @@ -# network architecture -model: - # encoder related - encoder: conformer - encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - cnn_module_norm: layer_norm - activation_type: swish - pos_enc_layer_type: rel_pos - selfattention_layer_type: rel_selfattn +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: 
data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.1 # second - max_input_len: 12.0 # second - min_output_len: 1.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 240 - accum_grad: 16 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 5000 - lr_decay: 1.0 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 16 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 00000000..6924bfa6 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index da159de7..65b884e5 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 5c779474..47464262 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -43,10 +44,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index d77f409f..9995bc63 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml - +decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -36,12 +36,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -51,5 +51,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 3e9939f0..88955eac 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -85,7 +85,7 @@ def recog_v2(args): mode="asr", load_output=False, sort_in_input_length=False, - preprocess_conf=confs.collator.augmentation_config + preprocess_conf=confs.preprocess_config if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 7ccb3a6c..ccb85906 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -20,7 +20,7 @@ from paddle.inference import Config from paddle.inference import create_predictor from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -80,13 +80,13 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - 
config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -105,14 +105,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -176,15 +176,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index 5c6eee3f..85c2466f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -18,7 +18,7 @@ import numpy as np import paddle from paddle.io import DataLoader -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -62,14 +62,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - 
beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -111,15 +111,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 66042e84..090b5fab 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,7 +41,7 @@ if __name__ == "__main__": print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index f52615fa..388b380d 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +42,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index e073ebbf..176028ed 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -46,9 +46,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index cf2ca0d6..e2cb7e2f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer @@ -41,7 +42,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( @@ -74,7 +75,7 @@ class DeepSpeech2Tester_hub(): audio = paddle.unsqueeze(audio, axis=0) vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, cfg.decoding) + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -110,13 +111,13 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.input_dim = self.collate_fn_test.feature_size - config.model.output_dim = self.collate_fn_test.vocab_size + 
config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") @@ -134,8 +135,8 @@ class DeepSpeech2Tester_hub(): self.checkpoint_dir = checkpoint_dir self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) def resume(self): """Resume from the checkpoint at checkpoints in the output @@ -187,9 +188,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 400538f9..5e8c0fff 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -14,7 +14,7 @@ """Trainer for DeepSpeech2 model.""" from paddle import distributed as dist -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py deleted file mode 100644 index 58dc05ff..00000000 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline - - -def get_cfg_defaults(model_type='offline'): - _C = CfgNode() - _C.data = ManifestDataset.params() - _C.collator = SpeechCollator.params() - _C.training = DeepSpeech2Trainer.params() - _C.decoding = DeepSpeech2Tester.params() - if model_type == 'offline': - _C.model = DeepSpeech2Model.params() - else: - _C.model = DeepSpeech2ModelOnline.params() - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index a0b69d64..e7d5e20f 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -49,28 +49,12 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - batch_size = self.config.collator.batch_size - accum_grad = self.config.training.accum_grad + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad start = time.time() @@ -133,7 +117,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -154,16 +138,16 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.input_dim = self.train_loader.collate_fn.feature_size - config.model.output_dim = self.train_loader.collate_fn.vocab_size + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.input_dim = self.test_loader.collate_fn.feature_size - config.model.output_dim = self.test_loader.collate_fn.vocab_size + config.input_dim = self.test_loader.collate_fn.feature_size + config.output_dim = self.test_loader.collate_fn.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -177,17 +161,13 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog( - 
config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.optimizer = optimizer self.lr_scheduler = lr_scheduler @@ -198,95 +178,75 @@ class DeepSpeech2Trainer(Trainer): config.defrost() if self.train: # train - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) - config.collator.keep_transcription_text = False + config.keep_transcription_text = False collate_fn_train = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) # dev - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = False + config.augmentation_config = "" + config.keep_transcription_text = False collate_fn_dev = SpeechCollator.from_config(config) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size), + batch_size=int(config.batch_size), shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup train/valid Dataloader!") else: # test - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True + config.augmentation_config = "" + config.keep_transcription_text = True collate_fn_test = SpeechCollator.from_config(config) - + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup test Dataloader!") class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. 
- beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ @@ -304,17 +264,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, decode_cfg) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -327,29 +287,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("Current error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type) + error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): result_transcripts = self.model.decode( audio, audio_len, vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decoding_method=decode_cfg.decoding_method, + lang_model_path=decode_cfg.lang_model_path, + beam_alpha=decode_cfg.alpha, + beam_beta=decode_cfg.beta, + beam_size=decode_cfg.beam_size, + cutoff_prob=decode_cfg.cutoff_prob, + cutoff_top_n=decode_cfg.cutoff_top_n, + num_processes=decode_cfg.num_proc_bsearch) return result_transcripts @@ -358,7 +320,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as 
fout: @@ -412,11 +373,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, + batch_size=self.config.decode.decode_batch_size, model_name="deepspeech2", model_precision="fp32").getlog() self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -441,7 +401,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: self.autolog.report() - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): if self.args.model_type == "online": output_probs, output_lens = self.static_forward_online(audio, audio_len) @@ -454,13 +415,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, - vocab_list, cfg.decoding_method) + self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, + decode_cfg.lang_model_path, vocab_list, + decode_cfg.decoding_method) result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) + output_probs, output_lens, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) @@ -531,12 +494,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): num_chunk = int(num_chunk) chunk_state_h_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) chunk_state_c_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) input_names = self.predictor.get_input_names() diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index df95baeb..e3390feb 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Alignment for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,16 +32,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py index 44fc7c3e..3907cebd 100644 --- a/paddlespeech/s2t/exps/u2/bin/export.py +++ b/paddlespeech/s2t/exps/u2/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +31,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 48b0670d..f14d804f 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 556316ec..9904813a 100644 --- 
a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,8 +18,8 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser @@ -36,23 +36,22 @@ class U2Infer(): self.args = args self.config = config self.audio_file = args.audio_file - self.sr = config.collator.target_sample_rate - self.preprocess_conf = config.collator.augmentation_config + self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) self.text_feature = TextFeaturizer( - unit_type=config.collator.unit_type, - vocab=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix) + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): - model_conf.input_dim = config.collator.feat_dim + model_conf.input_dim = config.feat_dim model_conf.output_dim = self.text_feature.vocab_size model = U2Model.from_config(model_conf) self.model = model @@ -70,10 +69,6 @@ class U2Infer(): # read audio, sample_rate = soundfile.read( self.audio_file, dtype="int16", always_2d=True) - if sample_rate != self.sr: - logger.error( - f"sample rate error: {sample_rate}, need {self.sr} ") - sys.exit(-1) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -85,17 +80,17 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - cfg = self.config.decoding + decode_config = self.config.decode result_transcripts = self.model.decode( xs, ilen, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") @@ -133,9 +128,13 @@ if __name__ == "__main__": "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index d6ee8b30..d562278f 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -17,7 +17,7 @@ import os from paddle import distributed as dist -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode from 
paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +44,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py deleted file mode 100644 index 898b0bb2..00000000 --- a/paddlespeech/s2t/exps/u2/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2.model import U2Tester -from paddlespeech.s2t.exps.u2.model import U2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2 import U2Model - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2Model.params() - -_C.training = U2Trainer.params() - -_C.decoding = U2Tester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 6b529b40..d0cea031 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -46,38 +46,11 @@ logger = Log(__name__).getlog() class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -120,7 +93,7 @@ class U2Trainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) @@ -153,7 +126,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i 
+ 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -182,7 +155,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -214,8 +187,7 @@ class U2Trainer(Trainer): k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," - if (batch_index + 1 - ) % self.config.training.log_interval == 0: + if (batch_index + 1) % self.config.log_interval == 0: logger.info(msg) data_start_time = time.time() except Exception as e: @@ -252,29 +224,29 @@ class U2Trainer(Trainer): if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, - sortagrad=config.collator.sortagrad, - batch_size=config.collator.batch_size, - maxlen_in=config.collator.maxlen_in, - maxlen_out=config.collator.maxlen_out, - minibatches=config.collator.minibatches, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, mini_batch_size=self.args.ngpu, - batch_count=config.collator.batch_count, - batch_bins=config.collator.batch_bins, - batch_frames_in=config.collator.batch_frames_in, - batch_frames_out=config.collator.batch_frames_out, - batch_frames_inout=config.collator.batch_frames_inout, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -284,18 +256,20 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) logger.info("Setup train/valid Dataloader!") else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -305,16 +279,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, 
train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -324,7 +298,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -332,7 +306,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): if self.train: @@ -355,7 +329,7 @@ class U2Trainer(Trainer): if not self.train: return - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -375,7 +349,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -400,41 +374,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. 
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -453,10 +398,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_config = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -464,12 +409,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( @@ -488,15 +433,15 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info("One example error rate [%s] = %f" % ( + decode_config.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_config.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -507,7 +452,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -558,15 +503,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -577,10 +522,10 @@ class U2Tester(U2Trainer): List[paddle.static.InputSpec]: input spec. 
""" from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + infer_model = U2InferModel.from_pretrained(self.train_loader, + self.config.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.feat_dim + feat_dim = self.train_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 22a0a3c5..ab87c30d 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -44,77 +44,75 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') + config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) @@ -122,7 +120,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -136,7 +134,7 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -156,7 +154,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -182,7 +180,7 @@ class U2Trainer(Trainer): def setup_updater(self): output_dir = self.output_dir - config = self.config.training + config = self.config updater = U2Updater( model=self.model, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py index 67bed349..422483b9 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -69,6 +69,10 @@ if __name__ == "__main__": config = CfgNode() config.set_new_allowed(True) config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 9b8274ad..780c5c08 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -42,45 +42,12 @@ from paddlespeech.s2t.utils.utility import UpdateConfig logger = Log(__name__).getlog() - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - _C = CfgNode() - - _C.model = U2Model.params() - - _C.training = U2Trainer.params() - - _C.decoding = U2Tester.params() - - config = _C.clone() - 
config.set_new_allowed(True) - return config - - class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - checkpoint=dict( - kbest_n=50, - latest_n=5, ), )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -122,7 +89,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -157,7 +124,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -186,7 +153,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -235,10 +202,10 @@ class U2Trainer(Trainer): config = self.config.clone() # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -248,16 +215,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -268,16 +235,18 @@ class U2Trainer(Trainer): batch_frames_out=0, batch_frames_inout=0, preprocess_conf=None, - n_iter_processes=config.collator.num_workers, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -293,10 +262,10 @@ class U2Trainer(Trainer): num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - 
batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -316,7 +285,7 @@ class U2Trainer(Trainer): config = self.config # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size @@ -360,41 +329,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -413,10 +353,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -424,12 +364,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - 
start_time for i, (utt, target, result, rec_tids) in enumerate( @@ -449,15 +389,16 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_cfg.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -468,7 +409,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -519,15 +460,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -539,7 +480,7 @@ class U2Tester(U2Trainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py index 69d9718f..1bc4e1f3 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/export.py +++ b/paddlespeech/s2t/exps/u2_st/bin/export.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Export for U2 model.""" -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +31,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 93c2fee0..1d70a310 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 58496c88..4dec9ec8 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py deleted file mode 100644 index a48f9106..00000000 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2_st.model import U2STTester -from paddlespeech.s2t.exps.u2_st.model import U2STTrainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2_st import U2STModel - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2STModel.params() - -_C.training = U2STTrainer.params() - -_C.decoding = U2STTester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 89408786..ca2c2c1d 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -45,38 +45,11 @@ logger = Log(__name__).getlog() class U2STTrainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward utt, audio, audio_len, text, text_len = batch_data @@ -127,13 +100,13 @@ class U2STTrainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -174,7 +147,7 @@ class U2STTrainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_st_loss'] = total_loss / num_seen_utts @@ -203,7 +176,7 @@ class U2STTrainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < 
self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -236,7 +209,7 @@ class U2STTrainer(Trainer): msg += "," msg = msg[:-1] # remove the last "," if (batch_index + 1 - ) % self.config.training.log_interval == 0: + ) % self.config.log_interval == 0: logger.info(msg) except Exception as e: logger.error(e) @@ -269,17 +242,17 @@ class U2STTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() - load_transcript = True if config.model.model_conf.asr_weight > 0 else False + load_transcript = True if config.model_conf.asr_weight > 0 else False if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, - maxlen_in=config.collator.maxlen_in, - maxlen_out=config.collator.maxlen_out, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, minibatches=0, mini_batch_size=1, batch_count='auto', @@ -287,19 +260,18 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, subsampling_factor=1, load_aux_output=load_transcript, num_encs=1, dist_sampler=True) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -309,9 +281,8 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, subsampling_factor=1, load_aux_output=load_transcript, num_encs=1, @@ -319,11 +290,12 @@ class U2STTrainer(Trainer): logger.info("Setup train/valid Dataloader!") else: # test dataset, return raw text + decode_batch_size = config.get('decode',dict()).get('decode_batch_size', 1) self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -333,9 +305,8 @@ class U2STTrainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. 
- augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, dist_sampler=False) @@ -344,7 +315,7 @@ class U2STTrainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): if self.train: model_conf.input_dim = self.train_loader.feat_dim @@ -361,7 +332,7 @@ class U2STTrainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -381,7 +352,7 @@ class U2STTrainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -407,41 +378,12 @@ class U2STTrainer(Trainer): class U2STTester(U2STTrainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. 
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab_filepath=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -455,19 +397,19 @@ class U2STTester(U2STTrainer): def translate(self, audio, audio_len): """"E2E translation from extracted audio feature""" - cfg = self.config.decoding + decode_cfg = self.config.decode self.model.eval() hyps = self.model.decode( audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) return hyps def compute_translation_metrics(self, @@ -478,7 +420,7 @@ class U2STTester(U2STTrainer): texts_len, bleu_func, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode len_refs, num_ins = 0, 0 start_time = time.time() @@ -489,12 +431,12 @@ class U2STTester(U2STTrainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time @@ -525,10 +467,10 @@ class U2STTester(U2STTrainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - cfg = self.config.decoding - bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + decode_cfg = self.config.decode + bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms hyps, refs = [], [] len_refs, num_ins = 0, 0 num_frames = 0.0 @@ -573,7 +515,7 @@ class U2STTester(U2STTrainer): "num_examples": num_ins, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @@ -586,7 +528,7 @@ class U2STTester(U2STTrainer): """ from paddlespeech.s2t.models.u2_st import U2STInferModel infer_model = U2STInferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 5f233549..3a14b2d5 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py 
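The hunks above drop the nested config.data / config.collator / config.model / config.training / config.decoding groups in favour of top-level keys, with decode options split out into a file passed via --decode_cfg and read through config.decode. A minimal sketch of that flat-config pattern, assuming yacs CfgNode and placeholder YAML paths (the actual loading code lives elsewhere in the repo):

    from yacs.config import CfgNode

    # Flat training/model config; new_allowed lets merge_from_file add keys
    # that were not declared up front.
    config = CfgNode(new_allowed=True)
    config.merge_from_file("conf/transformer.yaml")   # placeholder path

    # Decode options live in their own file (e.g. conf/tuning/decode.yaml)
    # and are attached under a single "decode" sub-node.
    decode_cfg = CfgNode(new_allowed=True)
    decode_cfg.merge_from_file("conf/tuning/decode.yaml")
    config.decode = decode_cfg

    # Flat access replaces the old config.collator.* / config.training.* lookups.
    batch_size = config.batch_size
    n_epoch = config.n_epoch
    # Safe lookup with a default, mirroring config.get('decode', dict()) above.
    decode_batch_size = config.get("decode", dict()).get("decode_batch_size", 1)
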
@@ -219,33 +219,6 @@ class SpeechCollatorBase(): class SpeechCollator(SpeechCollatorBase): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - spectrum_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a SpeechCollator object from a config. @@ -256,45 +229,43 @@ class SpeechCollator(SpeechCollatorBase): Returns: SpeechCollator: collator object. """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'spectrum_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') + config.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.collator.augmentation_config + aug_file = config.augmentation_config assert isinstance(aug_file, io.StringIO) speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - spectrum_type=config.collator.spectrum_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) return speech_collator diff --git a/paddlespeech/s2t/io/dataset.py 
b/paddlespeech/s2t/io/dataset.py index d64d7d3e..9149fb27 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -28,22 +28,6 @@ logger = Log(__name__).getlog() class ManifestDataset(Dataset): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - manifest="", - max_input_len=27.0, - min_input_len=0.0, - max_output_len=float('inf'), - min_output_len=0.0, - max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a ManifestDataset object from a config. @@ -54,17 +38,17 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. """ - assert 'manifest' in config.data - assert config.data.manifest + assert 'manifest' in config + assert config.manifest dataset = cls( - manifest_path=config.data.manifest, - max_input_len=config.data.max_input_len, - min_input_len=config.data.min_input_len, - max_output_len=config.data.max_output_len, - min_output_len=config.data.min_output_len, - max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, ) + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) return dataset def __init__(self, diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0dfaec29..ddc3612d 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -119,21 +119,6 @@ class DeepSpeech2Model(nn.Layer): before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
- ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -221,12 +206,12 @@ class DeepSpeech2Model(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -240,7 +225,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index 85876bce..aae77f74 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -243,23 +243,6 @@ class DeepSpeech2ModelOnline(nn.Layer): before softmax) and a ctc cost layer. :rtype: tuple of LayerOutput """ - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=4, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=True, #Use gru if set True. Use simple rnn if set False. 
- blank_id=0, # index of blank in vocob.txt - ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__( self, feat_size, @@ -353,14 +336,14 @@ class DeepSpeech2ModelOnline(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -374,7 +357,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2ModelOnline diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 83eff467..26e81acf 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -59,57 +59,6 @@ logger = Log(__name__).getlog() class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - ctc_weight=0.3, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 8b07e389..1c5596ba 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""U2 ASR Model -Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ import time @@ -51,58 +51,6 @@ logger = Log(__name__).getlog() class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - asr_weight=0.0, - ctc_weight=0.0, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -289,8 +237,8 @@ class U2STBaseModel(nn.Layer): simulate_streaming (bool, optional): streaming or not. Defaults to False. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: - encoder hiddens (B, Tmax, D), + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), encoder hiddens mask (B, 1, Tmax). """ # Let's assume B = batch_size @@ -533,21 +481,21 @@ class U2STBaseModel(nn.Layer): feats (Tenosr): audio features, (B, T, D) feats_lengths (Tenosr): (B) text_feature (TextFeaturizer): text feature object. - decoding_method (str): decoding mode, e.g. - 'fullsentence', + decoding_method (str): decoding mode, e.g. + 'fullsentence', 'simultaneous' beam_size (int): beam size for search decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here. - num_decoding_left_chunks (int, optional): + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): number of left chunks for decoding. Defaults to -1. simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. Raises: ValueError: when not support decoding_method. - + Returns: List[List[int]]: transcripts. """ @@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel): ValueError: raise when using not support encoder type. 
Returns: - int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ if configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 3ef871c5..bb85732a 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -97,6 +97,14 @@ def default_argument_parser(parser=None): train_group.add_argument( "--dump-config", metavar="FILE", help="dump config to `this` file.") + test_group = parser.add_argument_group( + title='Test Options', description=None) + + test_group.add_argument( + "--decode_cfg", + metavar="DECODE_CONFIG_FILE", + help="decode config file.") + profile_group = parser.add_argument_group( title='Benchmark Options', description=None) profile_group.add_argument( diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index 9bf1ca4d..4b2011ec 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -117,8 +117,8 @@ class Trainer(): self.init_parallel() self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) # set random seed if needed if args.seed: @@ -129,8 +129,8 @@ class Trainer(): if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size: with UpdateConfig(self.config): - self.config.collator.batch_size = self.args.benchmark_batch_size - self.config.training.log_interval = 1 + self.config.batch_size = self.args.benchmark_batch_size + self.config.log_interval = 1 logger.info( f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") @@ -260,7 +260,7 @@ class Trainer(): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index 73c79816..dc1be815 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -130,7 +130,7 @@ def get_subsample(config): Returns: int: subsample rate. 
""" - input_layer = config["model"]["encoder_conf"]["input_layer"] + input_layer = config["encoder_conf"]["input_layer"] assert input_layer in ["conv2d", "conv2d6", "conv2d8"] if input_layer == "conv2d": return 4 diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index fcd0c235..c9d640ed 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) bs_item=(16) config_path=conf/benchmark/conformer.yaml +decode_config_path=conf/tuning/decode.yaml seed=0 output=exp/conformer profiler_options=None @@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" run_mode=mp ngpu=8 - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 sleep 60 log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp ngpu=1 - CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) sleep 60 done done diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 5b83b15c..16cd410e 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -5,13 +5,14 @@ function _set_params(){ run_mode=${1:-"sp"} # 单卡sp|多卡mp config_path=${2:-"conf/conformer.yaml"} - output=${3:-"exp/conformer"} - seed=${4:-"0"} - ngpu=${5:-"1"} - profiler_options=${6:-"None"} - batch_size=${7:-"32"} - fp_item=${8:-"fp32"} - model_item=${9:-"conformer"} + decode_config_path=${3:-"conf/tuning/decode.yaml"} + output=${4:-"exp/conformer"} + seed=${5:-"0"} + ngpu=${6:-"1"} + profiler_options=${7:-"None"} + batch_size=${8:-"32"} + fp_item=${9:-"fp32"} + model_item=${10:-"conformer"} benchmark_max_step=0 run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 # 添加日志解析需要的参数 @@ -35,6 +36,7 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" train_cmd="--config=${config_path} \ + --decode_cfg=${decode_config_path} \ --output=${output} \ --seed=${seed} \ --ngpu=${ngpu} \ @@ -69,6 +71,6 @@ function _train(){ source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -# _train # 如果只想产出训练log,不解析,可取消注释 +#_train # 如果只想产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 diff --git 
a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt index b11872bd..cad8efa3 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index 875e3ccf..5c619506 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh index 76b22a38..1dce1b29 100644 --- a/tests/chains/ds2/lite_train_infer.sh +++ b/tests/chains/ds2/lite_train_infer.sh @@ -1,5 +1,5 @@ bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer -cd ../../examples/tiny/s0 +cd ../../../examples/tiny/asr0 source path.sh -bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer +bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer cd ../../../tests/chains diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh index 73a30283..4913ce42 100644 --- a/tests/chains/ds2/prepare.sh +++ b/tests/chains/ds2/prepare.sh @@ -34,7 +34,7 @@ MODE=$2 if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/tiny/s0 + cd ${curPath}/../../../examples/tiny/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then elif [ ${MODE} = "whole_train_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd 
${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then cd ${curPath} elif [ ${MODE} = "whole_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then cd ${curPath} else curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index c9307820..26917c67 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -324,6 +324,7 @@ else gsu=${gpu//,/ } nump=`echo $gsu | wc -w` cmd="${python} ${run_train} --ngpu=$nump" + export CUDA_VISIBLE_DEVICES=${gpu} else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi
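After this refactor, model construction reads every hyper-parameter from the top level of the config rather than from a model sub-node. A hypothetical helper (not part of the patch) that collects the flat keys DeepSpeech2Model.from_pretrained now expects, assuming config is a flat yacs CfgNode like the one sketched earlier:

    def ds2_model_kwargs(config):
        """Gather DeepSpeech2 constructor arguments from a flat CfgNode."""
        return dict(
            num_conv_layers=config.num_conv_layers,
            num_rnn_layers=config.num_rnn_layers,
            rnn_size=config.rnn_layer_size,
            use_gru=config.use_gru,
            share_rnn_weights=config.share_rnn_weights,
            blank_id=config.blank_id,
            # optional key; falls back to None when the YAML omits it
            ctc_grad_norm_type=config.get("ctc_grad_norm_type", None),
        )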