diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 50eaef98..31e9be13 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -1,103 +1,94 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +########################################### +# Data # +########################################### - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +augmentation_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - augmentation_config: conf/preprocess.yaml - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 2ba96e76..d9e3daec 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -1,7 +1,6 @@ ############################################ # Network Architecture # ############################################ -#model: cmvn_file: cmvn_file_type: "json" # encoder related @@ -42,7 +41,6 @@ model_conf: ########################################### # Data # ########################################### -#data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test @@ -50,11 +48,9 @@ test_manifest: data/manifest.test ########################################### # Dataloader # ########################################### -#collator: vocab_filepath: data/lang_char/vocab.txt unit_type: 'char' augmentation_config: conf/preprocess.yaml -spm_model_prefix: '' feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 @@ -75,7 +71,6 @@ num_encs: 1 ########################################### # training # ########################################### -#training: n_epoch: 240 accum_grad: 2 global_grad_clip: 5.0 diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 7c5fa624..e6684ec8 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -1,95 +1,84 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - - -collator: - unit_type: 'char' - vocab_filepath: data/lang_char/vocab.txt - feat_dim: 80 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 64 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 - - +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Dataloader # +########################################### +unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +augmentation_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - beam_size: 10 - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 00000000..72ede927 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/conf/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml similarity index 98% rename from examples/aishell/asr1/conf/decode.yaml rename to examples/aishell/asr1/conf/tuning/decode.yaml index 49364f5d..72ede927 100644 --- a/examples/aishell/asr1/conf/decode.yaml +++ b/examples/aishell/asr1/conf/tuning/decode.yaml @@ -1,4 +1,3 @@ -#decoding: beam_size: 10 decode_batch_size: 128 error_rate_type: cer diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index 11aff9c7..c54dae9c 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -6,7 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/conformer.yaml -decode_conf_path=conf/decode.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 audio_file=data/demo_01_03.wav