From 67709155e9f17e03579c7360882e2e92b65ad7c1 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 13 Sep 2022 08:29:21 +0000
Subject: [PATCH] add chunk conformer config from release model

---
 .../asr1/conf/chunk_conformer.yaml            | 99 +++++++++++++++++++
 .../wenetspeech/asr1/conf/preprocess.yaml     |  2 +-
 .../asr1/conf/tuning/chunk_decode.yaml        | 11 +++
 3 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer.yaml
 create mode 100644 examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml

diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
new file mode 100644
index 00000000..69fa223a
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
@@ -0,0 +1,99 @@
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: swish
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    causal: true
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+    init_type: 'kaiming_uniform'
+
+# https://yaml.org/type/float.html
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/train_l/data.list
+dev_manifest: data/dev/data.list
+test_manifest: data/test_meeting/data.list
+
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+spm_model_prefix: ''
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+do_filter: True
+maxlen_in: 1200  # if do_filter == False && input length  > maxlen-in, batch size is automatically reduced
+maxlen_out: 100  # if do_filter == False && output length > maxlen-out, batch size is automatically reduced
+minlen_in: 10
+minlen_out: 0
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 26
+accum_grad: 32
+global_grad_clip: 5.0
+dist_sampler: True
+log_interval: 1
+checkpoint:
+    kbest_n: 50
+    latest_n: 5
+optim: adam
+optim_conf:
+    lr: 0.001
+    weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 5000
+    lr_decay: 1.0
diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml
index f7f4c58d..c7ccc522 100644
--- a/examples/wenetspeech/asr1/conf/preprocess.yaml
+++ b/examples/wenetspeech/asr1/conf/preprocess.yaml
@@ -5,7 +5,7 @@ process:
     n_mels: 80
     n_shift: 160
     win_length: 400
-    dither: 0.1
+    dither: 1.0
   - type: cmvn_json
     cmvn_path: data/mean_std.json
   # these three processes are a.k.a. SpecAugument
diff --git a/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 00000000..7e8afb7a
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+beam_size: 10
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ctc_weight: 0.5             # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: 16     # decoding chunk size. Defaults to -1.
+                            # <0: for decoding, use full chunk.
+                            # >0: for decoding, use fixed chunk size as set.
+                            # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: True      # simulate streaming inference. Defaults to False.
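
Note (not part of the patch): the short sketch below shows one way to load the two configs added above and check the chunked-decoding constraints that their comments describe. It assumes PyYAML is installed and uses the repository-relative paths from this PR; it is an illustration, not code from PaddleSpeech.

# Hypothetical usage sketch; file paths and checks mirror the configs added above.
import yaml

with open("examples/wenetspeech/asr1/conf/chunk_conformer.yaml") as f:
    model_cfg = yaml.safe_load(f)
with open("examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml") as f:
    decode_cfg = yaml.safe_load(f)

enc = model_cfg["encoder_conf"]
# Streaming (chunked) decoding needs a causal encoder trained with dynamic chunks.
assert enc["causal"] and enc["use_dynamic_chunk"], "encoder is not chunk-capable"

chunk = decode_cfg["decoding_chunk_size"]
# Per the config comments: >0 = fixed chunk, <0 = full utterance, 0 is training-only.
assert chunk != 0, "decoding_chunk_size == 0 is only meaningful during training"

print(f"method={decode_cfg['decoding_method']}, chunk={chunk}, "
      f"left_chunks={decode_cfg['num_decoding_left_chunks']}, "
      f"streaming={decode_cfg['simulate_streaming']}")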