From 67709155e9f17e03579c7360882e2e92b65ad7c1 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 13 Sep 2022 08:29:21 +0000
Subject: [PATCH] add chunk conformer config from release model

---
 .../asr1/conf/chunk_conformer.yaml            | 99 +++++++++++++++++++
 .../wenetspeech/asr1/conf/preprocess.yaml     |  2 +-
 .../asr1/conf/tuning/chunk_decode.yaml        | 11 +++
 3 files changed, 111 insertions(+), 1 deletion(-)
 create mode 100644 examples/wenetspeech/asr1/conf/chunk_conformer.yaml
 create mode 100644 examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml

diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
new file mode 100644
index 00000000..69fa223a
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml
@@ -0,0 +1,99 @@
+############################################
+# Network Architecture                     #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: True
+    use_cnn_module: True
+    cnn_module_kernel: 15
+    activation_type: swish
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    causal: true
+    use_dynamic_chunk: true
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+    init_type: 'kaiming_uniform'
+
+# https://yaml.org/type/float.html
+###########################################
+# Data                                    #
+###########################################
+train_manifest: data/train_l/data.list
+dev_manifest: data/dev/data.list
+test_manifest: data/test_meeting/data.list
+
+###########################################
+# Dataloader                              #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+spm_model_prefix: ''
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+do_filter: True
+maxlen_in: 1200  # if do_filter == False && input length  > maxlen-in, batch size is automatically reduced
+maxlen_out: 100  # if do_filter == False && output length > maxlen-out, batch size is automatically reduced
+minlen_in: 10
+minlen_out: 0
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+
+
+###########################################
+# Training                                #
+###########################################
+n_epoch: 26
+accum_grad: 32
+global_grad_clip: 5.0
+dist_sampler: True
+log_interval: 1
+checkpoint:
+    kbest_n: 50
+    latest_n: 5
+optim: adam
+optim_conf:
+    lr: 0.001
+    weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 5000
+    lr_decay: 1.0
diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml
index f7f4c58d..c7ccc522 100644
--- a/examples/wenetspeech/asr1/conf/preprocess.yaml
+++ b/examples/wenetspeech/asr1/conf/preprocess.yaml
@@ -5,7 +5,7 @@ process:
     n_mels: 80
     n_shift: 160
     win_length: 400
-    dither: 0.1
+    dither: 1.0
   - type: cmvn_json
     cmvn_path: data/mean_std.json
   # these three processes are a.k.a. SpecAugument
diff --git a/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml
new file mode 100644
index 00000000..7e8afb7a
--- /dev/null
+++ b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml
@@ -0,0 +1,11 @@
+beam_size: 10
+decode_batch_size: 128
+error_rate_type: cer
+decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+ctc_weight: 0.5             # ctc weight for attention rescoring decode mode.
+decoding_chunk_size: 16     # decoding chunk size. Defaults to -1.
+                            # <0: for decoding, use full chunk.
+                            # >0: for decoding, use fixed chunk size as set.
+                            # 0: used for training, it's prohibited here.
+num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+simulate_streaming: True      # simulate streaming inference. Defaults to False.
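
Note (not part of the patch): the short sketch below shows one way to load the two configs added above and check the chunked-decoding constraints that their comments describe. It assumes PyYAML is installed and uses the repository-relative paths from this PR; it is an illustration, not code from PaddleSpeech.

# Hypothetical usage sketch; file paths and checks mirror the configs added above.
import yaml

with open("examples/wenetspeech/asr1/conf/chunk_conformer.yaml") as f:
    model_cfg = yaml.safe_load(f)
with open("examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml") as f:
    decode_cfg = yaml.safe_load(f)

enc = model_cfg["encoder_conf"]
# Streaming (chunked) decoding needs a causal encoder trained with dynamic chunks.
assert enc["causal"] and enc["use_dynamic_chunk"], "encoder is not chunk-capable"

chunk = decode_cfg["decoding_chunk_size"]
# Per the config comments: >0 = fixed chunk, <0 = full utterance, 0 is training-only.
assert chunk != 0, "decoding_chunk_size == 0 is only meaningful during training"

print(f"method={decode_cfg['decoding_method']}, chunk={chunk}, "
      f"left_chunks={decode_cfg['num_decoding_left_chunks']}, "
      f"streaming={decode_cfg['simulate_streaming']}")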