add the whole of aishell asr1

4 years ago · 960658f669
parent c40b6f4062
commit 960658f669
6 changed files with 177 additions and 192 deletions
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@ -1,103 +1,94 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: conformer
+# encoder related
-    encoder_conf:
+encoder: conformer
-        output_size: 256    # dimension of attention
+encoder_conf:
-        attention_heads: 4
+    output_size: 256    # dimension of attention
-        linear_units: 2048  # the number of units of position-wise feed forward
+    attention_heads: 4
-        num_blocks: 12      # the number of encoder blocks
+    linear_units: 2048  # the number of units of position-wise feed forward
-        dropout_rate: 0.1
+    num_blocks: 12      # the number of encoder blocks
-        positional_dropout_rate: 0.1
+    dropout_rate: 0.1
-        attention_dropout_rate: 0.0
+    positional_dropout_rate: 0.1
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+    attention_dropout_rate: 0.0
-        normalize_before: True
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
-        cnn_module_kernel: 15
+    normalize_before: True
-        use_cnn_module: True
+    cnn_module_kernel: 15
-        activation_type: 'swish'
+    use_cnn_module: True
-        pos_enc_layer_type: 'rel_pos'
+    activation_type: 'swish'
-        selfattention_layer_type: 'rel_selfattn'
+    pos_enc_layer_type: 'rel_pos'
-        causal: true
+    selfattention_layer_type: 'rel_selfattn'
-        use_dynamic_chunk: true
+    causal: true
-        cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+    use_dynamic_chunk: true
-        use_dynamic_left_chunk: false
+    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
 # decoder related
 decoder: transformer
 decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
 # hybrid CTC/attention
 model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
-    # decoder related
+###########################################
-    decoder: transformer
+#                   Data                  #
-    decoder_conf:
+###########################################
        attention_heads: 4
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        self_attention_dropout_rate: 0.0
        src_attention_dropout_rate: 0.0
-    # hybrid CTC/attention
+train_manifest: data/manifest.train
-    model_conf:
+dev_manifest: data/manifest.dev
-        ctc_weight: 0.3
+test_manifest: data/manifest.test
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
-data:
+###########################################
-  train_manifest: data/manifest.train
+#              Dataloader                 #
-  dev_manifest: data/manifest.dev
+###########################################
  test_manifest: data/manifest.test
 vocab_filepath: data/lang_char/vocab.txt 
 unit_type: 'char'
 augmentation_config: conf/preprocess.yaml
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 64
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-collator:
+###########################################
-  vocab_filepath: data/lang_char/vocab.txt 
+#                 training                #
-  unit_type: 'char'
+###########################################
-  augmentation_config: conf/preprocess.yaml
+n_epoch: 240 
-  feat_dim: 80
+accum_grad: 2
-  stride_ms: 10.0
+global_grad_clip: 5.0
-  window_ms: 25.0
+optim: adam
-  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+optim_conf:
-  batch_size: 64
+  lr: 0.002
-  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
+  weight_decay: 1e-6
-  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
+scheduler: warmuplr
-  minibatches: 0 # for debug
+scheduler_conf:
-  batch_count: auto
+  warmup_steps: 25000
-  batch_bins: 0 
+  lr_decay: 1.0
-  batch_frames_in: 0
+log_interval: 100
-  batch_frames_out: 0
+checkpoint:
-  batch_frames_inout: 0
+  kbest_n: 50
-  num_workers: 0
+  latest_n: 5
  subsampling_factor: 1
  num_encs: 1
 training:
  n_epoch: 240 
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.002
    weight_decay: 1e-6
  scheduler: warmuplr
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5
 decoding:
  beam_size: 10
  batch_size: 128
  error_rate_type: cer 
  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here. 
  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@ -1,7 +1,6 @@
 ############################################
 #           Network Architecture           #
 ############################################
 #model:
 cmvn_file: 
 cmvn_file_type: "json"
 # encoder related
@ -42,7 +41,6 @@ model_conf:
 ###########################################
 #                   Data                  #
 ###########################################
 #data:
 train_manifest: data/manifest.train
 dev_manifest: data/manifest.dev
 test_manifest: data/manifest.test
@ -50,11 +48,9 @@ test_manifest: data/manifest.test
 ###########################################
 #              Dataloader                 #
 ###########################################
 #collator:
 vocab_filepath: data/lang_char/vocab.txt 
 unit_type: 'char'
 augmentation_config: conf/preprocess.yaml
 spm_model_prefix: ''
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
@ -75,7 +71,6 @@ num_encs: 1
 ###########################################
 #                 training                #
 ###########################################
 #training:
 n_epoch: 240 
 accum_grad: 2
 global_grad_clip: 5.0
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@ -1,95 +1,84 @@
-# network architecture
+############################################
-model:
+#           Network Architecture           #
-    cmvn_file: 
+############################################
-    cmvn_file_type: "json"
+cmvn_file: 
-    # encoder related
+cmvn_file_type: "json"
-    encoder: transformer
+# encoder related
-    encoder_conf:
+encoder: transformer
-        output_size: 256    # dimension of attention
+encoder_conf:
-        attention_heads: 4
+    output_size: 256    # dimension of attention
-        linear_units: 2048  # the number of units of position-wise feed forward
+    attention_heads: 4
-        num_blocks: 12      # the number of encoder blocks
+    linear_units: 2048  # the number of units of position-wise feed forward
-        dropout_rate: 0.1
+    num_blocks: 12      # the number of encoder blocks
-        positional_dropout_rate: 0.1
+    dropout_rate: 0.1
-        attention_dropout_rate: 0.0
+    positional_dropout_rate: 0.1
-        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+    attention_dropout_rate: 0.0
-        normalize_before: true
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
-
+    normalize_before: true
-    # decoder related
+# decoder related
-    decoder: transformer
+decoder: transformer
-    decoder_conf:
+decoder_conf:
-        attention_heads: 4
+    attention_heads: 4
-        linear_units: 2048
+    linear_units: 2048
-        num_blocks: 6
+    num_blocks: 6
-        dropout_rate: 0.1
+    dropout_rate: 0.1
-        positional_dropout_rate: 0.1
+    positional_dropout_rate: 0.1
-        self_attention_dropout_rate: 0.0
+    self_attention_dropout_rate: 0.0
-        src_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false
 # hybrid CTC/attention
 model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
 ###########################################
 #                   Data                  #
 ###########################################
 # https://yaml.org/type/float.html
-data:
+train_manifest: data/manifest.train
-  train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
-  dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
  test_manifest: data/manifest.test
 collator:
  unit_type: 'char'
  vocab_filepath: data/lang_char/vocab.txt 
  feat_dim: 80
  stride_ms: 10.0
  window_ms: 25.0
  sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
  batch_size: 64 
  maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
  maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
  minibatches: 0 # for debug
  batch_count: auto
  batch_bins: 0 
  batch_frames_in: 0
  batch_frames_out: 0
  batch_frames_inout: 0
  augmentation_config: conf/preprocess.yaml 
  num_workers: 0
  subsampling_factor: 1
  num_encs: 1
 training:
  n_epoch: 240 
  accum_grad: 2
  global_grad_clip: 5.0
  optim: adam
  optim_conf:
    lr: 0.002
    weight_decay: 1e-6
  scheduler: warmuplr     
  scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
  log_interval: 100
  checkpoint:
    kbest_n: 50
    latest_n: 5
 ###########################################
 #              Dataloader                 #
 ###########################################
 unit_type: 'char'
 vocab_filepath: data/lang_char/vocab.txt 
 feat_dim: 80
 stride_ms: 10.0
 window_ms: 25.0
 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
 batch_size: 64 
 maxlen_in: 512  # if input length  > maxlen-in, batchsize is automatically reduced
 maxlen_out: 150  # if output length > maxlen-out, batchsize is automatically reduced
 minibatches: 0 # for debug
 batch_count: auto
 batch_bins: 0 
 batch_frames_in: 0
 batch_frames_out: 0
 batch_frames_inout: 0
 augmentation_config: conf/preprocess.yaml 
 num_workers: 0
 subsampling_factor: 1
 num_encs: 1
-decoding:
+###########################################
-  beam_size: 10
+#                 training                #
-  batch_size: 128
+###########################################
-  error_rate_type: cer 
+n_epoch: 240 
-  decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
+accum_grad: 2
-  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+global_grad_clip: 5.0
-  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+optim: adam
-      # <0: for decoding, use full chunk.
+optim_conf:
-      # >0: for decoding, use fixed chunk size as set.
+  lr: 0.002
-      # 0: used for training, it's prohibited here. 
+  weight_decay: 1e-6
-  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+scheduler: warmuplr     
-  simulate_streaming: False  # simulate streaming inference. Defaults to False.
+scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
 log_interval: 100
 checkpoint:
  kbest_n: 50
  latest_n: 5
--- a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
@ -0,0 +1,11 @@
 beam_size: 10
 decode_batch_size: 128
 error_rate_type: cer 
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
    # <0: for decoding, use full chunk.
    # >0: for decoding, use fixed chunk size as set.
    # 0: used for training, it's prohibited here. 
 num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/aishell/asr1/conf/tuning/decode.yaml
+++ b/examples/aishell/asr1/conf/tuning/decode.yaml
@ -1,4 +1,3 @@
 #decoding:
 beam_size: 10
 decode_batch_size: 128
 error_rate_type: cer 
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@ -6,7 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
-decode_conf_path=conf/decode.yaml
+decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
 audio_file=data/demo_01_03.wav