From 6f78526e3e817456d89f000745056b4e9e9ead60 Mon Sep 17 00:00:00 2001
From: zxcd <228587199@qq.com>
Date: Thu, 20 Apr 2023 11:32:08 +0000
Subject: [PATCH] add hubert pretrain config

---
 .../asr3/conf/hubertASR_base_pretrain.yaml    | 135 ++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 examples/librispeech/asr3/conf/hubertASR_base_pretrain.yaml

diff --git a/examples/librispeech/asr3/conf/hubertASR_base_pretrain.yaml b/examples/librispeech/asr3/conf/hubertASR_base_pretrain.yaml
new file mode 100644
index 000000000..41b28f69b
--- /dev/null
+++ b/examples/librispeech/asr3/conf/hubertASR_base_pretrain.yaml
@@ -0,0 +1,135 @@
+############################################
+#          Network Architecture           #
+############################################
+freeze_hubert: True
+normalize_wav: True
+output_norm: True
+init_type: kaiming_uniform # Warning: this initializer is needed for convergence
+enc:
+  input_shape: 1024
+  dnn_blocks: 2
+  dnn_neurons: 1024
+  activation: True
+ctc:
+  enc_n_units: 1024
+  blank_id: 0
+  dropout_rate: 0.0
+hubert_params_path: "exp/hubert/hubert_base_model_paddle.pdparams"
+km_path: "exp/hubert/hubert_base_ls960_L9_km500.bin"
+
+
+task_cfg:
+  sample_rate: 16000
+  #pretrain: True
+
+model_cfg:
+  dropout_input: 0.1
+  final_dropout: 0.0
+  dropout: 0.1
+  attention_dropout: 0.1
+  activation_dropout: 0.0
+  apply_mask: False
+  mask_length: 10
+  mask_prob: 0.5
+  mask_selection: static
+  mask_other: 0.0
+  no_mask_overlap: False
+  mask_channel_length: 10
+  mask_channel_prob: 0.0
+  mask_channel_selection: static
+  mask_channel_other: 0.0
+  no_mask_channel_overlap: False
+  freeze_finetune_updates: 10000
+  feature_grad_mult: 0.1
+  layerdrop: 0.1
+  normalize: True
+  fp16: True
+  label_rate: 50
+  extractor_mode: default
+  encoder_layers: 12
+  encoder_embed_dim: 768
+  encoder_ffn_embed_dim: 3072
+  encoder_attention_heads: 12
+  activation_fn: gelu
+  encoder_layerdrop: 0.05
+  dropout_features: 0.1
+  final_dim: 256
+  untie_final_proj: False
+  layer_norm_first: False
+  conv_feature_layers: "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"
+  conv_bias: False
+  logit_temp: 0.1
+  target_glu: False
+  mask_min_space: 1
+  mask_channel_min_space: 1
+  conv_pos: 128
+  conv_pos_groups: 16
+  latent_temp: [2.0, 0.5, 0.999995]
+  skip_masked: False
+  skip_nomask: False
+
+###########################################
+#                   Data                  #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
+
+###########################################
+#              Dataloader                 #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt 
+unit_type: char
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for that number of epochs
+batch_size: 1  # Different batch_size may cause large differences in results
+maxlen_in: 51200000000  # if input length > maxlen_in, batch size is automatically reduced
+maxlen_out: 1500000  # if output length > maxlen_out, batch size is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+dist_sampler: True
+shortest_first: True
+return_lens_rate: True
+
+############################################
+#             Data Augmentation            #
+############################################
+audio_augment:  # for raw audio 
+  sample_rate: 16000
+
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 1
+accum_grad: 1
+global_grad_clip: 5.0
+model_optim: adadelta
+model_optim_conf:
+  lr: 1.0
+  epsilon: 1.0e-6
+  rho: 0.95
+model_scheduler: constantlr    
+model_scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+hubert_optim: adadelta
+hubert_optim_conf:
+  lr: 0.9
+  epsilon: 1.0e-6
+  rho: 0.95
+hubert_scheduler: constantlr    
+hubert_scheduler_conf:
+  warmup_steps: 25000
+  lr_decay: 1.0
+log_interval: 1
+checkpoint:
+  kbest_n: 50
+  latest_n: 5