diff --git a/examples/librispeech/asr3/conf/hubertASR.yaml b/examples/librispeech/asr3/conf/hubertASR.yaml deleted file mode 100644 index e147815a8..000000000 --- a/examples/librispeech/asr3/conf/hubertASR.yaml +++ /dev/null @@ -1,142 +0,0 @@ -############################################ -# Network Architecture # -############################################ -freeze_hubert: False -normalize_wav: True -output_norm: True -init_type: kaiming_uniform # !Warning: need to convergence -enc: - input_shape: 1024 - dnn_blocks: 2 - dnn_neurons: 1024 - activation: True -ctc: - enc_n_units: 1024 - blank_id: 0 - dropout_rate: 0.0 -hubert_params_path: "exp/hubert/pd_hubert_no_fintune.pdparams" - - -task_cfg: - label_rate: 50.0 - sample_rate: 16000 - normalize: True - enable_padding: False - max_keep_size: None - max_sample_size: 250000 - min_sample_size: 32000 - single_target: False - random_crop: True - pad_audio: False - -model_cfg: - dropout_input: 0.0 - final_dropout: 0.0 - dropout: 0.0 - attention_dropout: 0.0 - activation_dropout: 0.1 - apply_mask: True - mask_length: 10 - mask_prob: 0.5 - mask_selection: static - mask_other: 0.0 - no_mask_overlap: False - mask_channel_length: 64 - mask_channel_prob: 0.25 - mask_channel_selection: static - mask_channel_other: 0.0 - no_mask_channel_overlap: False - feature_grad_mult: 0.0 - layerdrop: 0.1 - normalize: True - fp16: True - label_rate: 50 - extractor_mode: layer_norm - encoder_layers: 24 - encoder_embed_dim: 1024 - encoder_ffn_embed_dim: 4096 - encoder_attention_heads: 16 - activation_fn: gelu - encoder_layerdrop: 0.1 - dropout_features: 0.0 - final_dim: 768 - untie_final_proj: True - layer_norm_first: True - conv_feature_layers: "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" - conv_bias: False - logit_temp: 0.1 - target_glu: False - mask_min_space: 1 - mask_channel_min_space: 1 - conv_pos: 128 - conv_pos_groups: 16 - latent_temp: [2.0, 0.5, 0.999995] - skip_masked: False - skip_nomask: True - -########################################### -# Data # -########################################### -train_manifest: data/manifest.train-clean-100 -dev_manifest: data/manifest.dev -test_manifest: data/manifest.test-clean - -########################################### -# Dataloader # -########################################### -vocab_filepath: data/lang_char/vocab.txt -unit_type: char -mean_std_filepath: "" -preprocess_config: conf/preprocess.yaml -sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs 0: disabled other: enabled for other epochs -batch_size: 2 # Different batch_size may cause large differences in results -maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced -maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced -minibatches: 0 # for debug -batch_count: auto -batch_bins: 0 -batch_frames_in: 0 -batch_frames_out: 0 -batch_frames_inout: 0 -num_workers: 0 -subsampling_factor: 1 -num_encs: 1 -dist_sampler: True -shortest_first: True -return_lens_rate: True - -############################################ -# Data Augmentation # -############################################ -audio_augment: # for raw audio - sample_rate: 16000 - speeds: [95, 100, 105] - -########################################### -# Training # -########################################### -n_epoch: 3 -accum_grad: 8 -global_grad_clip: 5.0 -model_optim: adadelta -model_optim_conf: - lr: 1.0 - epsilon: 1.0e-6 - rho: 0.95 -model_scheduler: constantlr -model_scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 -hubert_optim: adadelta -hubert_optim_conf: - lr: 1.0 - epsilon: 1.0e-6 - rho: 0.95 -hubert_scheduler: constantlr -hubert_scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 -log_interval: 1 -checkpoint: - kbest_n: 50 - latest_n: 5 diff --git a/examples/librispeech/asr3/local/data.sh b/examples/librispeech/asr3/local/data.sh index edea3e19b..8495a4ab6 100755 --- a/examples/librispeech/asr3/local/data.sh +++ b/examples/librispeech/asr3/local/data.sh @@ -1,6 +1,6 @@ #!/bin/bash -stage=0 +stage=-1 stop_stage=100 unit_type=char diff --git a/examples/librispeech/asr3/local/test.sh b/examples/librispeech/asr3/local/test.sh index 0a5104f1c..ccc0d84de 100755 --- a/examples/librispeech/asr3/local/test.sh +++ b/examples/librispeech/asr3/local/test.sh @@ -31,7 +31,7 @@ python3 utils/format_rsl.py \ for type in ctc_greedy_search; do echo "decoding ${type}" - batch_size=8 + batch_size=16 python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ diff --git a/examples/librispeech/asr3/run.sh b/examples/librispeech/asr3/run.sh index c880c9cbf..e9ee47210 100755 --- a/examples/librispeech/asr3/run.sh +++ b/examples/librispeech/asr3/run.sh @@ -6,7 +6,7 @@ set -e gpus=0 stage=0 -stop_stage=4 +stop_stage=0 conf_path=conf/wav2vec2ASR.yaml ips= #xx.xx.xx.xx,xx.xx.xx.xx decode_conf_path=conf/tuning/decode.yaml diff --git a/examples/librispeech/asr4/conf/hubertASR.yaml b/examples/librispeech/asr4/conf/hubertASR.yaml index e147815a8..44c3d3e17 100644 --- a/examples/librispeech/asr4/conf/hubertASR.yaml +++ b/examples/librispeech/asr4/conf/hubertASR.yaml @@ -14,7 +14,7 @@ ctc: enc_n_units: 1024 blank_id: 0 dropout_rate: 0.0 -hubert_params_path: "exp/hubert/pd_hubert_no_fintune.pdparams" +hubert_params_path: "exp/hubert/hubert-large-lv60.pdparams" task_cfg: @@ -89,9 +89,9 @@ unit_type: char mean_std_filepath: "" preprocess_config: conf/preprocess.yaml sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs 0: disabled other: enabled for other epochs -batch_size: 2 # Different batch_size may cause large differences in results -maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced -maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced +batch_size: 4 # Different batch_size may cause large differences in results +maxlen_in: 1500 # if input length > maxlen-in batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out batchsize is automatically reduced minibatches: 0 # for debug batch_count: auto batch_bins: 0 @@ -129,7 +129,7 @@ model_scheduler_conf: lr_decay: 1.0 hubert_optim: adadelta hubert_optim_conf: - lr: 1.0 + lr: 0.95 epsilon: 1.0e-6 rho: 0.95 hubert_scheduler: constantlr diff --git a/examples/librispeech/asr4/run.sh b/examples/librispeech/asr4/run.sh index 47e71d60f..6d7dc6c96 100755 --- a/examples/librispeech/asr4/run.sh +++ b/examples/librispeech/asr4/run.sh @@ -6,7 +6,7 @@ set -e gpus=0 stage=0 -stop_stage=4 +stop_stage=0 conf_path=conf/hubertASR.yaml ips= #xx.xx.xx.xx,xx.xx.xx.xx decode_conf_path=conf/tuning/decode.yaml