wav2vec2_zh train work

pull/2545/head
zhangtianhao 3 years ago
parent 899236b328
commit fe2775150d

@@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthread>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
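# For instance, "run.pl JOB=1:4 exp/echo.JOB.log echo JOB" runs 4 jobs in parallel
# and writes exp/echo.1.log ... exp/echo.4.log (paths here are illustrative).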
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
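# For reference, a typical Kaldi-style conf/queue.conf maps the unified options
# to SGE flags roughly like this (a sketch; adjust it to your own cluster):
#   command qsub -v PATH -cwd -S /bin/bash -j y
#   option mem=* -l mem_free=$0,ram_free=$0
#   option num_threads=* -pe smp $0
#   option gpu=* -l gpu=$0 -q g.q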
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# Used for jobs other than "*_train.py"/"*_recog.py"
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the host to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# Assuming you can log in to them without a password, i.e. you have to set up ssh keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi

@@ -0,0 +1,91 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: True
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    init_type: 'kaiming_uniform' # !Warning: needed for convergence
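# The hybrid objective is: loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att,
# so ctc_weight: 0.3 weights the attention decoder more heavily than CTC.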
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen_in, batchsize is automatically reduced
maxlen_out: 150  # if output length > maxlen_out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5

@@ -0,0 +1,4 @@
process:
    # use raw audio
    - type: wav_process
      dither: 0.0
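# dither: 0.0 adds no random noise, so the untouched waveform is what the
# wav2vec2 front end consumes (waveform normalization, if enabled, happens in the model).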

@@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.

@@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@@ -0,0 +1,137 @@
############################################
# Network Architecture #
############################################
freeze_wav2vec2: False
normalize_wav: True
output_norm: True
dnn_blocks: 2
dnn_neurons: 1024
blank_id: 0
ctc_dropout_rate: 0.0
wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams
############################################
# Wav2Vec2.0 #
############################################
vocab_size: 32
hidden_size: 1024
num_hidden_layers: 24
num_attention_heads: 16
intermediate_size: 4096
hidden_act: gelu
hidden_dropout: 0.1
activation_dropout: 0.0
attention_dropout: 0.1
feat_proj_dropout: 0.1
feat_quantizer_dropout: 0.0
final_dropout: 0.0
layerdrop: 0.1
initializer_range: 0.02
layer_norm_eps: 1e-5
feat_extract_norm: layer
feat_extract_activation: gelu
conv_dim: [512, 512, 512, 512, 512, 512, 512]
conv_stride: [5, 2, 2, 2, 2, 2, 2]
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
conv_bias: True
num_conv_pos_embeddings: 128
num_conv_pos_embedding_groups: 16
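# Note: conv_stride multiplies out to 5*2*2*2*2*2*2 = 320, so the CNN front end
# emits one latent frame per 320 samples, i.e. every 20 ms of 16 kHz audio.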
do_stable_layer_norm: True
apply_spec_augment: False
mask_channel_length: 10
mask_channel_min_space: 1
mask_channel_other: 0.0
mask_channel_prob: 0.0
mask_channel_selection: static
mask_feature_length: 10
mask_feature_min_masks: 0
mask_feature_prob: 0.0
mask_time_length: 10
mask_time_min_masks: 2
mask_time_min_space: 1
mask_time_other: 0.0
mask_time_prob: 0.075
mask_time_selection: static
num_codevectors_per_group: 320
num_codevector_groups: 2
contrastive_logits_temperature: 0.1
num_negatives: 100
codevector_dim: 256
proj_codevector_dim: 256
diversity_loss_weight: 0.1
use_weighted_layer_sum: False
pad_token_id: 0
bos_token_id: 1
eos_token_id: 2
add_adapter: False
adapter_kernel_size: 3
adapter_stride: 2
num_adapter_layers: 3
output_hidden_size: None
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
mean_std_filepath:
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 8  # Different batch_size may cause large differences in results
maxlen_in: 51200000000  # if input length > maxlen_in, batchsize is automatically reduced
maxlen_out: 1500000  # if output length > maxlen_out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
dist_sampler: True
shortest_first: True
return_lens_rate: True
###########################################
# Training #
###########################################
n_epoch: 80
accum_grad: 1
global_grad_clip: 3.0
model_optim: adadelta
model_optim_conf:
    lr: 0.95
    epsilon: 1.0e-8
    rho: 0.95
wav2vec2_optim: adam
wav2vec2_optim_conf:
    lr: 0.0001
    epsilon: 1.0e-8
model_scheduler: newbobscheduler
model_scheduler_conf:
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0
wav2vec2_scheduler: newbobscheduler
wav2vec2_scheduler_conf:
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0
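# With newbobscheduler, the lr is multiplied by annealing_factor whenever the
# relative improvement of the validation loss falls below improvement_threshold;
# patient delays the cut by that many violations (see NewBobScheduler below).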
log_interval: 1
checkpoint:
    kbest_n: 50
    latest_n: 5
augment: True

@@ -0,0 +1,95 @@
#!/bin/bash
stage=-1
stop_stage=100
dict_dir=data/lang_char
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"
if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1
fi
for dataset in train dev test; do
mv data/manifest.${dataset} data/manifest.${dataset}.raw
done
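# Each manifest.* file is in jsonlines format: one JSON record per utterance,
# typically carrying the audio path, duration/shape and transcript fields
# (exact keys depend on the PaddleSpeech version).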
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done
wait
fi
echo "Aishell data preparation done."
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
mkdir -p exp/wav2vec2
echo "Pretrained wav2vec2 model download"
wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams
fi
exit 0

@@ -0,0 +1,84 @@
#!/bin/bash
set -e
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
expdir=exp
datadir=data
train_set=train_960
# recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
python3 utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
for type in ctc_greedy_search; do
echo "decoding ${type}"
batch_size=16
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
for type in ctc_prefix_beam_search; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
echo "Finished"
exit 0

@@ -0,0 +1,58 @@
#!/bin/bash
if [ $# != 4 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Plase input the right audio_file path"
exit 1
fi
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in ctc_greedy_search; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0

@@ -0,0 +1,56 @@
#!/bin/bash
if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ -z "$ips" ];then
ips_config=
else
ips_config="--ips="${ips}
fi
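# For multi-node training pass a comma-separated host list, e.g. (hypothetical
# addresses) ips="192.168.1.10,192.168.1.11"; each listed node then joins the
# paddle.distributed.launch job with its own visible GPUs.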
mkdir -p exp
# seed may break model convergence
seed=1998
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
# export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --log_dir=aa --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--seed ${seed}
fi
st=$?  # capture the training exit status before it is clobbered below
if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi
if [ ${st} -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi
exit 0

@@ -0,0 +1,15 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=wav2vec2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@@ -0,0 +1,46 @@
#!/bin/bash
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
gpus=6
stage=1
stop_stage=1
conf_path=conf/wav2vec2ASR.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
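# parse_options.sh exposes the variables above as flags, so the recipe can be
# driven like: bash run.sh --stage 0 --stop_stage 4 --conf_path conf/wav2vec2ASR.yaml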
audio_file=data/demo_002_en.wav
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# greedy search decoder
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

@@ -0,0 +1 @@
../../../utils

@@ -19,6 +19,7 @@ import time
 from collections import defaultdict
 from collections import OrderedDict
 from contextlib import nullcontext
+import re
 
 import jsonlines
 import numpy as np
@@ -62,6 +63,20 @@ class Wav2Vec2ASRTrainer(Trainer):
         self.avg_train_loss -= self.avg_train_loss / (batch_index + 1)
         self.avg_train_loss += loss / (batch_index + 1)
 
+    def before_train(self):
+        from_scratch = self.resume_or_scratch()
+        if from_scratch:
+            # scratch: save init model, i.e. 0 epoch
+            self.save(tag='init', infos=None)
+        else:
+            # resume: train next_epoch and next_iteration
+            self.epoch += 1
+            self.iteration += 1
+            logger.info(
+                f"Resume train: epoch {self.epoch}, step {self.iteration}!")
+
+        self.maybe_batch_sampler_step()
+
     def train_batch(self, batch_index, batch, msg):
         train_conf = self.config
         start = time.time()
@@ -97,9 +112,16 @@ class Wav2Vec2ASRTrainer(Trainer):
             # optimizer step old
             if (batch_index + 1) % train_conf.accum_grad == 0:
-                self.optimizer.step()
-                self.optimizer.clear_grad()
-                self.lr_scheduler.step()
+                self.model_optimizer.step()
+                self.model_optimizer.clear_grad()
+                if not train_conf.freeze_wav2vec2:
+                    self.wav2vec2_optimizer.step()
+                    self.wav2vec2_optimizer.clear_grad()
+                if self.config.model_scheduler != 'newbobscheduler':
+                    self.model_lr_scheduler.step()
+                if self.config.wav2vec2_scheduler != 'newbobscheduler':
+                    if not train_conf.freeze_wav2vec2:
+                        self.wav2vec2_lr_scheduler.step()
                 self.iteration += 1
 
         losses_np = {'loss': self.avg_train_loss * train_conf.accum_grad}
@@ -113,7 +135,7 @@ class Wav2Vec2ASRTrainer(Trainer):
         if (batch_index + 1) % train_conf.accum_grad == 0:
             if dist.get_rank() == 0 and self.visualizer:
                 losses_np_v = losses_np.copy()
-                losses_np_v.update({"lr": self.lr_scheduler()})
+                losses_np_v.update({"model_lr": self.model_lr_scheduler(), "wav2vec2_lr": self.wav2vec2_lr_scheduler()})
                 for key, val in losses_np_v.items():
                     self.visualizer.add_scalar(
                         tag='train/' + key, value=val, step=self.iteration - 1)
@@ -134,7 +156,7 @@ class Wav2Vec2ASRTrainer(Trainer):
                 wav = wav[:, :, 0]
                 loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
 
-                if paddle.isfinite(loss):
+                if math.isfinite(float(loss)):
                     num_utts = batch[1].shape[0]
                     num_seen_utts += num_utts
                     total_loss += float(loss) * num_utts
@@ -159,6 +181,105 @@ class Wav2Vec2ASRTrainer(Trainer):
                         dist.get_rank(), total_loss / num_seen_utts))
         return total_loss, num_seen_utts
 
+    @mp_tools.rank_zero_only
+    def save(self, tag=None, infos: dict=None):
+        """Save checkpoint (model parameters and optimizer states).
+
+        Args:
+            tag (int or str, optional): None for step, else using tag, e.g. epoch. Defaults to None.
+            infos (dict, optional): meta data to save. Defaults to None.
+        """
+        infos = infos if infos else dict()
+        infos.update({
+            "step": self.iteration,
+            "epoch": self.epoch,
+            "model_lr": self.model_optimizer.get_lr(),
+            "wav2vec2_lr": self.wav2vec2_optimizer.get_lr()
+        })
+        checkpoint_path = os.path.join(self.checkpoint_dir,
+                                       "{}".format(self.iteration
+                                                   if tag is None else tag))
+        model_dict = self.model.state_dict()
+        params_path = checkpoint_path + ".pdparams"
+        paddle.save(model_dict, params_path)
+        logger.info("Saved model to {}".format(params_path))
+
+        model_opt_dict = self.model_optimizer.state_dict()
+        wav2vec2_opt_dict = self.wav2vec2_optimizer.state_dict()
+        opt_dict = {'model': model_opt_dict, 'wav2vec2': wav2vec2_opt_dict}
+        optimizer_path = checkpoint_path + ".pdopt"
+        paddle.save(opt_dict, optimizer_path)
+        logger.info("Saved optimizer state to {}".format(optimizer_path))
+
+        scheduler_dict = {}
+        if self.config.model_scheduler == 'newbobscheduler':
+            scheduler_dict['model'] = self.model_lr_scheduler.save()
+        if self.config.wav2vec2_scheduler == 'newbobscheduler':
+            scheduler_dict['wav2vec2'] = self.wav2vec2_lr_scheduler.save()
+        if scheduler_dict:
+            scheduler_path = checkpoint_path + ".pdlrs"
+            paddle.save(scheduler_dict, scheduler_path)
+            logger.info("Saved scheduler state to {}".format(scheduler_path))
+
+        info_path = re.sub(r'\.pdparams$', '.json', params_path)
+        with open(info_path, 'w') as fout:
+            data = json.dumps(infos)
+            fout.write(data)
+
+    def resume_or_scratch(self):
+        """Resume from the latest checkpoint in the output directory, or
+        load a specified checkpoint.
+
+        If ``args.checkpoint_path`` is not None, load that checkpoint;
+        otherwise resume training from the latest one.
+        """
+        scratch = None
+        infos = self.checkpoint.load_latest_parameters(
+            self.model,
+            checkpoint_dir=self.checkpoint_dir,
+            checkpoint_path=self.args.checkpoint_path)
+        if infos:
+            # just restore ckpt
+            # lr will be restored from the optimizer ckpt
+            self.iteration = infos["step"]
+            self.epoch = infos["epoch"]
+
+            # restore optimizers from *.pdopt
+            optimizer_path = os.path.join(self.checkpoint_dir,
+                                          "{}".format(self.epoch)) + '.pdopt'
+            optimizer_dict = paddle.load(optimizer_path)
+            self.model_optimizer.set_state_dict(optimizer_dict['model'])
+            self.wav2vec2_optimizer.set_state_dict(optimizer_dict['wav2vec2'])
+
+            # restore lr_schedulers from *.pdlrs
+            scheduler_path = os.path.join(self.checkpoint_dir,
+                                          "{}".format(self.epoch)) + '.pdlrs'
+            if os.path.isfile(scheduler_path):
+                scheduler_dict = paddle.load(scheduler_path)
+                if self.config.model_scheduler == 'newbobscheduler':
+                    self.model_lr_scheduler.load(scheduler_dict['model'])
+                if self.config.wav2vec2_scheduler == 'newbobscheduler':
+                    self.wav2vec2_lr_scheduler.load(scheduler_dict['wav2vec2'])
+            scratch = False
+            logger.info(
+                f"Restore ckpt: epoch {self.epoch}, step {self.iteration}!")
+        else:
+            self.iteration = 0
+            self.epoch = 0
+            scratch = True
+            logger.info("Init from scratch!")
+        return scratch
+
     def do_train(self):
         """The training process control by step."""
         # !!!IMPORTANT!!!
@@ -186,7 +307,8 @@ class Wav2Vec2ASRTrainer(Trainer):
                 report("Rank", dist.get_rank())
                 report("epoch", self.epoch)
                 report('step', self.iteration)
-                report("lr", self.lr_scheduler())
+                report("model_lr", self.model_lr_scheduler())
+                report("wav2vec2_lr", self.wav2vec2_lr_scheduler())
                 self.train_batch(batch_index, batch, msg)
                 self.after_train_batch()
                 report('iter', batch_index + 1)
@@ -224,15 +346,21 @@ class Wav2Vec2ASRTrainer(Trainer):
                     cv_loss = float(cv_loss)
                 else:
                     cv_loss = total_loss / num_seen_utts
 
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
                 self.visualizer.add_scalar(
                     tag='eval/cv_loss', value=cv_loss, step=self.epoch)
                 self.visualizer.add_scalar(
-                    tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+                    tag='eval/model_lr', value=self.model_lr_scheduler(), step=self.epoch)
+                self.visualizer.add_scalar(
+                    tag='eval/wav2vec2_lr', value=self.wav2vec2_lr_scheduler(), step=self.epoch)
+
+            if self.config.model_scheduler == 'newbobscheduler':
+                self.model_lr_scheduler.step(cv_loss)
+            if self.config.wav2vec2_scheduler == 'newbobscheduler':
+                if not self.config.freeze_wav2vec2:
+                    self.wav2vec2_lr_scheduler.step(cv_loss)
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
             self.new_epoch()
@@ -283,46 +411,56 @@ class Wav2Vec2ASRTrainer(Trainer):
             return
 
         train_config = config
-        optim_type = train_config.model_optim
-        optim_conf = train_config.model_optim_conf
-        scheduler_type = train_config.scheduler
-        scheduler_conf = train_config.scheduler_conf
-        scheduler_args = {
-            "learning_rate": optim_conf.lr,
-            "verbose": False,
-            "warmup_steps": scheduler_conf.warmup_steps,
-            "gamma": scheduler_conf.lr_decay,
-            "d_model": model_conf.dnn_neurons,
-        }
-        lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
-                                                    scheduler_args)
+        model_optim_type = train_config.model_optim
+        model_optim_conf = train_config.model_optim_conf
+        wav2vec2_optim_type = train_config.wav2vec2_optim
+        wav2vec2_optim_conf = train_config.wav2vec2_optim_conf
+        model_scheduler_type = train_config.model_scheduler
+        model_scheduler_conf = train_config.model_scheduler_conf
+        wav2vec2_scheduler_type = train_config.wav2vec2_scheduler
+        wav2vec2_scheduler_conf = train_config.wav2vec2_scheduler_conf
+
+        model_scheduler_args = dict(**{
+            "learning_rate": model_optim_conf.lr,
+            "verbose": False}, **(dict(model_scheduler_conf)))
+        wav2vec2_scheduler_args = dict(**{
+            "learning_rate": wav2vec2_optim_conf.lr,
+            "verbose": False}, **(dict(wav2vec2_scheduler_conf)))
+
+        model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type,
+                                                          model_scheduler_args)
+        wav2vec2_lr_scheduler = LRSchedulerFactory.from_args(
+            wav2vec2_scheduler_type, wav2vec2_scheduler_args)
 
         def optimizer_args(
                 config,
+                optim_type,
+                optim_conf,
                 parameters,
                 lr_scheduler=None, ):
             train_config = config
-            optim_type = train_config.model_optim
-            optim_conf = train_config.model_optim_conf
-            scheduler_type = train_config.scheduler
-            scheduler_conf = train_config.scheduler_conf
-            return {
-                "grad_clip": train_config.global_grad_clip,
-                "learning_rate": lr_scheduler
-                if lr_scheduler else optim_conf.lr,
-                "epsilon": optim_conf.epsilon,
-                "rho": optim_conf.rho,
-                "parameters": parameters,
-                "beta1": 0.9 if optim_type == 'noam' else None,
-                "beat2": 0.98 if optim_type == 'noam' else None,
-            }
+            optim_arg = dict(optim_conf)
+            optim_arg.update({
+                "grad_clip": train_config.global_grad_clip,
+                "learning_rate": lr_scheduler
+                if lr_scheduler else optim_conf.lr,
+                "parameters": parameters})
+            return optim_arg
 
-        optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
-        optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
+        model_optimizer_args = optimizer_args(
+            config, model_optim_type, model_optim_conf,
+            [*model.enc.parameters(), *model.ctc.parameters()],
+            model_lr_scheduler)
+        wav2vec2_optimizer_args = optimizer_args(
+            config, wav2vec2_optim_type, wav2vec2_optim_conf,
+            model.wav2vec2.parameters(), wav2vec2_lr_scheduler)
+        model_optimizer = OptimizerFactory.from_args(model_optim_type,
+                                                     model_optimizer_args)
+        wav2vec2_optimizer = OptimizerFactory.from_args(wav2vec2_optim_type,
+                                                        wav2vec2_optimizer_args)
 
-        self.optimizer = optimizer
-        self.lr_scheduler = lr_scheduler
+        self.model_optimizer = model_optimizer
+        self.wav2vec2_optimizer = wav2vec2_optimizer
+        self.model_lr_scheduler = model_lr_scheduler
+        self.wav2vec2_lr_scheduler = wav2vec2_lr_scheduler
         logger.info("Setup optimizer/lr_scheduler!")

@@ -1177,10 +1177,6 @@ class Wav2Vec2ConfigPure():
         self.proj_codevector_dim = config.proj_codevector_dim
         self.diversity_loss_weight = config.diversity_loss_weight
 
-        # ctc loss
-        self.ctc_loss_reduction = config.ctc_loss_reduction
-        self.ctc_zero_infinity = config.ctc_zero_infinity
-
         # adapter
         self.add_adapter = config.add_adapter
         self.adapter_kernel_size = config.adapter_kernel_size

@@ -17,6 +17,7 @@ from typing import Dict
 from typing import Text
 from typing import Union
 
+import paddle
 from paddle.optimizer.lr import LRScheduler
 from typeguard import check_argument_types
@@ -106,6 +107,127 @@ class ConstantLR(LRScheduler):
     def get_lr(self):
         return self.base_lr
 
+
+@register_scheduler
+class NewBobScheduler(LRScheduler):
+    """Scheduler with the new-bob technique, used for LR annealing.
+
+    The learning rate is annealed based on the validation performance.
+    In particular: if (past_loss - current_loss) / past_loss < impr_threshold:
+    lr = lr * annealing_factor.
+
+    Arguments
+    ---------
+    learning_rate : float
+        The initial learning rate.
+    annealing_factor : float
+        The annealing factor used in the new-bob strategy.
+    improvement_threshold : float
+        The improvement rate between losses used to perform learning
+        annealing in the new-bob strategy.
+    patient : int
+        When the annealing condition is violated patient times,
+        the learning rate is finally reduced.
+
+    Example
+    -------
+    >>> scheduler = NewBobScheduler(learning_rate=1.0)
+    >>> scheduler.get_lr(metric_value=10.0)
+    1.0
+    >>> scheduler.get_lr(metric_value=2.0)
+    1.0
+    >>> scheduler.get_lr(metric_value=2.5)
+    0.5
+    """
+
+    def __init__(
+            self,
+            learning_rate,
+            last_epoch=-1,
+            verbose=False,
+            annealing_factor=0.5,
+            improvement_threshold=0.0025,
+            patient=0, ):
+        self.hyperparam_value = learning_rate
+        self.annealing_factor = annealing_factor
+        self.improvement_threshold = improvement_threshold
+        self.patient = patient
+        self.metric_values = []
+        self.current_patient = self.patient
+        super().__init__(learning_rate, last_epoch, verbose)
+
+    def step(self, metric_value=None):
+        """``step`` should be called after ``optimizer.step``. It updates the
+        learning rate from the latest metric value; the new learning rate
+        takes effect on the next ``optimizer.step``.
+
+        Args:
+            metric_value (float, None): the metric (e.g. validation loss) used
+                to decide whether to anneal. If None, the rate is unchanged.
+
+        Returns:
+            None
+        """
+        if metric_value is None:
+            self.last_epoch += 1
+            self.last_lr = self.hyperparam_value
+        else:
+            self.last_epoch += 1
+            self.last_lr = self.get_lr(metric_value)
+
+        if self.verbose:
+            print('Epoch {}: {} set learning rate to {}.'.format(
+                self.last_epoch, self.__class__.__name__, self.last_lr))
+
+    def get_lr(self, metric_value):
+        """Returns the new value for the hyperparameter.
+
+        Arguments
+        ---------
+        metric_value : float
+            A number for determining whether to change the hyperparameter value.
+        """
+        new_value = self.hyperparam_value
+        if len(self.metric_values) > 0:
+            prev_metric = self.metric_values[-1]
+            # Update value if improvement too small and patience is 0
+            if prev_metric == 0:  # Prevent division by zero
+                improvement = 0
+            else:
+                improvement = (prev_metric - metric_value) / prev_metric
+            if improvement < self.improvement_threshold:
+                if self.current_patient == 0:
+                    new_value *= self.annealing_factor
+                    self.current_patient = self.patient
+                else:
+                    self.current_patient -= 1
+
+        # Store relevant info
+        self.metric_values.append(metric_value)
+        self.hyperparam_value = new_value
+
+        return new_value
+
+    def save(self):
+        """Returns the scheduler state for checkpointing."""
+        data = {
+            "current_epoch_index": self.last_epoch,
+            "hyperparam_value": self.hyperparam_value,
+            "metric_values": self.metric_values,
+            "current_patient": self.current_patient
+        }
+        return data
+
+    def load(self, data):
+        """Loads the needed information (a state dict, or a path to one)."""
+        if isinstance(data, str):
+            data = paddle.load(data)
+        self.last_epoch = data["current_epoch_index"]
+        self.hyperparam_value = data["hyperparam_value"]
+        self.metric_values = data["metric_values"]
+        self.current_patient = data["current_patient"]
+
+
 def dynamic_import_scheduler(module):
     """Import Scheduler class dynamically.
