parent 899236b328
commit fe2775150d
@ -0,0 +1,91 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: True
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    init_type: 'kaiming_uniform' # !Warning: needed for convergence
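# Sketch of how the weights above combine during training (added for reference,
# not part of the original file):
#   loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_attention
#        = 0.3 * loss_ctc + 0.7 * loss_attention
# with label smoothing (lsm_weight: 0.1) applied to the attention branch.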

###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512  # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
# Training #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
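# Sketch of the warmuplr schedule configured above (standard Noam-style warm-up,
# written here for reference, not taken from this file):
#   lr(step) = lr * warmup_steps^0.5 * min(step^-0.5, step * warmup_steps^-1.5)
# i.e. the rate rises roughly linearly over the first 25000 steps and then
# decays with the inverse square root of the step count.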
@ -0,0 +1,4 @@
process:
    # use raw audio
    - type: wav_process
      dither: 0.0
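# Note (descriptive comment, not in the original file): wav_process passes the
# raw waveform through as the model input, and dither: 0.0 disables the small
# random noise that would otherwise be added to the signal.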
@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
                        # <0: for decoding, use full chunk.
                        # >0: for decoding, use fixed chunk size as set.
                        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.
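# Reading the settings above (comment added for clarity): decoding here runs in
# simulated streaming mode, feeding the encoder fixed chunks of 16 (subsampled)
# frames with unrestricted left context (num_decoding_left_chunks: -1), instead
# of the whole utterance at once.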
@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
                        # <0: for decoding, use full chunk.
                        # >0: for decoding, use fixed chunk size as set.
                        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
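# Example (illustrative; the file names below are assumptions, not taken from
# this diff): a decode config such as this one is passed as the second argument
# of the local test script and forwarded to the test binary via --decode_cfg:
#   CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/conformer.yaml conf/tuning/decode.yaml exp/conformer/checkpoints/avg_30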
@ -0,0 +1,137 @@
############################################
# Network Architecture #
############################################
freeze_wav2vec2: False
normalize_wav: True
output_norm: True
dnn_blocks: 2
dnn_neurons: 1024
blank_id: 0
ctc_dropout_rate: 0.0
wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams

############################################
# Wav2Vec2.0 #
############################################
vocab_size: 32
hidden_size: 1024
num_hidden_layers: 24
num_attention_heads: 16
intermediate_size: 4096
hidden_act: gelu
hidden_dropout: 0.1
activation_dropout: 0.0
attention_dropout: 0.1
feat_proj_dropout: 0.1
feat_quantizer_dropout: 0.0
final_dropout: 0.0
layerdrop: 0.1
initializer_range: 0.02
layer_norm_eps: 1e-5
feat_extract_norm: layer
feat_extract_activation: gelu
conv_dim: [512, 512, 512, 512, 512, 512, 512]
conv_stride: [5, 2, 2, 2, 2, 2, 2]
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
conv_bias: True
num_conv_pos_embeddings: 128
num_conv_pos_embedding_groups: 16
do_stable_layer_norm: True
apply_spec_augment: False
mask_channel_length: 10
mask_channel_min_space: 1
mask_channel_other: 0.0
mask_channel_prob: 0.0
mask_channel_selection: static
mask_feature_length: 10
mask_feature_min_masks: 0
mask_feature_prob: 0.0
mask_time_length: 10
mask_time_min_masks: 2
mask_time_min_space: 1
mask_time_other: 0.0
mask_time_prob: 0.075
mask_time_selection: static
num_codevectors_per_group: 320
num_codevector_groups: 2
contrastive_logits_temperature: 0.1
num_negatives: 100
codevector_dim: 256
proj_codevector_dim: 256
diversity_loss_weight: 0.1
use_weighted_layer_sum: False
pad_token_id: 0
bos_token_id: 1
eos_token_id: 2
add_adapter: False
adapter_kernel_size: 3
adapter_stride: 2
num_adapter_layers: 3
output_hidden_size: None

###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test


###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
mean_std_filepath:
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 8 # Different batch_size may cause large differences in results
maxlen_in: 51200000000 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 1500000 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
dist_sampler: True
shortest_first: True
return_lens_rate: True


###########################################
# Training #
###########################################
n_epoch: 80
accum_grad: 1
global_grad_clip: 3.0
model_optim: adadelta
model_optim_conf:
    lr: 0.95
    epsilon: 1.0e-8
    rho: 0.95
wav2vec2_optim: adam
wav2vec2_optim_conf:
    lr: 0.0001
    epsilon: 1.0e-8

model_scheduler: newbobscheduler
model_scheduler_conf:
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 0
wav2vec2_scheduler: newbobscheduler
wav2vec2_scheduler_conf:
    improvement_threshold: 0.0025
    annealing_factor: 0.9
    patient: 0
log_interval: 1
checkpoint:
    kbest_n: 50
    latest_n: 5
augment: True
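# Comment added for clarity (a sketch, not part of the original config): the
# recipe trains with two optimizers, Adadelta for the CTC output head
# (model_optim) and Adam for the wav2vec2 encoder (wav2vec2_optim), each driven
# by a NewBob-style scheduler. Roughly, whenever the relative improvement of the
# validation metric falls below improvement_threshold:
#   lr = lr * annealing_factor    # 0.8 for the head, 0.9 for the wav2vec2 encoder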
@ -0,0 +1,95 @@
#!/bin/bash

stage=-1
stop_stage=100
dict_dir=data/lang_char

. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;

mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/aishell/aishell.py \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/aishell"

    if [ $? -ne 0 ]; then
        echo "Prepare Aishell failed. Terminated."
        exit 1
    fi

    for dataset in train dev test; do
        mv data/manifest.${dataset} data/manifest.${dataset}.raw
    done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # compute mean and stddev for normalizer
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
        --manifest_path="data/manifest.train.raw" \
        --spectrum_type="fbank" \
        --feat_dim=80 \
        --delta_delta=false \
        --stride_ms=10 \
        --window_ms=25 \
        --sample_rate=16000 \
        --use_dB_normalization=False \
        --num_samples=-1 \
        --num_workers=${num_workers} \
        --output_path="data/mean_std.json"

    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
        --unit_type="char" \
        --count_threshold=0 \
        --vocab_path="${dict_dir}/vocab.txt" \
        --manifest_paths "data/manifest.train.raw"

    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    for dataset in train dev test; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
            --cmvn_path "data/mean_std.json" \
            --unit_type "char" \
            --vocab_path="${dict_dir}/vocab.txt" \
            --manifest_path="data/manifest.${dataset}.raw" \
            --output_path="data/manifest.${dataset}"

        if [ $? -ne 0 ]; then
            echo "Format manifest failed. Terminated."
            exit 1
        fi
    } &
    done
    wait
fi
echo "Aishell data preparation done."

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    mkdir -p exp/wav2vec2
    echo "Pretrained wav2vec2 model download"
    wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams
fi

exit 0
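# Example usage (illustrative; assumes MAIN_ROOT is set, e.g. by sourcing path.sh
# as run.sh does before calling this script):
#   bash ./local/data.sh                          # full pipeline: download, CMVN, vocab, manifests, pretrained model
#   bash ./local/data.sh --stage 1 --stop_stage 2 # only rebuild the vocabulary and formatted manifests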
@ -0,0 +1,84 @@
#!/bin/bash

set -e

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

expdir=exp
datadir=data

train_set=train_960
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"

config_path=$1
decode_config_path=$2
ckpt_prefix=$3

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

python3 utils/format_rsl.py \
    --origin_ref data/manifest.test-clean.raw \
    --trans_ref data/manifest.test-clean.text

for type in ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=16
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${type} \
        --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
    python3 utils/format_rsl.py \
        --origin_hyp ${ckpt_prefix}.${type}.rsl \
        --trans_hyp ${ckpt_prefix}.${type}.rsl.text

    python3 utils/compute-wer.py --char=1 --v=1 \
        data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
    echo "decoding ${type} done."
done

for type in ctc_prefix_beam_search; do
    echo "decoding ${type}"
    batch_size=1
    python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${type} \
        --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
    python3 utils/format_rsl.py \
        --origin_hyp ${ckpt_prefix}.${type}.rsl \
        --trans_hyp ${ckpt_prefix}.${type}.rsl.text

    python3 utils/compute-wer.py --char=1 --v=1 \
        data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
    echo "decoding ${type} done."
done

echo "Finished"

exit 0
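# Example usage (illustrative; this appears to be the local/test.sh that run.sh,
# later in this diff, invokes at stage 3):
#   CUDA_VISIBLE_DEVICES=0 ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1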
@ -0,0 +1,58 @@
#!/bin/bash

if [ $# != 4 ];then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
    exit 1
fi

if [ ! -f ${audio_file} ]; then
    echo "Please input the right audio_file path"
    exit 1
fi

chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

for type in ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test_wav.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${output_dir}/${type}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${type} \
        --opts decode.decode_batch_size ${batch_size} \
        --audio_file ${audio_file}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
done
exit 0
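# Example usage (illustrative, mirroring stage 4 of run.sh later in this diff):
#   CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav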
@ -0,0 +1,56 @@
#!/bin/bash

if [ $# -lt 2 ] || [ $# -gt 3 ];then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_name=$2
ips=$3

if [ ! $ips ];then
    ips_config=
else
    ips_config="--ips="${ips}
fi

mkdir -p exp

# seed may break model convergence
seed=1998
if [ ${seed} != 0 ]; then
    export FLAGS_cudnn_deterministic=True
fi

# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
# export FLAGS_allocator_strategy=naive_best_fit

if [ ${ngpu} == 0 ]; then
    python3 -u ${BIN_DIR}/train.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --output exp/${ckpt_name} \
        --seed ${seed}
else
    python3 -m paddle.distributed.launch --log_dir=aa --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --output exp/${ckpt_name} \
        --seed ${seed}
fi

if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi

if [ $? -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi

exit 0
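# Example usage (illustrative, mirroring stage 1 of run.sh later in this diff;
# the trailing ips argument is only needed for multi-node training):
#   CUDA_VISIBLE_DEVICES=0,1 ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR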
@ -0,0 +1,15 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/


MODEL=wav2vec2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
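# Note (added for clarity): run.sh sources this file before calling the local/*.sh
# scripts, so MAIN_ROOT, PYTHONPATH and BIN_DIR are available to them; BIN_DIR
# points at the wav2vec2 train/test entry points under paddlespeech/s2t/exps/wav2vec2/bin.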
@ -0,0 +1,46 @@
#!/bin/bash
set -e

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

gpus=6
stage=1
stop_stage=1
conf_path=conf/wav2vec2ASR.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

audio_file=data/demo_002_en.wav

avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # greedy search decoder
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
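# Example usage (illustrative): with the defaults above only the training stage
# runs, on GPU 6; parse_options.sh lets the stages and GPUs be overridden from
# the command line, e.g.
#   bash run.sh --gpus 0 --stage 0 --stop_stage 4    # data prep through single-wav test
#   bash run.sh --gpus 0,1 --stage 2 --stop_stage 3  # average checkpoints, then decode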
@ -0,0 +1 @@
../../../utils