wav2vec2_zh train work

pull/2545/head
zhangtianhao 3 years ago
parent 899236b328
commit fe2775150d

@ -0,0 +1,89 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# -max-jobs-run <njob>: Limit the number parallel jobs. This is ignored for non-array jobs.
# --num-threads <ngpu>: Specify the number of CPU core.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
# These options are mapping to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================~
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# The other usage
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the host to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# Assuming you can login them without any password, i.e. You have to set ssh keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi

@ -0,0 +1,91 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: True
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
init_type: 'kaiming_uniform' # !Warning: need to convergence
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5

@ -0,0 +1,4 @@
process:
# use raw audio
- type: wav_process
dither: 0.0

@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.

@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -0,0 +1,137 @@
############################################
# Network Architecture #
############################################
freeze_wav2vec2: False
normalize_wav: True
output_norm: True
dnn_blocks: 2
dnn_neurons: 1024
blank_id: 0
ctc_dropout_rate: 0.0
wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams
############################################
# Wav2Vec2.0 #
############################################
vocab_size: 32
hidden_size: 1024
num_hidden_layers: 24
num_attention_heads: 16
intermediate_size: 4096
hidden_act: gelu
hidden_dropout: 0.1
activation_dropout: 0.0
attention_dropout: 0.1
feat_proj_dropout: 0.1
feat_quantizer_dropout: 0.0
final_dropout: 0.0
layerdrop: 0.1
initializer_range: 0.02
layer_norm_eps: 1e-5
feat_extract_norm: layer
feat_extract_activation: gelu
conv_dim: [512, 512, 512, 512, 512, 512, 512]
conv_stride: [5, 2, 2, 2, 2, 2, 2]
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
conv_bias: True
num_conv_pos_embeddings: 128
num_conv_pos_embedding_groups: 16
do_stable_layer_norm: True
apply_spec_augment: False
mask_channel_length: 10
mask_channel_min_space: 1
mask_channel_other: 0.0
mask_channel_prob: 0.0
mask_channel_selection: static
mask_feature_length: 10
mask_feature_min_masks: 0
mask_feature_prob: 0.0
mask_time_length: 10
mask_time_min_masks: 2
mask_time_min_space: 1
mask_time_other: 0.0
mask_time_prob: 0.075
mask_time_selection: static
num_codevectors_per_group: 320
num_codevector_groups: 2
contrastive_logits_temperature: 0.1
num_negatives: 100
codevector_dim: 256
proj_codevector_dim: 256
diversity_loss_weight: 0.1
use_weighted_layer_sum: False
pad_token_id: 0
bos_token_id: 1
eos_token_id: 2
add_adapter: False
adapter_kernel_size: 3
adapter_stride: 2
num_adapter_layers: 3
output_hidden_size: None
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
mean_std_filepath:
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs 0: disabled other: enabled for 'other' epochs
batch_size: 8 # Different batch_size may cause large differences in results
maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced
maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
dist_sampler: True
shortest_first: True
return_lens_rate: True
###########################################
# Training #
###########################################
n_epoch: 80
accum_grad: 1
global_grad_clip: 3.0
model_optim: adadelta
model_optim_conf:
lr: 0.95
epsilon: 1.0e-8
rho: 0.95
wav2vec2_optim: adam
wav2vec2_optim_conf:
lr: 0.0001
epsilon: 1.0e-8
model_scheduler: newbobscheduler
model_scheduler_conf:
improvement_threshold: 0.0025
annealing_factor: 0.8
patient: 0
wav2vec2_scheduler: newbobscheduler
wav2vec2_scheduler_conf:
improvement_threshold: 0.0025
annealing_factor: 0.9
patient: 0
log_interval: 1
checkpoint:
kbest_n: 50
latest_n: 5
augment: True

@ -0,0 +1,95 @@
#!/bin/bash
stage=-1
stop_stage=100
dict_dir=data/lang_char
. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/aishell/aishell.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/aishell"
if [ $? -ne 0 ]; then
echo "Prepare Aishell failed. Terminated."
exit 1
fi
for dataset in train dev test; do
mv data/manifest.${dataset} data/manifest.${dataset}.raw
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10 \
--window_ms=25 \
--sample_rate=16000 \
--use_dB_normalization=False \
--num_samples=-1 \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# download data, generate manifests
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for dataset in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--cmvn_path "data/mean_std.json" \
--unit_type "char" \
--vocab_path="${dict_dir}/vocab.txt" \
--manifest_path="data/manifest.${dataset}.raw" \
--output_path="data/manifest.${dataset}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
} &
done
wait
fi
echo "Aishell data preparation done."
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
mkdir -p exp/wav2vec2
echo "Pretrained wav2vec2 model download"
wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams
fi
exit 0

@ -0,0 +1,84 @@
#!/bin/bash
set -e
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
expdir=exp
datadir=data
train_set=train_960
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
python3 utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
for type in ctc_greedy_search; do
echo "decoding ${type}"
batch_size=16
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
for type in ctc_prefix_beam_search; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
echo "Finished"
exit 0

@ -0,0 +1,58 @@
#!/bin/bash
if [ $# != 4 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
exit 1
fi
if [ ! -f ${audio_file} ]; then
echo "Plase input the right audio_file path"
exit 1
fi
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in ctc_greedy_search; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0

@ -0,0 +1,56 @@
#!/bin/bash
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
# seed may break model convergence
seed=1998
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
# export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --log_dir=aa --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--seed ${seed}
fi
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
fi
if [ $? -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0

@ -0,0 +1,15 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=wav2vec2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -0,0 +1,46 @@
#!/bin/bash
set -e
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
gpus=6
stage=1
stop_stage=1
conf_path=conf/wav2vec2ASR.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
audio_file=data/demo_002_en.wav
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# greedy search decoder
CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

@ -0,0 +1 @@
../../../utils

@ -19,6 +19,7 @@ import time
from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
import re
import jsonlines
import numpy as np
@ -62,6 +63,20 @@ class Wav2Vec2ASRTrainer(Trainer):
self.avg_train_loss -= self.avg_train_loss / (batch_index + 1)
self.avg_train_loss += loss / (batch_index + 1)
def before_train(self):
from_scratch = self.resume_or_scratch()
if from_scratch:
# scratch: save init model, i.e. 0 epoch
self.save(tag='init', infos=None)
else:
# resume: train next_epoch and next_iteration
self.epoch += 1
self.iteration += 1
logger.info(
f"Resume train: epoch {self.epoch }, step {self.iteration}!")
self.maybe_batch_sampler_step()
def train_batch(self, batch_index, batch, msg):
train_conf = self.config
start = time.time()
@ -97,9 +112,16 @@ class Wav2Vec2ASRTrainer(Trainer):
# optimizer step old
if (batch_index + 1) % train_conf.accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
self.lr_scheduler.step()
self.model_optimizer.step()
self.model_optimizer.clear_grad()
if not train_conf.freeze_wav2vec2:
self.wav2vec2_optimizer.step()
self.wav2vec2_optimizer.clear_grad()
if self.config.model_scheduler is not 'newbobscheduler':
self.model_lr_scheduler.step()
if self.config.wav2vec2_scheduler is not 'newbobscheduler':
if not train_conf.freeze_wav2vec2:
self.wav2vec2_lr_scheduler.step()
self.iteration += 1
losses_np = {'loss': self.avg_train_loss * train_conf.accum_grad}
@ -113,7 +135,7 @@ class Wav2Vec2ASRTrainer(Trainer):
if (batch_index + 1) % train_conf.accum_grad == 0:
if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()})
losses_np_v.update({"model_lr": self.model_lr_scheduler(), "wav2vec2_lr": self.wav2vec2_lr_scheduler()})
for key, val in losses_np_v.items():
self.visualizer.add_scalar(
tag='train/' + key, value=val, step=self.iteration - 1)
@ -134,7 +156,7 @@ class Wav2Vec2ASRTrainer(Trainer):
wav = wav[:, :, 0]
loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
if paddle.isfinite(loss):
if math.isfinite(float(loss)):
num_utts = batch[1].shape[0]
num_seen_utts += num_utts
total_loss += float(loss) * num_utts
@ -159,6 +181,105 @@ class Wav2Vec2ASRTrainer(Trainer):
dist.get_rank(), total_loss / num_seen_utts))
return total_loss, num_seen_utts
@mp_tools.rank_zero_only
def save(self, tag=None, infos: dict=None):
"""Save checkpoint (model parameters and optimizer states).
Args:
tag (int or str, optional): None for step, else using tag, e.g epoch. Defaults to None.
infos (dict, optional): meta data to save. Defaults to None.
"""
infos = infos if infos else dict()
infos.update({
"step": self.iteration,
"epoch": self.epoch,
"model_lr": self.model_optimizer.get_lr(),
"wav2vec2_lr": self.wav2vec2_optimizer.get_lr()
})
checkpoint_path = os.path.join(self.checkpoint_dir,
"{}".format(self.iteration
if tag is None else tag))
model_dict = self.model.state_dict()
params_path = checkpoint_path + ".pdparams"
paddle.save(model_dict, params_path)
logger.info("Saved model to {}".format(params_path))
model_opt_dict = self.model_optimizer.state_dict()
wav2vec2_opt_dict = self.wav2vec2_optimizer.state_dict()
opt_dict = {
'model': model_opt_dict,
'wav2vec2': wav2vec2_opt_dict}
optimizer_path = checkpoint_path + ".pdopt"
paddle.save(opt_dict, optimizer_path)
logger.info("Saved optimzier state to {}".format(optimizer_path))
scheduler_dict = {}
if self.config.model_scheduler == 'newbobscheduler':
scheduler_dict['model'] = self.model_lr_scheduler.save()
if self.config.wav2vec2_scheduler =='newbobscheduler':
scheduler_dict['wav2vec2'] = self.wav2vec2_lr_scheduler.save()
if scheduler_dict:
scheduler_path = checkpoint_path + ".pdlrs"
paddle.save(scheduler_dict, scheduler_path)
logger.info("Saved scheduler state to {}".format(scheduler_path))
info_path = re.sub('.pdparams$', '.json', params_path)
infos = {} if infos is None else infos
with open(info_path, 'w') as fout:
data = json.dumps(infos)
fout.write(data)
def resume_or_scratch(self):
"""Resume from latest checkpoint at checkpoints in the output
directory or load a specified checkpoint.
If ``args.checkpoint_path`` is not None, load the checkpoint, else
resume training.
"""
scratch = None
infos = self.checkpoint.load_latest_parameters(
self.model,
checkpoint_dir=self.checkpoint_dir,
checkpoint_path=self.args.checkpoint_path)
if infos:
# just restore ckpt
# lr will resotre from optimizer ckpt
self.iteration = infos["step"]
self.epoch = infos["epoch"]
# resotre optimizer from *.pdopt
optimizer_path = os.path.join(self.checkpoint_dir,
"{}".format(epoch)) + '.pdopt'
optimizer_dict = paddle.load(optimizer_path)
optimizer.set_state_dict(optimizer_dict)
self.model_optimizer.set_state_dict(optimizer_dict['model'])
self.wav2vec2_optimizer.set_state_dict(optimizer_dict['wav2vec2'])
# resotre lr_scheduler from *.pdlrs
scheduler_path = os.path.join(self.checkpoint_dir,
"{}".format(epoch)) + '.pdlrs'
if os.path.isfile(os.path.join(scheduler_path)):
scheduler_dict = paddle.load(scheduler_path)
if self.config.model_scheduler is 'newbobscheduler':
self.model_lr_scheduler.load(scheduler_dict['model'])
if self.config.wav2vec2_scheduler is 'newbobscheduler':
self.wav2vec2_lr_scheduler.load(scheduler_dict['wav2vec2'])
scratch = False
logger.info(
f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
else:
self.iteration = 0
self.epoch = 0
scratch = True
logger.info("Init from scratch!")
return scratch
def do_train(self):
"""The training process control by step."""
# !!!IMPORTANT!!!
@ -186,7 +307,8 @@ class Wav2Vec2ASRTrainer(Trainer):
report("Rank", dist.get_rank())
report("epoch", self.epoch)
report('step', self.iteration)
report("lr", self.lr_scheduler())
report("model_lr", self.model_lr_scheduler())
report("wav2vec2_lr", self.wav2vec2_lr_scheduler())
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('iter', batch_index + 1)
@ -224,15 +346,21 @@ class Wav2Vec2ASRTrainer(Trainer):
cv_loss = float(cv_loss)
else:
cv_loss = total_loss / num_seen_utts
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer:
self.visualizer.add_scalar(
tag='eval/cv_loss', value=cv_loss, step=self.epoch)
self.visualizer.add_scalar(
tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
tag='eval/model_lr', value=self.model_lr_scheduler(), step=self.epoch)
self.visualizer.add_scalar(
tag='eval/wav2vec2_lr', value=self.wav2vec2_lr_scheduler(), step=self.epoch)
if self.config.model_scheduler is 'newbobscheduler':
self.model_lr_scheduler.step(cv_loss)
if self.config.wav2vec2_scheduler is 'newbobscheduler':
if not self.config.freeze_wav2vec2:
self.wav2vec2_scheduler.step(cv_loss)
self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch()
@ -283,46 +411,56 @@ class Wav2Vec2ASRTrainer(Trainer):
return
train_config = config
optim_type = train_config.model_optim
optim_conf = train_config.model_optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
scheduler_args = {
"learning_rate": optim_conf.lr,
"verbose": False,
"warmup_steps": scheduler_conf.warmup_steps,
"gamma": scheduler_conf.lr_decay,
"d_model": model_conf.dnn_neurons,
}
lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
scheduler_args)
model_optim_type = train_config.model_optim
model_optim_conf = train_config.model_optim_conf
wav2vec2_optim_type = train_config.model_optim
wav2vec2_optim_conf = train_config.wav2vec2_optim_conf
model_scheduler_type = train_config.model_scheduler
model_scheduler_conf = train_config.model_scheduler_conf
wav2vec2_scheduler_type = train_config.wav2vec2_scheduler
wav2vec2_scheduler_conf = train_config.wav2vec2_scheduler_conf
model_scheduler_args = dict(**{
"learning_rate": model_optim_conf.lr,
"verbose": False}, **(dict(model_scheduler_conf)))
wav2vec2_scheduler_args = dict(**{
"learning_rate": wav2vec2_optim_conf.lr,
"verbose": False}, **(dict(wav2vec2_scheduler_conf)))
model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type,
model_scheduler_args)
wav2vec2_lr_scheduler = LRSchedulerFactory.from_args(wav2vec2_scheduler_type,
wav2vec2_scheduler_args)
def optimizer_args(
config,
optim_type,
optim_conf,
parameters,
lr_scheduler=None, ):
train_config = config
optim_type = train_config.model_optim
optim_conf = train_config.model_optim_conf
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
optim_arg = dict(optim_conf)
optim_arg.update({
"grad_clip": train_config.global_grad_clip,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
"epsilon": optim_conf.epsilon,
"rho": optim_conf.rho,
"parameters": parameters,
"beta1": 0.9 if optim_type == 'noam' else None,
"beat2": 0.98 if optim_type == 'noam' else None,
}
optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
"parameters": parameters})
return optim_arg
model_optimizer_args = optimizer_args(config, model_optim_type, model_optim_conf,
[*model.enc.parameters(), *model.ctc.parameters()], model_lr_scheduler)
wav2vec2_optimizer_args = optimizer_args(config, wav2vec2_optim_type, wav2vec2_optim_conf,
model.wav2vec2.parameters(), wav2vec2_lr_scheduler)
model_optimizer = OptimizerFactory.from_args(model_optim_type, model_optimizer_args)
wav2vec2_optimizer = OptimizerFactory.from_args(wav2vec2_optim_type, wav2vec2_optimizer_args)
self.model_optimizer = model_optimizer
self.wav2vec2_optimizer = wav2vec2_optimizer
self.model_lr_scheduler = model_lr_scheduler
self.wav2vec2_lr_scheduler = wav2vec2_lr_scheduler
logger.info("Setup optimizer/lr_scheduler!")

@ -1177,10 +1177,6 @@ class Wav2Vec2ConfigPure():
self.proj_codevector_dim = config.proj_codevector_dim
self.diversity_loss_weight = config.diversity_loss_weight
# ctc loss
self.ctc_loss_reduction = config.ctc_loss_reduction
self.ctc_zero_infinity = config.ctc_zero_infinity
# adapter
self.add_adapter = config.add_adapter
self.adapter_kernel_size = config.adapter_kernel_size

@ -17,6 +17,7 @@ from typing import Dict
from typing import Text
from typing import Union
import paddle
from paddle.optimizer.lr import LRScheduler
from typeguard import check_argument_types
@ -106,6 +107,127 @@ class ConstantLR(LRScheduler):
def get_lr(self):
return self.base_lr
@register_scheduler
class NewBobScheduler(LRScheduler):
"""Scheduler with new-bob technique, used for LR annealing.
The learning rate is annealed based on the validation performance.
In particular: if (past_loss-current_loss)/past_loss< impr_threshold:
lr=lr * annealing_factor.
Arguments
---------
initial_value : float
The initial hyperparameter value.
annealing_factor : float
It is annealing factor used in new_bob strategy.
improvement_threshold : float
It is the improvement rate between losses used to perform learning
annealing in new_bob strategy.
patient : int
When the annealing condition is violated patient times,
the learning rate is finally reduced.
Example
-------
>>> scheduler = NewBobScheduler(initial_value=1.0)
>>> scheduler(metric_value=10.0)
(1.0, 1.0)
>>> scheduler(metric_value=2.0)
(1.0, 1.0)
>>> scheduler(metric_value=2.5)
(1.0, 0.5)
"""
def __init__(
self,
learning_rate,
last_epoch=-1,
verbose=False,
annealing_factor=0.5,
improvement_threshold=0.0025,
patient=0,
):
self.hyperparam_value = learning_rate
self.annealing_factor = annealing_factor
self.improvement_threshold = improvement_threshold
self.patient = patient
self.metric_values = []
self.current_patient = self.patient
super().__init__(learning_rate, last_epoch, verbose)
def step(self, metric_value=None):
"""
``step`` should be called after ``optimizer.step`` . It will update the learning rate in optimizer according to current ``epoch`` .
The new learning rate will take effect on next ``optimizer.step`` .
Args:
epoch (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1.
Returns:
None
"""
if metric_value is None:
self.last_epoch += 1
self.last_lr = self.hyperparam_value
else:
self.last_epoch += 1
self.last_lr = self.get_lr(metric_value)
if self.verbose:
print('Epoch {}: {} set learning rate to {}.'.format(
self.last_epoch, self.__class__.__name__, self.last_lr))
def get_lr(self, metric_value):
"""Returns the current and new value for the hyperparameter.
Arguments
---------
metric_value : int
A number for determining whether to change the hyperparameter value.
"""
new_value = self.hyperparam_value
if len(self.metric_values) > 0:
prev_metric = self.metric_values[-1]
# Update value if improvement too small and patience is 0
if prev_metric == 0: # Prevent division by zero
improvement = 0
else:
improvement = (prev_metric - metric_value) / prev_metric
if improvement < self.improvement_threshold:
if self.current_patient == 0:
new_value *= self.annealing_factor
self.current_patient = self.patient
else:
self.current_patient -= 1
# Store relevant info
self.metric_values.append(metric_value)
self.hyperparam_value = new_value
return new_value
def save(self):
"""Saves the current metrics on the specified path."""
data = {
"current_epoch_index": self.last_epoch,
"hyperparam_value": self.hyperparam_value,
"metric_values": self.metric_values,
"current_patient": self.current_patient
}
return data
def load(self, data):
"""Loads the needed information."""
data = paddle.load(data)
self.last_epoch = data["current_epoch_index"]
self.hyperparam_value = data["hyperparam_value"]
self.metric_values = data["metric_values"]
self.current_patient = data["current_patient"]
def dynamic_import_scheduler(module):
"""Import Scheduler class dynamically.

Loading…
Cancel
Save