# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml)

# ############################################################################
# Model: CTC-wav2vec2
# Encoder: wav2vec2
# Decoder: -
# Tokens: Char
# losses: CTC
# Training: AISHELL-1
# Authors: Yingzhi WANG 2022
# ############################################################################

output_folder: !ref data
cer_file: !ref <output_folder>/cer.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
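
# Note: this file uses SpeechBrain's HyperPyYAML extensions. `!ref <key>`
# substitutes the value of another key at load time, so `save_folder` above
# resolves to `data/save`. A minimal sketch of the mechanism (hypothetical
# keys, not part of this recipe):
#
#     root: exp
#     log_dir: !ref <root>/logs  # resolves to "exp/logs"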

# Data files
data_folder: data/aishell # e.g., /path/to/aishell

skip_prep: False
ckpt_interval_minutes: 15 # save checkpoint every N min
train_data: !ref <output_folder>/train.csv
valid_data: !ref <output_folder>/dev.csv
test_data: !ref <output_folder>/test.csv

wav2vec2_hub: TencentGameMate/chinese-wav2vec2-large

# Training parameters
number_of_epochs: 80
lr: 1.0
lr_wav2vec: 0.0001
sorting: ascending
auto_mix_prec: False
sample_rate: 16000
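
# In the upstream SpeechBrain recipe this file is modified from, `lr` and
# `lr_wav2vec` feed two separate optimizers: a large step size for the
# randomly initialized output layers and a much smaller one for fine-tuning
# the pretrained wav2vec2 encoder. How they are applied here is ultimately
# decided by the training script.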

# With data_parallel batch_size is split into N jobs
# With DDP batch_size is multiplied by N jobs
# Must be 8 per GPU to fit 32GB of VRAM
batch_size: 5
test_batch_size: 1 # must be set to 1 for decoding

dynamic_batching: False
dynamic_batch_sampler:
    feats_hop_size: 0.01
    max_batch_len: 15 # in terms of "duration" in annotations by default; seconds here
    left_bucket_len: 200 # old implementation attributes
    multiplier: 1.1 # old implementation attributes
    shuffle_ex: False # if true, re-creates batches at each epoch, shuffling examples
    num_buckets: 10 # floor(log(max_batch_len/left_bucket_len, multiplier)) + 1
    batch_ordering: ascending
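
# Rough illustration, assuming SpeechBrain's DynamicBatchSampler semantics:
# `max_batch_len: 15` caps each dynamic batch at about 15 s of audio, which
# at `feats_hop_size: 0.01` (10 ms per frame) is roughly 15 / 0.01 = 1500
# feature frames per batch.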

num_workers: 6

# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>

valid_dataloader_opts:
    batch_size: !ref <test_batch_size>
    num_workers: !ref <num_workers>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>
    num_workers: !ref <num_workers>
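
# These option dictionaries are typically passed straight through to the
# underlying (PyTorch) DataLoader by the training script; dev/test reuse
# `test_batch_size` (1) so decoding and CER scoring run one utterance at a
# time.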

wav2vec_output_dim: 1024
dnn_neurons: 1024
freeze_wav2vec: False
dropout: 0.15

tokenizer: !apply:transformers.BertTokenizer.from_pretrained
    pretrained_model_name_or_path: bert-base-chinese
# vocabulary size of bert-base-chinese
output_neurons: 21128
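
# `!apply:<callable>` invokes a Python callable when the YAML is loaded, so
# the `tokenizer` entry above is roughly equivalent to this Python sketch
# (assuming HyperPyYAML semantics):
#
#     from transformers import BertTokenizer
#     tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
#
# `output_neurons` must match the tokenizer's vocabulary size (21128 for
# bert-base-chinese) so the CTC layer emits one score per token.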

# Decoding parameters
# Be sure that the bos and eos indices match the BPE ones
blank_index: 0

# AISHELL-1 has spaces between words in the transcripts,
# which Chinese writing normally does not do.
# If remove_spaces is True, spaces are removed
# from the transcript before computing CER
# (e.g., 祝 可爱 的 你 -> 祝可爱的你).
remove_spaces: True
split_tokens: !apply:operator.not_ [!ref <remove_spaces>]
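
# `split_tokens` is derived from `remove_spaces` at load time:
# `!apply:operator.not_ [!ref <remove_spaces>]` evaluates Python's
# `operator.not_(remove_spaces)`, i.e. `split_tokens == not remove_spaces`,
# so exactly one of the two post-processing behaviors is enabled.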