# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml)
# ############################################################################
# Model: CTC-wav2vec2
# Encoder: wav2vec2
# Decoder: -
# Tokens: Char
# Losses: CTC
# Training: AISHELL-1
# Authors: Yingzhi WANG 2022
# ############################################################################

output_folder: !ref data
cer_file: !ref <output_folder>/cer.txt
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Data files
data_folder: data/aishell # e.g., /path/to/aishell
skip_prep: False
ckpt_interval_minutes: 15 # save a checkpoint every N minutes
train_data: !ref <output_folder>/train.csv
valid_data: !ref <output_folder>/dev.csv
test_data: !ref <output_folder>/test.csv

wav2vec2_hub: TencentGameMate/chinese-wav2vec2-large

# Training parameters
number_of_epochs: 80
lr: 1.0
lr_wav2vec: 0.0001
sorting: ascending
auto_mix_prec: False
sample_rate: 16000

# With data_parallel, batch_size is split across N jobs.
# With DDP, batch_size is multiplied by N jobs.
# Must be 8 per GPU to fit in 32GB of VRAM.
batch_size: 5
test_batch_size: 1 # must be set to 1 for decoding

dynamic_batching: False
dynamic_batch_sampler:
    feats_hop_size: 0.01
    max_batch_len: 15 # in terms of "duration" in annotations by default; seconds here
    left_bucket_len: 200 # attribute of the old implementation
    multiplier: 1.1 # attribute of the old implementation
    shuffle_ex: False # if True, re-creates batches at each epoch by shuffling examples
    num_buckets: 10 # floor(log(max_batch_len / left_bucket_len, multiplier)) + 1
    batch_ordering: ascending

num_workers: 6

# Dataloader options
train_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>

valid_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>

test_dataloader_opts:
    batch_size: !ref <test_batch_size>
    num_workers: !ref <num_workers>

wav2vec_output_dim: 1024
dnn_neurons: 1024
freeze_wav2vec: False
dropout: 0.15

tokenizer: !apply:transformers.BertTokenizer.from_pretrained
    pretrained_model_name_or_path: bert-base-chinese

# bert-base-chinese vocabulary size
output_neurons: 21128

# Decoding parameters
# Make sure the bos and eos indices match those of the BPE model.
blank_index: 0

# AISHELL-1 transcripts contain spaces between words,
# which written Chinese normally does not use.
# If remove_spaces is True, spaces are removed
# from the transcript before computing CER
# (e.g., 祝 可爱 的 你 -> 祝可爱的你).
remove_spaces: True
split_tokens: !apply:operator.not_ [!ref <remove_spaces>]
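
# ----------------------------------------------------------------------------
# Usage sketch (not part of the original recipe): this file uses SpeechBrain's
# HyperPyYAML extensions, so it must be loaded with hyperpyyaml rather than
# plain PyYAML. `!ref <key>` substitutes the value of another key, and
# `!apply:` calls the named Python callable at load time (here, instantiating
# the BERT tokenizer). A minimal loading example, assuming this file is saved
# as train_with_wav2vec.yaml:
#
#     from hyperpyyaml import load_hyperpyyaml
#
#     with open("train_with_wav2vec.yaml") as fin:
#         hparams = load_hyperpyyaml(fin, overrides={"data_folder": "data/aishell"})
#
#     tokenizer = hparams["tokenizer"]  # a transformers.BertTokenizer instance
#     print(hparams["cer_file"])        # -> data/cer.txt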
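
# A minimal sketch (an assumption about the training script, not code from
# this file) of what remove_spaces implies for scoring: spaces are stripped
# from transcripts before CER is computed. Note that split_tokens is defined
# above as the logical negation of remove_spaces (via operator.not_), i.e.
# tokens are split on spaces only when spaces are kept.
#
#     ref = "祝 可爱 的 你"
#     if hparams["remove_spaces"]:
#         ref = ref.replace(" ", "")  # -> "祝可爱的你"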