# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py)
import data_pipeline
import dataio
import dataset
import numpy
import paddle
import tqdm
import transformers
from dataloader import make_dataloader
from hyperpyyaml import load_hyperpyyaml


def dataio_prepare(hparams):
    """This function prepares the datasets to be used in the brain class.

    It also defines the data processing pipeline through user-defined
    functions.
    """
    data_folder = hparams["data_folder"]

    train_data = dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["train_data"],
        replacements={"data_root": data_folder},
    )

    if hparams["sorting"] == "ascending":
        # Sort the training data by duration to speed up training and improve results.
        train_data = train_data.filtered_sorted(sort_key="duration")
        # When sorting, do not shuffle in the dataloader; otherwise sorting is pointless.
        hparams["train_dataloader_opts"]["shuffle"] = False
    elif hparams["sorting"] == "descending":
        train_data = train_data.filtered_sorted(
            sort_key="duration", reverse=True)
        # When sorting, do not shuffle in the dataloader; otherwise sorting is pointless.
        hparams["train_dataloader_opts"]["shuffle"] = False
    elif hparams["sorting"] == "random":
        pass
    else:
        raise NotImplementedError(
            "sorting must be random, ascending or descending")

    valid_data = dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["valid_data"],
        replacements={"data_root": data_folder},
    )
    valid_data = valid_data.filtered_sorted(sort_key="duration")

    test_data = dataset.DynamicItemDataset.from_csv(
        csv_path=hparams["test_data"],
        replacements={"data_root": data_folder},
    )
    test_data = test_data.filtered_sorted(sort_key="duration")

    datasets = [train_data, valid_data, test_data]

    # Define the tokenizer and load it
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-chinese')

    # 2. Define audio pipeline:
    @data_pipeline.takes("wav")
    @data_pipeline.provides("sig")
    def audio_pipeline(wav):
        sig = dataio.read_audio(wav)
        return sig

    dataset.add_dynamic_item(datasets, audio_pipeline)

    # 3. Define text pipeline:
    @data_pipeline.takes("transcript")
    @data_pipeline.provides("wrd", "tokens_list", "tokens")
    def text_pipeline(wrd):
        wrd = "".join(wrd.split(" "))
        yield wrd
        tokens_list = tokenizer(wrd)["input_ids"]
        yield tokens_list
        tokens = numpy.array(tokens_list, dtype="int64")
        yield tokens

    dataset.add_dynamic_item(datasets, text_pipeline)

    # 4. Set output:
    dataset.set_output_keys(
        datasets,
        ["id", "sig", "wrd", "tokens"],
    )

    # 5. If Dynamic Batching is used, we instantiate the needed samplers.
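    # DynamicBatchSampler buckets utterances by duration so that each batch
    # holds roughly ``max_batch_len`` worth of audio, which reduces padding
    # waste (behaviour assumed to mirror speechbrain's sampler of the same name).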
    train_batch_sampler = None
    valid_batch_sampler = None
    if hparams["dynamic_batching"]:
        from sampler import DynamicBatchSampler  # noqa

        dynamic_hparams = hparams["dynamic_batch_sampler"]
        num_buckets = dynamic_hparams["num_buckets"]

        train_batch_sampler = DynamicBatchSampler(
            train_data,
            dynamic_hparams["max_batch_len"],
            num_buckets=num_buckets,
            length_func=lambda x: x["duration"],
            shuffle=dynamic_hparams["shuffle_ex"],
            batch_ordering=dynamic_hparams["batch_ordering"],
        )

        valid_batch_sampler = DynamicBatchSampler(
            valid_data,
            dynamic_hparams["max_batch_len"],
            num_buckets=num_buckets,
            length_func=lambda x: x["duration"],
            shuffle=dynamic_hparams["shuffle_ex"],
            batch_ordering=dynamic_hparams["batch_ordering"],
        )

    return (
        train_data,
        valid_data,
        test_data,
        tokenizer,
        train_batch_sampler,
        valid_batch_sampler,
    )


hparams_file = 'train_with_wav2vec.yaml'
with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin, None)

(
    train_data,
    valid_data,
    test_data,
    tokenizer,
    train_bsampler,
    valid_bsampler,
) = dataio_prepare(hparams)

train_dataloader_opts = hparams["train_dataloader_opts"]
valid_dataloader_opts = hparams["valid_dataloader_opts"]

if train_bsampler is not None:
    train_dataloader_opts = {
        "batch_sampler": train_bsampler,
        "num_workers": hparams["num_workers"],
    }

if valid_bsampler is not None:
    valid_dataloader_opts = {"batch_sampler": valid_bsampler}

train_set = make_dataloader(train_data, stage='train', **train_dataloader_opts)

valid_set = make_dataloader(
    valid_data,
    stage='train',
    **valid_dataloader_opts,
)

for batch in valid_set:
    print(batch)

print('done')
# exit()