# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Optional

from paddle.io import Dataset
from yacs.config import CfgNode

from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log

__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]

logger = Log(__name__).getlog()


class ManifestDataset(Dataset):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
                manifest="",
                max_input_len=27.0,
                min_input_len=0.0,
                max_output_len=float('inf'),
                min_output_len=0.0,
                max_output_input_ratio=float('inf'),
                min_output_input_ratio=0.0, ))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a ManifestDataset object from a config.

        Args:
            config (yacs.config.CfgNode): config object.

        Returns:
            ManifestDataset: dataset object.
        """
        assert 'manifest' in config.data
        assert config.data.manifest

        dataset = cls(
            manifest_path=config.data.manifest,
            max_input_len=config.data.max_input_len,
            min_input_len=config.data.min_input_len,
            max_output_len=config.data.max_output_len,
            min_output_len=config.data.min_output_len,
            max_output_input_ratio=config.data.max_output_input_ratio,
            min_output_input_ratio=config.data.min_output_input_ratio, )
        return dataset

    def __init__(self,
                 manifest_path,
                 max_input_len=float('inf'),
                 min_input_len=0.0,
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
                 min_output_input_ratio=0.0):
        """Manifest Dataset

        Args:
            manifest_path (str): manifest json file path
            max_input_len (float, optional): maximum input seq length,
                in seconds for raw wav, in frame numbers for feature data.
                Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data.
                Defaults to 0.0.
            max_output_len (float, optional): maximum output seq length,
                in modeling units. Defaults to float('inf').
            min_output_len (float, optional): minimum output seq length,
                in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output seq length /
                input seq length ratio. Defaults to float('inf').
            min_output_input_ratio (float, optional): minimum output seq length /
                input seq length ratio. Defaults to 0.0.
        """
        super().__init__()

        # read manifest
        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_input_len=max_input_len,
            min_input_len=min_input_len,
            max_output_len=max_output_len,
            min_output_len=min_output_len,
            max_output_input_ratio=max_output_input_ratio,
            min_output_input_ratio=min_output_input_ratio)
        self._manifest.sort(key=lambda x: x["feat_shape"][0])

    def __len__(self):
        return len(self._manifest)

    def __getitem__(self, idx):
        instance = self._manifest[idx]
        return instance["utt"], instance["feat"], instance["text"]
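
# Usage sketch for ManifestDataset (illustrative only, kept as a comment so
# importing this module has no side effects): the manifest path below is a
# placeholder, and the config layout simply mirrors the fields consumed by
# `from_config` above.
#
#     config = CfgNode(dict(data=ManifestDataset.params()))
#     config.data.manifest = "examples/tiny/manifest.train"  # placeholder path
#     dataset = ManifestDataset.from_config(config)
#     utt, feat, text = dataset[0]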
""" def __getitem__(self, idx): instance = self._manifest[idx] return instance["utt"], instance["feat"], instance["text"], instance[ "text1"] class TransformDataset(Dataset): """Transform Dataset. Args: data: list object from make_batchset converter: batch function reader: read data """ def __init__(self, data, converter, reader): """Init function.""" super().__init__() self.data = data self.converter = converter self.reader = reader def __len__(self): """Len function.""" return len(self.data) def __getitem__(self, idx): """[] operator.""" return self.converter([self.reader(self.data[idx], return_uttid=True)]) class AudioDataset(Dataset): def __init__(self, data_file, max_length=10240, min_length=0, token_max_length=200, token_min_length=1, batch_type='static', batch_size=1, max_frames_in_batch=0, sort=True, raw_wav=True, stride_ms=10): """Dataset for loading audio data. Attributes:: data_file: input data file Plain text data file, each line contains following 7 fields, which is split by '\t': utt:utt1 feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30 feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames) text:i love you token: i l o v e y o u tokenid: int id of this token token_shape: M,N # M is the number of token, N is vocab size max_length: drop utterance which is greater than max_length(10ms), unit 10ms. min_length: drop utterance which is less than min_length(10ms), unit 10ms. token_max_length: drop utterance which is greater than token_max_length, especially when use char unit for english modeling token_min_length: drop utterance which is less than token_max_length batch_type: static or dynamic, see max_frames_in_batch(dynamic) batch_size: number of utterances in a batch, it's for static batch size. max_frames_in_batch: max feature frames in a batch, when batch_type is dynamic, it's for dynamic batch size. Then batch_size is ignored, we will keep filling the batch until the total frames in batch up to max_frames_in_batch. sort: whether to sort all data, so the utterance with the same length could be filled in a same batch. raw_wav: use raw wave or extracted featute. if raw wave is used, dynamic waveform-level augmentation could be used and the feature is extracted by torchaudio. if extracted featute(e.g. by kaldi) is used, only feature-level augmentation such as specaug could be used. 
""" assert batch_type in ['static', 'dynamic'] # read manifest data = read_manifest(data_file) if sort: data = sorted(data, key=lambda x: x["feat_shape"][0]) if raw_wav: assert data[0]['feat'].split(':')[0].splitext()[-1] not in ('.ark', '.scp') data = map(lambda x: (float(x['feat_shape'][0]) * 1000 / stride_ms)) self.input_dim = data[0]['feat_shape'][1] self.output_dim = data[0]['token_shape'][1] # with open(data_file, 'r') as f: # for line in f: # arr = line.strip().split('\t') # if len(arr) != 7: # continue # key = arr[0].split(':')[1] # tokenid = arr[5].split(':')[1] # output_dim = int(arr[6].split(':')[1].split(',')[1]) # if raw_wav: # wav_path = ':'.join(arr[1].split(':')[1:]) # duration = int(float(arr[2].split(':')[1]) * 1000 / 10) # data.append((key, wav_path, duration, tokenid)) # else: # feat_ark = ':'.join(arr[1].split(':')[1:]) # feat_info = arr[2].split(':')[1].split(',') # feat_dim = int(feat_info[1].strip()) # num_frames = int(feat_info[0].strip()) # data.append((key, feat_ark, num_frames, tokenid)) # self.input_dim = feat_dim # self.output_dim = output_dim valid_data = [] for i in range(len(data)): length = data[i]['feat_shape'][0] token_length = data[i]['token_shape'][0] # remove too lang or too short utt for both input and output # to prevent from out of memory if length > max_length or length < min_length: # logging.warn('ignore utterance {} feature {}'.format( # data[i][0], length)) pass elif token_length > token_max_length or token_length < token_min_length: pass else: valid_data.append(data[i]) data = valid_data self.minibatch = [] num_data = len(data) # Dynamic batch size if batch_type == 'dynamic': assert (max_frames_in_batch > 0) self.minibatch.append([]) num_frames_in_batch = 0 for i in range(num_data): length = data[i]['feat_shape'][0] num_frames_in_batch += length if num_frames_in_batch > max_frames_in_batch: self.minibatch.append([]) num_frames_in_batch = length self.minibatch[-1].append(data[i]) # Static batch size else: cur = 0 while cur < num_data: end = min(cur + batch_size, num_data) item = [] for i in range(cur, end): item.append(data[i]) self.minibatch.append(item) cur = end def __len__(self): return len(self.minibatch) def __getitem__(self, idx): instance = self.minibatch[idx] return instance["utt"], instance["feat"], instance["text"]