""" Providing basic audio data preprocessing pipeline, and offering both instance-level and batch-level data reader interfaces. """ import paddle.v2 as paddle import logging import json import random import soundfile import numpy as np import os RANDOM_SEED = 0 logger = logging.getLogger(__name__) class DataGenerator(object): """ DataGenerator provides basic audio data preprocessing pipeline, and offers both instance-level and batch-level data reader interfaces. Normalized FFT are used as audio features here. :param vocab_filepath: Vocabulary file path for indexing tokenized transcriptions. :type vocab_filepath: basestring :param normalizer_manifest_path: Manifest filepath for collecting feature normalization statistics, e.g. mean, std. :type normalizer_manifest_path: basestring :param normalizer_num_samples: Number of instances sampled for collecting feature normalization statistics. Default is 100. :type normalizer_num_samples: int :param max_duration: Audio clips with duration (in seconds) greater than this will be discarded. Default is 20.0. :type max_duration: float :param min_duration: Audio clips with duration (in seconds) smaller than this will be discarded. Default is 0.0. :type min_duration: float :param stride_ms: Striding size (in milliseconds) for generating frames. Default is 10.0. :type stride_ms: float :param window_ms: Window size (in milliseconds) for frames. Default is 20.0. :type window_ms: float :param max_frequency: Maximun frequency for FFT features. FFT features of frequency larger than this will be discarded. If set None, all features will be kept. Default is None. :type max_frequency: float """ def __init__(self, vocab_filepath, normalizer_manifest_path, normalizer_num_samples=100, max_duration=20.0, min_duration=0.0, stride_ms=10.0, window_ms=20.0, max_frequency=None): self.__max_duration__ = max_duration self.__min_duration__ = min_duration self.__stride_ms__ = stride_ms self.__window_ms__ = window_ms self.__max_frequency__ = max_frequency self.__random__ = random.Random(RANDOM_SEED) # load vocabulary (dictionary) self.__vocab_dict__, self.__vocab_list__ = \ self.__load_vocabulary_from_file__(vocab_filepath) # collect normalizer statistics self.__mean__, self.__std__ = self.__collect_normalizer_statistics__( manifest_path=normalizer_manifest_path, num_samples=normalizer_num_samples) def __audio_featurize__(self, audio_filename): """ Preprocess audio data, including feature extraction, normalization etc.. """ features = self.__audio_basic_featurize__(audio_filename) return self.__normalize__(features) def __text_featurize__(self, text): """ Preprocess text data, including tokenizing and token indexing etc.. """ return self.__convert_text_to_char_index__( text=text, vocabulary=self.__vocab_dict__) def __audio_basic_featurize__(self, audio_filename): """ Compute basic (without normalization etc.) features for audio data. """ return self.__spectrogram_from_file__( filename=audio_filename, stride_ms=self.__stride_ms__, window_ms=self.__window_ms__, max_freq=self.__max_frequency__) def __collect_normalizer_statistics__(self, manifest_path, num_samples=100): """ Compute feature normalization statistics, i.e. mean and stddev. 
""" # read manifest manifest = self.__read_manifest__( manifest_path=manifest_path, max_duration=self.__max_duration__, min_duration=self.__min_duration__) # sample for statistics sampled_manifest = self.__random__.sample(manifest, num_samples) # extract spectrogram feature features = [] for instance in sampled_manifest: spectrogram = self.__audio_basic_featurize__( instance["audio_filepath"]) features.append(spectrogram) features = np.hstack(features) mean = np.mean(features, axis=1).reshape([-1, 1]) std = np.std(features, axis=1).reshape([-1, 1]) return mean, std def __normalize__(self, features, eps=1e-14): """ Normalize features to be of zero mean and unit stddev. """ return (features - self.__mean__) / (self.__std__ + eps) def __spectrogram_from_file__(self, filename, stride_ms=10.0, window_ms=20.0, max_freq=None, eps=1e-14): """ Laod audio data and calculate the log of spectrogram by FFT. Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech """ audio, sample_rate = soundfile.read(filename) if audio.ndim >= 2: audio = np.mean(audio, 1) if max_freq is None: max_freq = sample_rate / 2 if max_freq > sample_rate / 2: raise ValueError("max_freq must be greater than half of " "sample rate.") if stride_ms > window_ms: raise ValueError("Stride size must not be greater than " "window size.") stride_size = int(0.001 * sample_rate * stride_ms) window_size = int(0.001 * sample_rate * window_ms) spectrogram, freqs = self.__extract_spectrogram__( audio, window_size=window_size, stride_size=stride_size, sample_rate=sample_rate) ind = np.where(freqs <= max_freq)[0][-1] + 1 return np.log(spectrogram[:ind, :] + eps) def __extract_spectrogram__(self, samples, window_size, stride_size, sample_rate): """ Compute the spectrogram by FFT for a discrete real signal. Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech """ # extract strided windows truncate_size = (len(samples) - window_size) % stride_size samples = samples[:len(samples) - truncate_size] nshape = (window_size, (len(samples) - window_size) // stride_size + 1) nstrides = (samples.strides[0], samples.strides[0] * stride_size) windows = np.lib.stride_tricks.as_strided( samples, shape=nshape, strides=nstrides) assert np.all( windows[:, 1] == samples[stride_size:(stride_size + window_size)]) # window weighting, squared Fast Fourier Transform (fft), scaling weighting = np.hanning(window_size)[:, None] fft = np.fft.rfft(windows * weighting, axis=0) fft = np.absolute(fft)**2 scale = np.sum(weighting**2) * sample_rate fft[1:-1, :] *= (2.0 / scale) fft[(0, -1), :] /= scale # prepare fft frequency list freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) return fft, freqs def __load_vocabulary_from_file__(self, vocabulary_path): """ Load vocabulary from file. """ if not os.path.exists(vocabulary_path): raise ValueError("Vocabulary file %s not found.", vocabulary_path) vocab_lines = [] with open(vocabulary_path, 'r') as file: vocab_lines.extend(file.readlines()) vocab_list = [line[:-1] for line in vocab_lines] vocab_dict = dict( [(token, id) for (id, token) in enumerate(vocab_list)]) return vocab_dict, vocab_list def __convert_text_to_char_index__(self, text, vocabulary): """ Convert text string to a list of character index integers. """ return [vocabulary[w] for w in text] def __read_manifest__(self, manifest_path, max_duration, min_duration): """ Load and parse manifest file. 
""" manifest = [] for json_line in open(manifest_path): try: json_data = json.loads(json_line) except Exception as e: raise ValueError("Error reading manifest: %s" % str(e)) if (json_data["duration"] <= max_duration and json_data["duration"] >= min_duration): manifest.append(json_data) return manifest def __padding_batch__(self, batch, padding_to=-1, flatten=False): """ Padding audio part of features (only in the time axis -- column axis) with zeros, to make each instance in the batch share the same audio feature shape. If `padding_to` is set -1, the maximun column numbers in the batch will be used as the target size. Otherwise, `padding_to` will be the target size. Default is -1. If `flatten` is set True, audio data will be flatten to be a 1-dim ndarray. Default is False. """ new_batch = [] # get target shape max_length = max([audio.shape[1] for audio, text in batch]) if padding_to != -1: if padding_to < max_length: raise ValueError("If padding_to is not -1, it should be greater" " or equal to the original instance length.") max_length = padding_to # padding for audio, text in batch: padded_audio = np.zeros([audio.shape[0], max_length]) padded_audio[:, :audio.shape[1]] = audio if flatten: padded_audio = padded_audio.flatten() new_batch.append((padded_audio, text)) return new_batch def instance_reader_creator(self, manifest_path, sort_by_duration=True, shuffle=False): """ Instance reader creator for audio data. Creat a callable function to produce instances of data. Instance: a tuple of a numpy ndarray of audio spectrogram and a list of tokenized and indexed transcription text. :param manifest_path: Filepath of manifest for audio clip files. :type manifest_path: basestring :param sort_by_duration: Sort the audio clips by duration if set True (for SortaGrad). :type sort_by_duration: bool :param shuffle: Shuffle the audio clips if set True. :type shuffle: bool :return: Data reader function. :rtype: callable """ if sort_by_duration and shuffle: sort_by_duration = False logger.warn("When shuffle set to true, " "sort_by_duration is forced to set False.") def reader(): # read manifest manifest = self.__read_manifest__( manifest_path=manifest_path, max_duration=self.__max_duration__, min_duration=self.__min_duration__) # sort (by duration) or shuffle manifest if sort_by_duration: manifest.sort(key=lambda x: x["duration"]) if shuffle: self.__random__.shuffle(manifest) # extract spectrogram feature for instance in manifest: spectrogram = self.__audio_featurize__( instance["audio_filepath"]) transcript = self.__text_featurize__(instance["text"]) yield (spectrogram, transcript) return reader def batch_reader_creator(self, manifest_path, batch_size, padding_to=-1, flatten=False, sort_by_duration=True, shuffle=False): """ Batch data reader creator for audio data. Creat a callable function to produce batches of data. Audio features will be padded with zeros to make each instance in the batch to share the same audio feature shape. :param manifest_path: Filepath of manifest for audio clip files. :type manifest_path: basestring :param batch_size: Instance number in a batch. :type batch_size: int :param padding_to: If set -1, the maximun column numbers in the batch will be used as the target size for padding. Otherwise, `padding_to` will be the target size. Default is -1. :type padding_to: int :param flatten: If set True, audio data will be flatten to be a 1-dim ndarray. Otherwise, 2-dim ndarray. Default is False. 
    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sort_by_duration=True,
                             shuffle=False):
        """
        Batch data reader creator for audio data. Create a callable function
        to produce batches of data.

        Audio features will be padded with zeros to make each instance in
        the batch share the same audio feature shape.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param batch_size: Instance number in a batch.
        :type batch_size: int
        :param padding_to: If set -1, the maximum column number in the batch
                           will be used as the target size for padding.
                           Otherwise, `padding_to` will be the target size.
                           Default is -1.
        :type padding_to: int
        :param flatten: If set True, audio data will be flattened into a
                        1-dim ndarray. Otherwise, 2-dim ndarray. Default is
                        False.
        :type flatten: bool
        :param sort_by_duration: Sort the audio clips by duration if set True
                                 (for SortaGrad).
        :type sort_by_duration: bool
        :param shuffle: Shuffle the audio clips if set True.
        :type shuffle: bool
        :return: Batch reader function, producing batches of data when
                 called.
        :rtype: callable
        """

        def batch_reader():
            instance_reader = self.instance_reader_creator(
                manifest_path=manifest_path,
                sort_by_duration=sort_by_duration,
                shuffle=shuffle)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self.__padding_batch__(batch, padding_to, flatten)
                    batch = []
            if len(batch) > 0:
                yield self.__padding_batch__(batch, padding_to, flatten)

        return batch_reader

    def vocabulary_size(self):
        """
        Get vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self.__vocab_list__)

    def vocabulary_dict(self):
        """
        Get vocabulary as a dict.

        :return: Vocabulary in dict.
        :rtype: dict
        """
        return self.__vocab_dict__

    def vocabulary_list(self):
        """
        Get vocabulary as a list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self.__vocab_list__

    def data_name_feeding(self):
        """
        Get feeding dict (mapping each data field name to its field index).

        :return: Feeding dict.
        :rtype: dict
        """
        feeding = {
            "audio_spectrogram": 0,
            "transcript_text": 1,
        }
        return feeding
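
if __name__ == "__main__":
    # A minimal end-to-end sketch of the batch-level interface. The
    # vocabulary and manifest paths below are hypothetical placeholders;
    # substitute files from your own dataset.
    generator = DataGenerator(
        vocab_filepath="eng_vocab.txt",
        normalizer_manifest_path="manifest.train")
    batch_reader = generator.batch_reader_creator(
        manifest_path="manifest.train",
        batch_size=16,
        sort_by_duration=True,
        shuffle=False)
    for batch in batch_reader():
        # each batch is a list of (padded_spectrogram, transcript) tuples,
        # with field order matching data_name_feeding()
        print(len(batch))
        break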