You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
248 lines
10 KiB
248 lines
10 KiB
"""
|
|
Providing basic audio data preprocessing pipeline, and offering
|
|
both instance-level and batch-level data reader interfaces.
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import random
|
|
import numpy as np
|
|
import paddle.v2 as paddle
|
|
from data_utils import utils
|
|
from data_utils.augmentor.augmentation import AugmentationPipeline
|
|
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
|
|
from data_utils.audio import SpeechSegment
|
|
from data_utils.normalizer import FeatureNormalizer
|
|
|
|
|
|
class DataGenerator(object):
|
|
"""
|
|
DataGenerator provides basic audio data preprocessing pipeline, and offers
|
|
both instance-level and batch-level data reader interfaces.
|
|
Normalized FFT are used as audio features here.
|
|
|
|
:param vocab_filepath: Vocabulary file path for indexing tokenized
|
|
transcriptions.
|
|
:type vocab_filepath: basestring
|
|
:param normalizer_manifest_path: Manifest filepath for collecting feature
|
|
normalization statistics, e.g. mean, std.
|
|
:type normalizer_manifest_path: basestring
|
|
:param normalizer_num_samples: Number of instances sampled for collecting
|
|
feature normalization statistics.
|
|
Default is 100.
|
|
:type normalizer_num_samples: int
|
|
:param max_duration: Audio clips with duration (in seconds) greater than
|
|
this will be discarded. Default is 20.0.
|
|
:type max_duration: float
|
|
:param min_duration: Audio clips with duration (in seconds) smaller than
|
|
this will be discarded. Default is 0.0.
|
|
:type min_duration: float
|
|
:param stride_ms: Striding size (in milliseconds) for generating frames.
|
|
Default is 10.0.
|
|
:type stride_ms: float
|
|
:param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
|
|
:type window_ms: float
|
|
:param max_frequency: Maximun frequency for FFT features. FFT features of
|
|
frequency larger than this will be discarded.
|
|
If set None, all features will be kept.
|
|
Default is None.
|
|
:type max_frequency: float
|
|
"""
|
|
|
|
def __init__(self,
|
|
vocab_filepath,
|
|
mean_std_filepath,
|
|
augmentation_config='{}',
|
|
max_duration=float('inf'),
|
|
min_duration=0.0,
|
|
stride_ms=10.0,
|
|
window_ms=20.0,
|
|
max_freq=None,
|
|
random_seed=0):
|
|
self._max_duration = max_duration
|
|
self._min_duration = min_duration
|
|
self._normalizer = FeatureNormalizer(mean_std_filepath)
|
|
self._augmentation_pipeline = AugmentationPipeline(
|
|
augmentation_config=augmentation_config, random_seed=random_seed)
|
|
self._speech_featurizer = SpeechFeaturizer(
|
|
vocab_filepath=vocab_filepath,
|
|
stride_ms=stride_ms,
|
|
window_ms=window_ms,
|
|
max_freq=max_freq,
|
|
random_seed=random_seed)
|
|
self._rng = random.Random(random_seed)
|
|
self._epoch = 0
|
|
|
|
def batch_reader_creator(self,
|
|
manifest_path,
|
|
batch_size,
|
|
padding_to=-1,
|
|
flatten=False,
|
|
sortagrad=False,
|
|
batch_shuffle=False):
|
|
"""
|
|
Batch data reader creator for audio data. Creat a callable function to
|
|
produce batches of data.
|
|
|
|
Audio features will be padded with zeros to make each instance in the
|
|
batch to share the same audio feature shape.
|
|
|
|
:param manifest_path: Filepath of manifest for audio clip files.
|
|
:type manifest_path: basestring
|
|
:param batch_size: Instance number in a batch.
|
|
:type batch_size: int
|
|
:param padding_to: If set -1, the maximun column numbers in the batch
|
|
will be used as the target size for padding.
|
|
Otherwise, `padding_to` will be the target size.
|
|
Default is -1.
|
|
:type padding_to: int
|
|
:param flatten: If set True, audio data will be flatten to be a 1-dim
|
|
ndarray. Otherwise, 2-dim ndarray. Default is False.
|
|
:type flatten: bool
|
|
:param sortagrad: Sort the audio clips by duration in the first epoc
|
|
if set True.
|
|
:type sortagrad: bool
|
|
:param batch_shuffle: Shuffle the audio clips if set True. It is
|
|
not a thorough instance-wise shuffle, but a
|
|
specific batch-wise shuffle. For more details,
|
|
please see `_batch_shuffle` function.
|
|
:type batch_shuffle: bool
|
|
:return: Batch reader function, producing batches of data when called.
|
|
:rtype: callable
|
|
"""
|
|
|
|
def batch_reader():
|
|
# read manifest
|
|
manifest = utils.read_manifest(
|
|
manifest_path=manifest_path,
|
|
max_duration=self._max_duration,
|
|
min_duration=self._min_duration)
|
|
# sort (by duration) or batch-wise shuffle the manifest
|
|
if self._epoch == 0 and sortagrad:
|
|
manifest.sort(key=lambda x: x["duration"])
|
|
elif batch_shuffle:
|
|
manifest = self._batch_shuffle(manifest, batch_size)
|
|
# prepare batches
|
|
instance_reader = self._instance_reader_creator(manifest)
|
|
batch = []
|
|
for instance in instance_reader():
|
|
batch.append(instance)
|
|
if len(batch) == batch_size:
|
|
yield self._padding_batch(batch, padding_to, flatten)
|
|
batch = []
|
|
if len(batch) > 0:
|
|
yield self._padding_batch(batch, padding_to, flatten)
|
|
self._epoch += 1
|
|
|
|
return batch_reader
|
|
|
|
@property
|
|
def feeding(self):
|
|
"""Returns data_reader's feeding dict."""
|
|
return {"audio_spectrogram": 0, "transcript_text": 1}
|
|
|
|
@property
|
|
def vocab_size(self):
|
|
"""Returns vocabulary size."""
|
|
return self._speech_featurizer.vocab_size
|
|
|
|
@property
|
|
def vocab_list(self):
|
|
"""Returns vocabulary list."""
|
|
return self._speech_featurizer.vocab_list
|
|
|
|
def _process_utterance(self, filename, transcript):
|
|
speech_segment = SpeechSegment.from_file(filename, transcript)
|
|
self._augmentation_pipeline.transform_audio(speech_segment)
|
|
specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
|
|
specgram = self._normalizer.apply(specgram)
|
|
return specgram, text_ids
|
|
|
|
def _instance_reader_creator(self, manifest):
|
|
"""
|
|
Instance reader creator for audio data. Creat a callable function to
|
|
produce instances of data.
|
|
|
|
Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
|
|
tokenized and indexed transcription text.
|
|
|
|
:param manifest: Filepath of manifest for audio clip files.
|
|
:type manifest: basestring
|
|
:return: Data reader function.
|
|
:rtype: callable
|
|
"""
|
|
|
|
def reader():
|
|
for instance in manifest:
|
|
yield self._process_utterance(instance["audio_filepath"],
|
|
instance["text"])
|
|
|
|
return reader
|
|
|
|
def _padding_batch(self, batch, padding_to=-1, flatten=False):
|
|
"""
|
|
Padding audio part of features (only in the time axis -- column axis)
|
|
with zeros, to make each instance in the batch share the same
|
|
audio feature shape.
|
|
|
|
If `padding_to` is set -1, the maximun column numbers in the batch will
|
|
be used as the target size. Otherwise, `padding_to` will be the target
|
|
size. Default is -1.
|
|
|
|
If `flatten` is set True, audio data will be flatten to be a 1-dim
|
|
ndarray. Default is False.
|
|
"""
|
|
new_batch = []
|
|
# get target shape
|
|
max_length = max([audio.shape[1] for audio, text in batch])
|
|
if padding_to != -1:
|
|
if padding_to < max_length:
|
|
raise ValueError("If padding_to is not -1, it should be greater"
|
|
" or equal to the original instance length.")
|
|
max_length = padding_to
|
|
# padding
|
|
for audio, text in batch:
|
|
padded_audio = np.zeros([audio.shape[0], max_length])
|
|
padded_audio[:, :audio.shape[1]] = audio
|
|
if flatten:
|
|
padded_audio = padded_audio.flatten()
|
|
new_batch.append((padded_audio, text))
|
|
return new_batch
|
|
|
|
def _batch_shuffle(self, manifest, batch_size):
|
|
"""
|
|
The instances have different lengths and they cannot be
|
|
combined into a single matrix multiplication. It usually
|
|
sorts the training examples by length and combines only
|
|
similarly-sized instances into minibatches, pads with
|
|
silence when necessary so that all instances in a batch
|
|
have the same length. This batch shuffle fuction is used
|
|
to make similarly-sized instances into minibatches and
|
|
make a batch-wise shuffle.
|
|
|
|
1. Sort the audio clips by duration.
|
|
2. Generate a random number `k`, k in [0, batch_size).
|
|
3. Randomly remove `k` instances in order to make different mini-batches,
|
|
then make minibatches and each minibatch size is batch_size.
|
|
4. Shuffle the minibatches.
|
|
|
|
:param manifest: manifest file.
|
|
:type manifest: list
|
|
:param batch_size: Batch size. This size is also used for generate
|
|
a random number for batch shuffle.
|
|
:type batch_size: int
|
|
:return: batch shuffled mainifest.
|
|
:rtype: list
|
|
"""
|
|
manifest.sort(key=lambda x: x["duration"])
|
|
shift_len = self._rng.randint(0, batch_size - 1)
|
|
batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
|
|
self._rng.shuffle(batch_manifest)
|
|
batch_manifest = list(sum(batch_manifest, ()))
|
|
res_len = len(manifest) - shift_len - len(batch_manifest)
|
|
batch_manifest.extend(manifest[-res_len:])
|
|
batch_manifest.extend(manifest[0:shift_len])
|
|
return batch_manifest
|