Merge branch 'develop' of https://github.com/PaddlePaddle/models into fix-81
commit 86eeb52c06
@ -1,411 +0,0 @@
"""
Providing basic audio data preprocessing pipeline, and offering
both instance-level and batch-level data reader interfaces.
"""
import paddle.v2 as paddle
import logging
import json
import random
import soundfile
import numpy as np
import itertools
import os

RANDOM_SEED = 0
logger = logging.getLogger(__name__)


class DataGenerator(object):
    """
    DataGenerator provides a basic audio data preprocessing pipeline, and
    offers both instance-level and batch-level data reader interfaces.
    Normalized FFT features are used as audio features here.

    :param vocab_filepath: Vocabulary file path for indexing tokenized
                           transcriptions.
    :type vocab_filepath: basestring
    :param normalizer_manifest_path: Manifest filepath for collecting feature
                                     normalization statistics, e.g. mean, std.
    :type normalizer_manifest_path: basestring
    :param normalizer_num_samples: Number of instances sampled for collecting
                                   feature normalization statistics.
                                   Default is 100.
    :type normalizer_num_samples: int
    :param max_duration: Audio clips with duration (in seconds) greater than
                         this will be discarded. Default is 20.0.
    :type max_duration: float
    :param min_duration: Audio clips with duration (in seconds) smaller than
                         this will be discarded. Default is 0.0.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
                      Default is 10.0.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
    :type window_ms: float
    :param max_frequency: Maximum frequency for FFT features. FFT features of
                          frequency larger than this will be discarded.
                          If set to None, all features will be kept.
                          Default is None.
    :type max_frequency: float
    """

    def __init__(self,
                 vocab_filepath,
                 normalizer_manifest_path,
                 normalizer_num_samples=100,
                 max_duration=20.0,
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_frequency=None):
        self.__max_duration__ = max_duration
        self.__min_duration__ = min_duration
        self.__stride_ms__ = stride_ms
        self.__window_ms__ = window_ms
        self.__max_frequency__ = max_frequency
        self.__epoch__ = 0
        self.__random__ = random.Random(RANDOM_SEED)
        # load vocabulary (dictionary)
        self.__vocab_dict__, self.__vocab_list__ = \
            self.__load_vocabulary_from_file__(vocab_filepath)
        # collect normalizer statistics
        self.__mean__, self.__std__ = self.__collect_normalizer_statistics__(
            manifest_path=normalizer_manifest_path,
            num_samples=normalizer_num_samples)

    def __audio_featurize__(self, audio_filename):
        """
        Preprocess audio data, including feature extraction, normalization etc.
        """
        features = self.__audio_basic_featurize__(audio_filename)
        return self.__normalize__(features)

    def __text_featurize__(self, text):
        """
        Preprocess text data, including tokenizing and token indexing etc.
        """
        return self.__convert_text_to_char_index__(
            text=text, vocabulary=self.__vocab_dict__)

    def __audio_basic_featurize__(self, audio_filename):
        """
        Compute basic (without normalization etc.) features for audio data.
        """
        return self.__spectrogram_from_file__(
            filename=audio_filename,
            stride_ms=self.__stride_ms__,
            window_ms=self.__window_ms__,
            max_freq=self.__max_frequency__)

    def __collect_normalizer_statistics__(self, manifest_path, num_samples=100):
        """
        Compute feature normalization statistics, i.e. mean and stddev.
        """
        # read manifest
        manifest = self.__read_manifest__(
            manifest_path=manifest_path,
            max_duration=self.__max_duration__,
            min_duration=self.__min_duration__)
        # sample for statistics
        sampled_manifest = self.__random__.sample(manifest, num_samples)
        # extract spectrogram feature
        features = []
        for instance in sampled_manifest:
            spectrogram = self.__audio_basic_featurize__(
                instance["audio_filepath"])
            features.append(spectrogram)
        features = np.hstack(features)
        mean = np.mean(features, axis=1).reshape([-1, 1])
        std = np.std(features, axis=1).reshape([-1, 1])
        return mean, std

    def __normalize__(self, features, eps=1e-14):
        """
        Normalize features to be of zero mean and unit stddev.
        """
        return (features - self.__mean__) / (self.__std__ + eps)

    def __spectrogram_from_file__(self,
                                  filename,
                                  stride_ms=10.0,
                                  window_ms=20.0,
                                  max_freq=None,
                                  eps=1e-14):
        """
        Load audio data and calculate the log of spectrogram by FFT.
        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
        """
        audio, sample_rate = soundfile.read(filename)
        if audio.ndim >= 2:
            audio = np.mean(audio, 1)
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        spectrogram, freqs = self.__extract_spectrogram__(
            audio,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(spectrogram[:ind, :] + eps)

    def __extract_spectrogram__(self, samples, window_size, stride_size,
                                sample_rate):
        """
        Compute the spectrogram by FFT for a discrete real signal.
        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
        """
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)**2
        scale = np.sum(weighting**2) * sample_rate
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

    def __load_vocabulary_from_file__(self, vocabulary_path):
        """
        Load vocabulary from file.
        """
        if not os.path.exists(vocabulary_path):
            raise ValueError("Vocabulary file %s not found." % vocabulary_path)
        vocab_lines = []
        with open(vocabulary_path, 'r') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
            [(token, id) for (id, token) in enumerate(vocab_list)])
        return vocab_dict, vocab_list

    def __convert_text_to_char_index__(self, text, vocabulary):
        """
        Convert a text string to a list of character index integers.
        """
        return [vocabulary[w] for w in text]

    def __read_manifest__(self, manifest_path, max_duration, min_duration):
        """
        Load and parse manifest file.
        """
        manifest = []
        for json_line in open(manifest_path):
            try:
                json_data = json.loads(json_line)
            except Exception as e:
                raise ValueError("Error reading manifest: %s" % str(e))
            if (json_data["duration"] <= max_duration and
                    json_data["duration"] >= min_duration):
                manifest.append(json_data)
        return manifest

    def __padding_batch__(self, batch, padding_to=-1, flatten=False):
        """
        Pad the audio part of the features (only in the time axis -- the
        column axis) with zeros, so that every instance in the batch shares
        the same audio feature shape.

        If `padding_to` is set to -1, the maximum column number in the batch
        will be used as the target size. Otherwise, `padding_to` will be the
        target size. Default is -1.

        If `flatten` is set to True, audio data will be flattened into a
        1-dim ndarray. Default is False.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be "
                                 "greater than or equal to the longest "
                                 "instance length in the batch.")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def __batch_shuffle__(self, manifest, batch_size):
        """
        The instances have different lengths and cannot be combined into a
        single matrix multiplication. The usual practice is to sort the
        training examples by length, combine only similarly-sized instances
        into minibatches, and pad with silence where necessary so that all
        instances in a batch have the same length. This batch shuffle
        function groups similarly-sized instances into minibatches and
        performs a batch-wise shuffle:

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances so that different epochs produce
           different minibatches, then form minibatches of size batch_size.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generating
                           a random number for batch shuffle.
        :type batch_size: int
        :return: Batch shuffled manifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self.__random__.randint(0, batch_size - 1)
        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
        self.__random__.shuffle(batch_manifest)
        batch_manifest = list(sum(batch_manifest, ()))
        res_len = len(manifest) - shift_len - len(batch_manifest)
        if res_len > 0:  # guard: manifest[-0:] would duplicate the whole list
            batch_manifest.extend(manifest[-res_len:])
        batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest

    def instance_reader_creator(self, manifest):
        """
        Instance reader creator for audio data. Create a callable function to
        produce instances of data.

        Instance: a tuple of a numpy ndarray of audio spectrogram and a list
        of tokenized and indexed transcription text.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :return: Data reader function.
        :rtype: callable
        """

        def reader():
            # extract spectrogram feature
            for instance in manifest:
                spectrogram = self.__audio_featurize__(
                    instance["audio_filepath"])
                transcript = self.__text_featurize__(instance["text"])
                yield (spectrogram, transcript)

        return reader

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             batch_shuffle=False):
        """
        Batch data reader creator for audio data. Create a callable function
        to produce batches of data.

        Audio features will be padded with zeros so that every instance in
        the batch shares the same audio feature shape.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param padding_to: If set to -1, the maximum column number in the
                           batch will be used as the target size for padding.
                           Otherwise, `padding_to` will be the target size.
                           Default is -1.
        :type padding_to: int
        :param flatten: If set to True, audio data will be flattened into a
                        1-dim ndarray. Otherwise, a 2-dim ndarray. Default is
                        False.
        :type flatten: bool
        :param sortagrad: Sort the audio clips by duration in the first epoch
                          if set to True.
        :type sortagrad: bool
        :param batch_shuffle: Shuffle the audio clips if set to True. It is
                              not a thorough instance-wise shuffle, but a
                              specific batch-wise shuffle. For more details,
                              please see the `__batch_shuffle__` function.
        :type batch_shuffle: bool
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """

        def batch_reader():
            # read manifest
            manifest = self.__read_manifest__(
                manifest_path=manifest_path,
                max_duration=self.__max_duration__,
                min_duration=self.__min_duration__)

            # sort (by duration) or shuffle manifest
            if self.__epoch__ == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            elif batch_shuffle:
                manifest = self.__batch_shuffle__(manifest, batch_size)

            instance_reader = self.instance_reader_creator(manifest)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self.__padding_batch__(batch, padding_to, flatten)
                    batch = []
            if len(batch) > 0:
                yield self.__padding_batch__(batch, padding_to, flatten)
            self.__epoch__ += 1

        return batch_reader

    def vocabulary_size(self):
        """
        Get vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self.__vocab_list__)

    def vocabulary_dict(self):
        """
        Get vocabulary as a dict.

        :return: Vocabulary dict.
        :rtype: dict
        """
        return self.__vocab_dict__

    def vocabulary_list(self):
        """
        Get vocabulary as a list.

        :return: Vocabulary list.
        :rtype: list
        """
        return self.__vocab_list__

    def data_name_feeding(self):
        """
        Get feedings (data field name and corresponding field id).

        :return: Feeding dict.
        :rtype: dict
        """
        feeding = {
            "audio_spectrogram": 0,
            "transcript_text": 1,
        }
        return feeding
@ -0,0 +1,57 @@
"""Compute mean and std for feature normalizer, and save to file."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from data_utils.normalizer import FeatureNormalizer
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.audio_featurizer import AudioFeaturizer

parser = argparse.ArgumentParser(
    description='Computing mean and stddev for feature normalizer.')
parser.add_argument(
    "--manifest_path",
    default='datasets/manifest.train',
    type=str,
    help="Manifest path for computing normalizer's mean and stddev. "
    "(default: %(default)s)")
parser.add_argument(
    "--num_samples",
    default=2000,
    type=int,
    help="Number of samples for computing mean and stddev. "
    "(default: %(default)s)")
parser.add_argument(
    "--augmentation_config",
    default='{}',
    type=str,
    help="Augmentation configuration in json format. "
    "(default: %(default)s)")
parser.add_argument(
    "--output_file",
    default='mean_std.npz',
    type=str,
    help="Filepath to write mean and std to (.npz). "
    "(default: %(default)s)")
args = parser.parse_args()


def main():
    augmentation_pipeline = AugmentationPipeline(args.augmentation_config)
    audio_featurizer = AudioFeaturizer()

    def augment_and_featurize(audio_segment):
        augmentation_pipeline.transform_audio(audio_segment)
        return audio_featurizer.featurize(audio_segment)

    normalizer = FeatureNormalizer(
        mean_std_filepath=None,
        manifest_path=args.manifest_path,
        featurize_func=augment_and_featurize,
        num_samples=args.num_samples)
    normalizer.write_to_file(args.output_file)


if __name__ == '__main__':
    main()
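
The .npz written above is what FeatureNormalizer loads at training time. A minimal sketch of the downstream consumption (the 'mean_std.npz' filename matches the script's default; the commented apply() call assumes a spectrogram array is at hand):

    from data_utils.normalizer import FeatureNormalizer

    # load pre-computed statistics instead of recomputing from a manifest
    normalizer = FeatureNormalizer(mean_std_filepath='mean_std.npz')
    # specgram = normalizer.apply(specgram)  # zero mean, unit stddev per bin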
@ -0,0 +1,252 @@
"""Contains the audio segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import io
import soundfile


class AudioSegment(object):
    """Monaural audio segment abstraction.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate):
        """Create audio segment from samples.

        Samples are converted to float32 internally, with ints scaled to
        [-1, 1].
        """
        self._samples = self._convert_samples_to_float32(samples)
        self._sample_rate = sample_rate
        if self._samples.ndim >= 2:
            self._samples = np.mean(self._samples, 1)

    def __eq__(self, other):
        """Return whether two objects are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    def __str__(self):
        """Return a human-readable representation of the segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
                                self.duration, self.rms_db))

    @classmethod
    def from_file(cls, file):
        """Create audio segment from audio file.

        :param file: Filepath or file object to audio file.
        :type file: basestring|file
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        samples, sample_rate = soundfile.read(file, dtype='float32')
        return cls(samples, sample_rate)

    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        samples, sample_rate = soundfile.read(
            io.BytesIO(bytes), dtype='float32')
        return cls(samples, sample_rate)

    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.

        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: basestring|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        subtype_map = {
            'int16': 'PCM_16',
            'int32': 'PCM_32',
            'float32': 'FLOAT',
            'float64': 'DOUBLE'
        }
        soundfile.write(
            filepath,
            samples,
            self._sample_rate,
            format='WAV',
            subtype=subtype_map[dtype])

    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.

        :param dtype: Data type for exported samples. Options: 'int16',
                      'int32', 'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: str
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tostring()

    def apply_gain(self, gain):
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.

        :param gain: Gain in decibels to apply to samples.
        :type gain: float
        """
        self._samples *= 10.**(gain / 20.)

    def change_speed(self, speed_rate):
        """Change the audio speed by linear interpolation.

        Note that this is an in-place transformation.

        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
                           speed_rate < 1.0, slow down the audio;
                           speed_rate <= 0.0, not allowed, raise ValueError.
        :type speed_rate: float
        :raises ValueError: If speed_rate <= 0.0.
        """
        if speed_rate <= 0:
            raise ValueError("speed_rate should be greater than zero.")
        old_length = self._samples.shape[0]
        new_length = int(old_length / speed_rate)
        old_indices = np.arange(old_length)
        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
        self._samples = np.interp(new_indices, old_indices, self._samples)

    def normalize(self, target_sample_rate):
        raise NotImplementedError()

    def resample(self, target_sample_rate):
        raise NotImplementedError()

    def pad_silence(self, duration, sides='both'):
        raise NotImplementedError()

    def subsegment(self, start_sec=None, end_sec=None):
        raise NotImplementedError()

    def convolve(self, filter, allow_resample=False):
        raise NotImplementedError()

    def convolve_and_normalize(self, filter, allow_resample=False):
        raise NotImplementedError()

    @property
    def samples(self):
        """Return audio samples.

        :return: Audio samples.
        :rtype: ndarray
        """
        return self._samples.copy()

    @property
    def sample_rate(self):
        """Return audio sample rate.

        :return: Audio sample rate.
        :rtype: int
        """
        return self._sample_rate

    @property
    def num_samples(self):
        """Return number of samples.

        :return: Number of samples.
        :rtype: int
        """
        return self._samples.shape[0]

    @property
    def duration(self):
        """Return audio duration.

        :return: Audio duration in seconds.
        :rtype: float
        """
        return self._samples.shape[0] / float(self._sample_rate)

    @property
    def rms_db(self):
        """Return root mean square energy of the audio in decibels.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # square root => multiply by 10 instead of 20 for dBs
        mean_square = np.mean(self._samples**2)
        return 10 * np.log10(mean_square)

    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2**(bits - 1))
        elif samples.dtype in np.sctypes['float']:
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.

        Audio sample type is usually integer or float-point. For integer
        types, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.

        This is for writing an audio file.
        """
        dtype = np.dtype(dtype)
        output_samples = samples.copy()
        if dtype in np.sctypes['int']:
            bits = np.iinfo(dtype).bits
            output_samples *= (2**(bits - 1) / 1.)
            min_val = np.iinfo(dtype).min
            max_val = np.iinfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        elif dtype in np.sctypes['float']:
            min_val = np.finfo(dtype).min
            max_val = np.finfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        else:
            raise TypeError("Unsupported sample type: %s." % dtype)
        return output_samples.astype(dtype)
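
A short sketch exercising the segment API above on synthetic audio (the tone parameters are illustrative):

    import numpy as np
    from data_utils.audio import AudioSegment

    # one second of a 440 Hz tone at 16 kHz, amplitude 0.3
    t = np.arange(16000) / 16000.0
    segment = AudioSegment((0.3 * np.sin(2 * np.pi * 440 * t)).astype('float32'),
                           sample_rate=16000)
    print(segment.num_samples, segment.duration)  # 16000, 1.0
    print(round(segment.rms_db, 1))               # ~ -13.5 dB for a 0.3 sine
    segment.apply_gain(-6.0)                      # scale samples by 10**(-6/20.)
    segment.change_speed(2.0)                     # ~0.5 s via linear interpolation
    print(round(segment.duration, 2))             # ~0.5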
@ -0,0 +1,80 @@
"""Contains the data augmentation pipeline."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor


class AugmentationPipeline(object):
    """Build a pre-processing pipeline with various augmentation models. Such
    a data augmentation pipeline is often leveraged to augment the training
    samples, making the model invariant to certain types of perturbations in
    the real world and improving its generalization ability.

    The pipeline is built according to the augmentation configuration in json
    string, e.g.

    .. code-block::

        '[{"type": "volume",
           "params": {"min_gain_dBFS": -15,
                      "max_gain_dBFS": 15},
           "prob": 0.5},
          {"type": "speed",
           "params": {"min_speed_rate": 0.8,
                      "max_speed_rate": 1.2},
           "prob": 0.5}]'

    This augmentation configuration inserts two augmentation models into the
    pipeline, one a VolumePerturbAugmentor and the other a
    SpeedPerturbAugmentor. "prob" indicates the probability of the current
    augmentor to take effect.

    :param augmentation_config: Augmentation configuration in json string.
    :type augmentation_config: str
    :param random_seed: Random seed.
    :type random_seed: int
    :raises ValueError: If the augmentation json config is in an incorrect
                        format.
    """

    def __init__(self, augmentation_config, random_seed=0):
        self._rng = random.Random(random_seed)
        self._augmentors, self._rates = self._parse_pipeline_from(
            augmentation_config)

    def transform_audio(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            if self._rng.uniform(0., 1.) <= rate:
                augmentor.transform_audio(audio_segment)

    def _parse_pipeline_from(self, config_json):
        """Parse the config json to build an augmentation pipeline."""
        try:
            configs = json.loads(config_json)
            augmentors = [
                self._get_augmentor(config["type"], config["params"])
                for config in configs
            ]
            rates = [config["prob"] for config in configs]
        except Exception as e:
            raise ValueError("Failed to parse the augmentation config json: "
                             "%s" % str(e))
        return augmentors, rates

    def _get_augmentor(self, augmentor_type, params):
        """Return an augmentation model by the type name, and pass in params."""
        if augmentor_type == "volume":
            return VolumePerturbAugmentor(self._rng, **params)
        else:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
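
A minimal usage sketch. Note that only the "volume" type is registered in _get_augmentor at this point, so the "speed" entry from the docstring example would raise ValueError here; the config below sticks to volume:

    import numpy as np
    from data_utils.audio import AudioSegment
    from data_utils.augmentor.augmentation import AugmentationPipeline

    config = ('[{"type": "volume", '
              '"params": {"min_gain_dBFS": -15, "max_gain_dBFS": 15}, '
              '"prob": 1.0}]')
    pipeline = AugmentationPipeline(config, random_seed=0)
    segment = AudioSegment(
        np.random.uniform(-0.1, 0.1, 16000).astype('float32'), 16000)
    pipeline.transform_audio(segment)  # in-place volume perturbation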
@ -0,0 +1,33 @@
"""Contains the abstract base class for augmentation models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from abc import ABCMeta, abstractmethod


class AugmentorBase(object):
    """Abstract base class for augmentation model (augmentor) classes.
    All augmentor classes should inherit from this class, and implement the
    following abstract methods.
    """

    __metaclass__ = ABCMeta

    @abstractmethod
    def __init__(self):
        pass

    @abstractmethod
    def transform_audio(self, audio_segment):
        """Add various effects to the input audio segment. Such effects
        will augment the training data to make the model invariant to certain
        types of perturbations in the real world, improving the model's
        generalization ability.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        pass
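
A sketch of what a new augmentor subclass could look like. The NoisePerturbAugmentor name and its white-noise behavior are illustrative only (not part of this commit), and writing to the private _samples buffer is an assumption since AudioSegment exposes no public sample setter:

    import numpy as np
    from data_utils.augmentor.base import AugmentorBase


    class NoisePerturbAugmentor(AugmentorBase):
        """Hypothetical augmentor adding white Gaussian noise in-place."""

        def __init__(self, rng, min_snr_dB, max_snr_dB):
            self._rng = rng
            self._min_snr_dB = min_snr_dB
            self._max_snr_dB = max_snr_dB

        def transform_audio(self, audio_segment):
            snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
            noise_gain = 10. ** ((audio_segment.rms_db - snr_dB) / 20.)
            noise = noise_gain * np.random.standard_normal(
                audio_segment.num_samples)
            # samples returns a copy, so mutate the private buffer directly
            audio_segment._samples = audio_segment.samples + noise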
@ -0,0 +1,40 @@
"""Contains the volume perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase


class VolumePerturbAugmentor(AugmentorBase):
    """Augmentation model for adding random volume perturbation.

    This is used for multi-loudness training of PCEN. See

    https://arxiv.org/pdf/1607.05666v1.pdf

    for more details.

    :param rng: Random generator object.
    :type rng: random.Random
    :param min_gain_dBFS: Minimal gain in dBFS.
    :type min_gain_dBFS: float
    :param max_gain_dBFS: Maximal gain in dBFS.
    :type max_gain_dBFS: float
    """

    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
        self._min_gain_dBFS = min_gain_dBFS
        self._max_gain_dBFS = max_gain_dBFS
        self._rng = rng

    def transform_audio(self, audio_segment):
        """Change audio loudness.

        Note that this is an in-place transformation.

        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
        audio_segment.apply_gain(gain)
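
To make the dB arithmetic concrete, a sketch with illustrative values: a gain of -6 dB scales the samples by 10^(-6/20) ~= 0.501, roughly halving the amplitude.

    import random
    import numpy as np
    from data_utils.audio import AudioSegment
    from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor

    rng = random.Random(0)
    augmentor = VolumePerturbAugmentor(rng, min_gain_dBFS=-6, max_gain_dBFS=-6)
    segment = AudioSegment(np.ones(8, dtype='float32') * 0.5, sample_rate=8000)
    augmentor.transform_audio(segment)
    print(segment.samples[0])  # ~0.25, i.e. 0.5 * 10**(-6/20.)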
@ -0,0 +1,273 @@
"""Contains the data generator for organizing various audio data preprocessing
pipelines and offering a data reader interface meeting PaddlePaddle's
requirements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import numpy as np
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer


class DataGenerator(object):
    """
    DataGenerator provides a basic audio data preprocessing pipeline, and
    offers data reader interfaces meeting PaddlePaddle's requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: basestring
    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|basestring
    :param augmentation_config: Augmentation configuration in json string.
                                For details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear'; only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param random_seed: Random seed.
    :type random_seed: int
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 random_seed=0):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq)
        self._rng = random.Random(random_seed)
        self._epoch = 0

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             min_batch_size=1,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             shuffle_method="batch_shuffle"):
        """
        Batch data reader creator for audio data. Returns a callable generator
        function to produce batches of data.

        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: basestring
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param min_batch_size: Any batch with a batch size smaller than this
                               will be discarded. (To be deprecated in the
                               future.)
        :type min_batch_size: int
        :param padding_to: If set to -1, the maximum shape in the batch
                           will be used as the target shape for padding.
                           Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set to True, audio features will be flattened into
                        a 1darray.
        :type flatten: bool
        :param sortagrad: If set to True, sort the instances by audio duration
                          in the first epoch to speed up training.
        :type sortagrad: bool
        :param shuffle_method: Shuffle method. Options:
                               '' or None: no shuffle.
                               'instance_shuffle': instance-wise shuffle.
                               'batch_shuffle': similarly-sized instances are
                                                put into batches, and then the
                                                batches are shuffled
                                                batch-wise. For more details,
                                                please see
                                                ``_batch_shuffle.__doc__``.
                               'batch_shuffle_clipped': 'batch_shuffle' with
                                                        head shift and tail
                                                        clipping. For more
                                                        details, please see
                                                        ``_batch_shuffle``.
                               If sortagrad is True, shuffle is disabled
                               for the first epoch.
        :type shuffle_method: None|str
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """

        def batch_reader():
            # read manifest
            manifest = utils.read_manifest(
                manifest_path=manifest_path,
                max_duration=self._max_duration,
                min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            else:
                if shuffle_method == "batch_shuffle":
                    manifest = self._batch_shuffle(
                        manifest, batch_size, clipped=False)
                elif shuffle_method == "batch_shuffle_clipped":
                    manifest = self._batch_shuffle(
                        manifest, batch_size, clipped=True)
                elif shuffle_method == "instance_shuffle":
                    self._rng.shuffle(manifest)
                elif not shuffle_method:
                    pass
                else:
                    raise ValueError("Unknown shuffle method %s." %
                                     shuffle_method)
            # prepare batches
            instance_reader = self._instance_reader_creator(manifest)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            if len(batch) >= min_batch_size:
                yield self._padding_batch(batch, padding_to, flatten)
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns the data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        return {"audio_spectrogram": 0, "transcript_text": 1}

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize for speech data."""
        speech_segment = SpeechSegment.from_file(filename, transcript)
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of an ndarray of audio spectrogram and a list of
        token indices for the transcript.
        """

        def reader():
            for instance in manifest:
                yield self._process_utterance(instance["audio_filepath"],
                                              instance["text"])

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        Pad audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, ``padding_to`` will be
        the target shape (only refers to the second axis).

        If ``flatten`` is True, features will be flattened into a 1darray.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch.")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def _batch_shuffle(self, manifest, batch_size, clipped=False):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generating
                           a random number for batch shuffle.
        :type batch_size: int
        :param clipped: Whether to clip the heading (small shift) and trailing
                        (incomplete batch) instances.
        :type clipped: bool
        :return: Batch shuffled manifest.
        :rtype: list
        """
        manifest.sort(key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
        self._rng.shuffle(batch_manifest)
        batch_manifest = list(sum(batch_manifest, ()))
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            if res_len > 0:  # guard: manifest[-0:] would duplicate the list
                batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
        return batch_manifest
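
A minimal usage sketch tying the pieces together. The module path data_utils.data and both file paths are assumptions for illustration; vocab.txt would come from dataset preparation and mean_std.npz from the compute-mean-std script above:

    from data_utils.data import DataGenerator  # module path is an assumption

    generator = DataGenerator(
        vocab_filepath='datasets/vocab/eng_vocab.txt',  # hypothetical path
        mean_std_filepath='mean_std.npz',
        augmentation_config='{}')
    batch_reader = generator.batch_reader_creator(
        manifest_path='datasets/manifest.train',        # hypothetical path
        batch_size=16,
        sortagrad=True,
        shuffle_method='batch_shuffle_clipped')
    for batch in batch_reader():
        # each batch: list of (padded_spectrogram, token_id_list) tuples
        pass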
@ -0,0 +1,106 @@
"""Contains the audio featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from data_utils import utils
from data_utils.audio import AudioSegment


class AudioFeaturizer(object):
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

    Currently, it only supports the feature type of linear spectrogram.

    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear'; only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    """

    def __init__(self,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None):
        self._specgram_type = specgram_type
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq

    def featurize(self, audio_segment):
        """Extract audio features from AudioSegment or SpeechSegment.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        """
        return self._compute_specgram(audio_segment.samples,
                                      audio_segment.sample_rate)

    def _compute_specgram(self, samples, sample_rate):
        """Extract various audio features."""
        if self._specgram_type == 'linear':
            return self._compute_linear_specgram(
                samples, sample_rate, self._stride_ms, self._window_ms,
                self._max_freq)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy."""
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(specgram[:ind, :] + eps)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal."""
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)**2
        scale = np.sum(weighting**2) * sample_rate
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs
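
As a worked sanity check on the framing arithmetic above: at 16 kHz with stride_ms=10 and window_ms=20, stride_size=160 and window_size=320 samples, so one second of audio yields (16000 - 320) // 160 + 1 = 99 frames, and rfft over a 320-sample window gives 320/2 + 1 = 161 frequency bins. A sketch with synthetic input:

    import numpy as np
    from data_utils.audio import AudioSegment
    from data_utils.featurizer.audio_featurizer import AudioFeaturizer

    featurizer = AudioFeaturizer(specgram_type='linear')
    segment = AudioSegment(
        np.random.uniform(-0.1, 0.1, 16000).astype('float32'),
        sample_rate=16000)
    specgram = featurizer.featurize(segment)
    print(specgram.shape)  # (161, 99): freq bins x frames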
@ -0,0 +1,77 @@
"""Contains the speech featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from data_utils.featurizer.text_featurizer import TextFeaturizer


class SpeechFeaturizer(object):
    """Speech featurizer, for extracting features from both audio and
    transcript contents of SpeechSegment.

    Currently, for audio parts, it only supports the feature type of linear
    spectrogram; for transcript parts, it only supports char-level tokenizing
    and conversion into a list of token indices. Note that the token indexing
    order follows the given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: basestring
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear'; only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    """

    def __init__(self,
                 vocab_filepath,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None):
        self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms,
                                                 window_ms, max_freq)
        self._text_featurizer = TextFeaturizer(vocab_filepath)

    def featurize(self, speech_segment):
        """Extract features for a speech segment.

        1. For the audio part, extract the audio features.
        2. For the transcript part, convert the text string to a list of
           char-level token indices.

        :param speech_segment: Speech segment to extract features from.
        :type speech_segment: SpeechSegment
        :return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of
                 char-level token indices.
        :rtype: tuple
        """
        audio_feature = self._audio_featurizer.featurize(speech_segment)
        text_ids = self._text_featurizer.featurize(speech_segment.transcript)
        return audio_feature, text_ids

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._text_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._text_featurizer.vocab_list
@ -0,0 +1,67 @@
"""Contains the text featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os


class TextFeaturizer(object):
    """Text featurizer, for processing or extracting features from text.

    Currently, it only supports char-level tokenizing and conversion into
    a list of token indices. Note that the token indexing order follows the
    given vocabulary file.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: basestring
    """

    def __init__(self, vocab_filepath):
        self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
            vocab_filepath)

    def featurize(self, text):
        """Convert text string to a list of char-level token indices. Note
        that the token indexing order follows the given vocabulary file.

        :param text: Text to process.
        :type text: basestring
        :return: List of char-level token indices.
        :rtype: list
        """
        tokens = self._char_tokenize(text)
        return [self._vocab_dict[token] for token in tokens]

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self._vocab_list)

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._vocab_list

    def _char_tokenize(self, text):
        """Character tokenizer."""
        return list(text.strip())

    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
        with open(vocab_filepath, 'r') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
            [(token, id) for (id, token) in enumerate(vocab_list)])
        return vocab_dict, vocab_list
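
A quick sketch of the char-level indexing: the vocabulary is one token per line, and each character maps to its line index. The temp-file vocabulary here is illustrative:

    import tempfile
    from data_utils.featurizer.text_featurizer import TextFeaturizer

    # hypothetical three-token vocabulary: one token per line
    vocab_file = tempfile.NamedTemporaryFile(
        mode='w', suffix='.txt', delete=False)
    vocab_file.write("a\nb\nc\n")
    vocab_file.close()

    featurizer = TextFeaturizer(vocab_file.name)
    print(featurizer.featurize("cab"))  # [2, 0, 1]
    print(featurizer.vocab_size)        # 3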
@ -0,0 +1,87 @@
"""Contains feature normalizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import random
import data_utils.utils as utils
from data_utils.audio import AudioSegment


class FeatureNormalizer(object):
    """Feature normalizer. Normalize features to be of zero mean and unit
    stddev.

    If mean_std_filepath is provided (not None), the normalizer will directly
    initialize from the file. Otherwise, both manifest_path and featurize_func
    should be given for on-the-fly mean and stddev computing.

    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|basestring
    :param manifest_path: Manifest of instances for computing mean and stddev.
    :type manifest_path: None|basestring
    :param featurize_func: Function to extract features. It should be callable
                           with ``featurize_func(audio_segment)``.
    :type featurize_func: None|callable
    :param num_samples: Number of random samples for computing mean and stddev.
    :type num_samples: int
    :param random_seed: Random seed for sampling instances.
    :type random_seed: int
    :raises ValueError: If both mean_std_filepath and manifest_path
                        (or both mean_std_filepath and featurize_func) are
                        None.
    """

    def __init__(self,
                 mean_std_filepath,
                 manifest_path=None,
                 featurize_func=None,
                 num_samples=500,
                 random_seed=0):
        if not mean_std_filepath:
            if not (manifest_path and featurize_func):
                raise ValueError("If mean_std_filepath is None, manifest_path "
                                 "and featurize_func should not be None.")
            self._rng = random.Random(random_seed)
            self._compute_mean_std(manifest_path, featurize_func, num_samples)
        else:
            self._read_mean_std_from_file(mean_std_filepath)

    def apply(self, features, eps=1e-14):
        """Normalize features to be of zero mean and unit stddev.

        :param features: Input features to be normalized.
        :type features: ndarray
        :param eps: Added to stddev to provide numerical stability.
        :type eps: float
        :return: Normalized features.
        :rtype: ndarray
        """
        return (features - self._mean) / (self._std + eps)

    def write_to_file(self, filepath):
        """Write the mean and stddev to the file.

        :param filepath: File to write mean and stddev.
        :type filepath: basestring
        """
        np.savez(filepath, mean=self._mean, std=self._std)

    def _read_mean_std_from_file(self, filepath):
        """Load mean and std from file."""
        npzfile = np.load(filepath)
        self._mean = npzfile["mean"]
        self._std = npzfile["std"]

    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
        """Compute mean and std from randomly sampled instances."""
        manifest = utils.read_manifest(manifest_path)
        sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in sampled_manifest:
            features.append(
                featurize_func(
                    AudioSegment.from_file(instance["audio_filepath"])))
        features = np.hstack(features)
        self._mean = np.mean(features, axis=1).reshape([-1, 1])
        self._std = np.std(features, axis=1).reshape([-1, 1])
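
The per-frequency-bin statistics follow from the axis=1 reduction above: mean and std are computed per row, then broadcast back during apply. A toy sketch of the same arithmetic (values illustrative):

    import numpy as np

    # a toy 2x3 "feature" array: rows are frequency bins, columns are frames
    features = np.array([[1., 2., 3.], [10., 20., 30.]])
    mean = np.mean(features, axis=1).reshape([-1, 1])
    std = np.std(features, axis=1).reshape([-1, 1])
    normalized = (features - mean) / (std + 1e-14)
    print(normalized.round(2))  # each row now has zero mean and unit stddev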
@ -0,0 +1,75 @@
"""Contains the speech segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.audio import AudioSegment


class SpeechSegment(AudioSegment):
    """Speech segment abstraction, a subclass of AudioSegment,
    with an additional transcript.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate, transcript):
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript

    def __eq__(self, other):
        """Return whether two objects are equal."""
        if not AudioSegment.__eq__(self, other):
            return False
        if self._transcript != other._transcript:
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls, filepath, transcript):
        """Create speech segment from audio file and corresponding transcript.

        :param filepath: Filepath or file object to audio file.
        :type filepath: basestring|file
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.from_file(filepath)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def from_bytes(cls, bytes, transcript):
        """Create speech segment from a byte string and corresponding
        transcript.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript)

    @property
    def transcript(self):
        """Return the transcript text.

        :return: Transcript text for the speech.
        :rtype: basestring
        """
        return self._transcript
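
A brief sketch of constructing a SpeechSegment (the wav path and transcript are hypothetical):

    from data_utils.speech import SpeechSegment

    seg = SpeechSegment.from_file('datasets/sample.wav',  # hypothetical file
                                  'hello world')
    print(seg.duration, seg.sample_rate)
    print(seg.transcript)  # 'hello world'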
@ -0,0 +1,34 @@
"""Contains data helper functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json


def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.

    Instances with durations outside [min_duration, max_duration] will be
    filtered out.

    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
    :param min_duration: Minimal duration in seconds for instance filter.
    :type min_duration: float
    :return: Manifest parsing results. List of dict.
    :rtype: list
    :raises IOError: If parsing the manifest fails.
    """
    manifest = []
    for json_line in open(manifest_path):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        if (json_data["duration"] <= max_duration and
                json_data["duration"] >= min_duration):
            manifest.append(json_data)
    return manifest
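
For reference, each manifest line is a standalone JSON object whose keys match the fields read above. A sketch of building one line (the field values and path are illustrative):

    import json

    # one manifest line: audio_filepath, duration, text
    line = json.dumps({
        "audio_filepath": "/data/libri/1089-134686-0000.flac",  # hypothetical
        "duration": 10.13,
        "text": "he hoped there would be stew for dinner"
    })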
@ -0,0 +1,13 @@
cd librispeech
python librispeech.py
if [ $? -ne 0 ]; then
    echo "Prepare LibriSpeech failed. Terminated."
    exit 1
fi
cd -

cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test

echo "All done."
@ -0,0 +1,25 @@
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


def print_arguments(args):
    """Print argparse's arguments.

    Usage:

    .. code-block:: python

        parser = argparse.ArgumentParser()
        parser.add_argument("name", default="John", type=str, help="User name.")
        args = parser.parse_args()
        print_arguments(args)

    :param args: Input argparse.Namespace for printing.
    :type args: argparse.Namespace
    """
    print("----- Configuration Arguments -----")
    for arg, value in vars(args).iteritems():
        print("%s: %s" % (arg, value))
    print("------------------------------------")