# PaddleSpeech/data_utils/data.py

"""
    Providing basic audio data preprocessing pipeline, and offering
    both instance-level and batch-level data reader interfaces.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import numpy as np
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.audio import SpeechSegment
from data_utils.normalizer import FeatureNormalizer


class DataGenerator(object):
    """
    DataGenerator provides basic audio data preprocessing pipeline, and offers
    both instance-level and batch-level data reader interfaces.
    Normalized FFT are used as audio features here.

    :param vocab_filepath: Vocabulary file path for indexing tokenized
                           transcriptions.
    :type vocab_filepath: basestring
    :param mean_std_filepath: File path handed to `FeatureNormalizer`
                              (presumably holding the precomputed mean and
                              std used for feature normalization).
    :type mean_std_filepath: basestring
    :param augmentation_config: Augmentation configuration handed to
                                `AugmentationPipeline`. Default is '{}'
                                (presumably a JSON string meaning "no
                                augmentation" -- confirm against
                                `AugmentationPipeline`).
    :type augmentation_config: basestring
    :param max_duration: Audio clips with duration (in seconds) greater than
                         this will be discarded. Default is float('inf').
    :type max_duration: float
    :param min_duration: Audio clips with duration (in seconds) smaller than
                         this will be discarded. Default is 0.0.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
                      Default is 10.0.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
    :type window_ms: float
    :param max_freq: Maximum frequency for FFT features. FFT features of
                     frequency larger than this will be discarded.
                     If set None, all features will be kept.
                     Default is None.
    :type max_freq: None|float
    :param random_seed: Seed shared by the augmentation pipeline, the speech
                        featurizer and the internal random generator used
                        for batch shuffling. Default is 0.
    :type random_seed: int
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 random_seed=0):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq,
            random_seed=random_seed)
        # Private generator so that batch shuffling is reproducible and does
        # not interfere with the global `random` state.
        self._rng = random.Random(random_seed)
        # Number of completed passes over a batch reader; drives `sortagrad`.
        self._epoch = 0

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             batch_shuffle=False):
        """
        Batch data reader creator for audio data. Create a callable function
        to produce batches of data.

        Audio features will be padded with zeros to make each instance in the
        batch to share the same audio feature shape.

        The epoch counter is stored on this generator and is incremented each
        time a returned reader is fully consumed, so it is shared by every
        reader created from the same `DataGenerator`.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param batch_size: Instance number in a batch.
        :type batch_size: int
        :param padding_to:  If set -1, the maximum column numbers in the batch
                            will be used as the target size for padding.
                            Otherwise, `padding_to` will be the target size.
                            Default is -1.
        :type padding_to: int
        :param flatten: If set True, audio data will be flatten to be a 1-dim
                        ndarray. Otherwise, 2-dim ndarray. Default is False.
        :type flatten: bool
        :param sortagrad: Sort the audio clips by duration in the first epoch
                          if set True. In that epoch `batch_shuffle` is
                          not applied.
        :type sortagrad: bool
        :param batch_shuffle: Shuffle the audio clips if set True. It is
                              not a thorough instance-wise shuffle, but a
                              specific batch-wise shuffle. For more details,
                              please see `_batch_shuffle` function.
        :type batch_shuffle: bool
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """

        def batch_reader():
            # read manifest
            manifest = utils.read_manifest(
                manifest_path=manifest_path,
                max_duration=self._max_duration,
                min_duration=self._min_duration)
            # sort (by duration) or batch-wise shuffle the manifest
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            elif batch_shuffle:
                manifest = self._batch_shuffle(manifest, batch_size)
            # prepare batches
            instance_reader = self._instance_reader_creator(manifest)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            # emit the last, possibly smaller, batch
            if len(batch) > 0:
                yield self._padding_batch(batch, padding_to, flatten)
            # only reached when the reader has been fully consumed
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns data_reader's feeding dict."""
        return {"audio_spectrogram": 0, "transcript_text": 1}

    @property
    def vocab_size(self):
        """Returns vocabulary size."""
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Returns vocabulary list."""
        return self._speech_featurizer.vocab_list

    def _process_utterance(self, filename, transcript):
        """
        Load, augment, featurize and normalize a single utterance.

        :param filename: Audio file path, passed to `SpeechSegment.from_file`.
        :type filename: basestring
        :param transcript: Transcription text of the audio.
        :type transcript: basestring
        :return: Tuple of (normalized spectrogram, transcription token ids)
                 as produced by the featurizer and the normalizer.
        :rtype: tuple
        """
        speech_segment = SpeechSegment.from_file(filename, transcript)
        # the return value is ignored: the segment is expected to be
        # modified in place -- confirm against `AugmentationPipeline`
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator for audio data. Create a callable function to
        produce instances of data.

        Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
        tokenized and indexed transcription text.

        :param manifest: Manifest entries; each entry is a dict providing at
                         least the keys "audio_filepath" and "text".
        :type manifest: list
        :return: Data reader function.
        :rtype: callable
        """

        def reader():
            for instance in manifest:
                yield self._process_utterance(instance["audio_filepath"],
                                              instance["text"])

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        Padding audio part of features (only in the time axis -- column axis)
        with zeros, to make each instance in the batch share the same
        audio feature shape.

        If `padding_to` is set -1, the maximum column numbers in the batch will
        be used as the target size. Otherwise, `padding_to` will be the target
        size. Default is -1.

        If `flatten` is set True, audio data will be flatten to be a 1-dim
        ndarray. Default is False.

        :param batch: Non-empty list of (audio, text) tuples, where audio is
                      a 2-dim ndarray.
        :type batch: list
        :return: List of (padded_audio, text) tuples.
        :rtype: list
        :raises ValueError: If `padding_to` is not -1 and is smaller than the
                            longest instance in the batch.
        """
        new_batch = []
        # get target shape
        max_length = max(audio.shape[1] for audio, _ in batch)
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be greater"
                                 " or equal to the original instance length.")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def _batch_shuffle(self, manifest, batch_size):
        """
        The instances have different lengths and they cannot be
        combined into a single matrix multiplication. It usually
        sorts the training examples by length and combines only
        similarly-sized instances into minibatches, pads with
        silence when necessary so that all instances in a batch
        have the same length. This batch shuffle function is used
        to make similarly-sized instances into minibatches and
        make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Set the `k` shortest instances aside in order to make different
           mini-batches, then make minibatches of size batch_size from the
           remaining instances.
        4. Shuffle the minibatches.
        5. Append the instances that did not fill a whole minibatch, followed
           by the `k` instances set aside in step 3.

        Every input instance appears exactly once in the result. The input
        list itself is not modified.

        :param manifest: Manifest entries; each is a dict with a "duration"
                         key.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
        :return: Batch shuffled manifest.
        :rtype: list
        :raises ValueError: If `batch_size` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size should be a positive integer.")
        # work on a sorted copy so the caller's list is left untouched
        sorted_manifest = sorted(manifest, key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        shifted = sorted_manifest[shift_len:]
        num_batches = len(shifted) // batch_size
        # materialize the full minibatches as a list: `random.shuffle` needs
        # a mutable sequence (a Python 3 `zip` object would not work)
        batches = [
            shifted[i * batch_size:(i + 1) * batch_size]
            for i in range(num_batches)
        ]
        self._rng.shuffle(batches)
        batch_manifest = [
            instance for batch in batches for instance in batch
        ]
        # leftovers that did not fill a whole minibatch; slicing from an
        # explicit start index stays empty when there are none (a negative
        # index `[-0:]` would wrongly return the whole list)
        batch_manifest.extend(shifted[num_batches * batch_size:])
        # instances set aside by the random shift
        batch_manifest.extend(sorted_manifest[:shift_len])
        return batch_manifest