"""
Provides a basic audio data preprocessing pipeline, and offers
both instance-level and batch-level data reader interfaces.
"""
import paddle.v2 as paddle
import logging
import json
import random
import soundfile
import numpy as np
import os

RANDOM_SEED = 0
logger = logging.getLogger(__name__)


class DataGenerator(object):
    """
    DataGenerator provides a basic audio data preprocessing pipeline, and
    offers both instance-level and batch-level data reader interfaces.
    Normalized log-spectrogram (FFT) features are used as audio features here.

    :param vocab_filepath: Vocabulary file path for indexing tokenized
                           transcriptions.
    :type vocab_filepath: basestring
    :param normalizer_manifest_path: Manifest filepath for collecting feature
                                     normalization statistics, e.g. mean, std.
    :type normalizer_manifest_path: basestring
    :param normalizer_num_samples: Number of instances sampled for collecting
                                   feature normalization statistics.
                                   Default is 100.
    :type normalizer_num_samples: int
    :param max_duration: Audio clips with duration (in seconds) greater than
                         this will be discarded. Default is 20.0.
    :type max_duration: float
    :param min_duration: Audio clips with duration (in seconds) smaller than
                         this will be discarded. Default is 0.0.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
                      Default is 10.0.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for frames. Default is
                      20.0.
    :type window_ms: float
    :param max_frequency: Maximum frequency for FFT features. FFT features of
                          frequencies larger than this will be discarded.
                          If set to None, all features will be kept.
                          Default is None.
    :type max_frequency: float
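
    A minimal usage sketch (the file paths below are hypothetical)::

        generator = DataGenerator(
            vocab_filepath='eng_vocab.txt',
            normalizer_manifest_path='manifest.train')
        batch_reader = generator.batch_reader_creator(
            manifest_path='manifest.dev', batch_size=16)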
    """

    def __init__(self,
                 vocab_filepath,
                 normalizer_manifest_path,
                 normalizer_num_samples=100,
                 max_duration=20.0,
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_frequency=None):
        self.__max_duration__ = max_duration
        self.__min_duration__ = min_duration
        self.__stride_ms__ = stride_ms
        self.__window_ms__ = window_ms
        self.__max_frequency__ = max_frequency
        self.__random__ = random.Random(RANDOM_SEED)
        # load vocabulary (dictionary)
        self.__vocab_dict__, self.__vocab_list__ = \
            self.__load_vocabulary_from_file__(vocab_filepath)
        # collect normalizer statistics
        self.__mean__, self.__std__ = self.__collect_normalizer_statistics__(
            manifest_path=normalizer_manifest_path,
            num_samples=normalizer_num_samples)

    def __audio_featurize__(self, audio_filename):
        """
        Preprocess audio data, including feature extraction, normalization,
        etc.
        """
        features = self.__audio_basic_featurize__(audio_filename)
        return self.__normalize__(features)

    def __text_featurize__(self, text):
        """
        Preprocess text data, including tokenizing and token indexing, etc.
        """
        return self.__convert_text_to_char_index__(
            text=text, vocabulary=self.__vocab_dict__)

    def __audio_basic_featurize__(self, audio_filename):
        """
        Compute basic (without normalization etc.) features for audio data.
        """
        return self.__spectrogram_from_file__(
            filename=audio_filename,
            stride_ms=self.__stride_ms__,
            window_ms=self.__window_ms__,
            max_freq=self.__max_frequency__)

    def __collect_normalizer_statistics__(self, manifest_path,
                                          num_samples=100):
        """
        Compute feature normalization statistics, i.e. mean and stddev.
        """
        # read manifest
        manifest = self.__read_manifest__(
            manifest_path=manifest_path,
            max_duration=self.__max_duration__,
            min_duration=self.__min_duration__)
        # sample for statistics
        sampled_manifest = self.__random__.sample(manifest, num_samples)
        # extract spectrogram feature
        features = []
        for instance in sampled_manifest:
            spectrogram = self.__audio_basic_featurize__(
                instance["audio_filepath"])
            features.append(spectrogram)
        features = np.hstack(features)
        mean = np.mean(features, axis=1).reshape([-1, 1])
        std = np.std(features, axis=1).reshape([-1, 1])
        return mean, std

    def __normalize__(self, features, eps=1e-14):
        """
        Normalize features to be of zero mean and unit stddev.
        """
        return (features - self.__mean__) / (self.__std__ + eps)

    def __spectrogram_from_file__(self,
                                  filename,
                                  stride_ms=10.0,
                                  window_ms=20.0,
                                  max_freq=None,
                                  eps=1e-14):
        """
        Load audio data and calculate the log of spectrogram by FFT.
        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
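
        Note: `stride_ms` and `window_ms` are converted to sample counts,
        e.g. at a 16000 Hz sample rate, stride_ms=10.0 gives a 160-sample
        hop and window_ms=20.0 a 320-sample window.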
        """
        audio, sample_rate = soundfile.read(filename)
        if audio.ndim >= 2:
            audio = np.mean(audio, 1)
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        spectrogram, freqs = self.__extract_spectrogram__(
            audio,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(spectrogram[:ind, :] + eps)

    def __extract_spectrogram__(self, samples, window_size, stride_size,
                                sample_rate):
        """
        Compute the spectrogram by FFT for a discrete real signal.
        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
        """
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)**2
        scale = np.sum(weighting**2) * sample_rate
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

    def __load_vocabulary_from_file__(self, vocabulary_path):
        """
        Load vocabulary from file.
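
        The vocabulary file is assumed to hold one token per line; a token's
        line number (starting from 0) is used as its index, e.g. (made-up
        tokens)::

            ' ' -> 0
            'a' -> 1
            'b' -> 2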
        """
        if not os.path.exists(vocabulary_path):
            raise ValueError("Vocabulary file %s not found." % vocabulary_path)
        vocab_lines = []
        with open(vocabulary_path, 'r') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
            [(token, id) for (id, token) in enumerate(vocab_list)])
        return vocab_dict, vocab_list

    def __convert_text_to_char_index__(self, text, vocabulary):
        """
        Convert text string to a list of character index integers.
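
        For example, with a hypothetical vocabulary mapping
        {'a': 0, 'b': 1, 'c': 2}, "cab" is converted to [2, 0, 1].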
        """
        return [vocabulary[w] for w in text]

    def __read_manifest__(self, manifest_path, max_duration, min_duration):
        """
        Load and parse manifest file.
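
        Each line of the manifest is a JSON object; the keys used here are
        "audio_filepath", "duration" and "text", e.g. (a made-up line)::

            {"audio_filepath": "/data/clip001.flac", "duration": 3.42,
             "text": "a tiny example transcription"}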
        """
        manifest = []
        for json_line in open(manifest_path):
            try:
                json_data = json.loads(json_line)
            except Exception as e:
                raise ValueError("Error reading manifest: %s" % str(e))
            if (json_data["duration"] <= max_duration and
                    json_data["duration"] >= min_duration):
                manifest.append(json_data)
        return manifest

    def __padding_batch__(self, batch, padding_to=-1, flatten=False):
        """
        Pad the audio part of features (only in the time axis -- column axis)
        with zeros, so that each instance in the batch shares the same audio
        feature shape.

        If `padding_to` is set to -1, the maximum column number in the batch
        will be used as the target size. Otherwise, `padding_to` will be the
        target size. Default is -1.

        If `flatten` is set True, audio data will be flattened to a 1-dim
        ndarray. Default is False.
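
        For example (hypothetical shapes), a batch of audio features shaped
        (161, 90) and (161, 117) is padded to two (161, 117) arrays, or to
        two flat arrays of length 161 * 117 when `flatten` is True.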
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be no "
                                 "less than the maximum audio length in the "
                                 "batch.")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def instance_reader_creator(self,
                                manifest_path,
                                sort_by_duration=True,
                                shuffle=False):
        """
        Instance reader creator for audio data. Creates a callable function
        to produce instances of data.

        Instance: a tuple of a numpy ndarray of audio spectrogram and a list
        of tokenized and indexed transcription text.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param sort_by_duration: Sort the audio clips by duration if set True
                                 (for SortaGrad).
        :type sort_by_duration: bool
        :param shuffle: Shuffle the audio clips if set True.
        :type shuffle: bool
        :return: Data reader function.
        :rtype: callable
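
        A minimal usage sketch (the manifest path is hypothetical)::

            reader = generator.instance_reader_creator('manifest.dev')
            for spectrogram, transcript in reader():
                pass  # consume one instance at a time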
        """
        if sort_by_duration and shuffle:
            sort_by_duration = False
            logger.warning("When shuffle is set True, sort_by_duration is "
                           "forced to False.")

        def reader():
            # read manifest
            manifest = self.__read_manifest__(
                manifest_path=manifest_path,
                max_duration=self.__max_duration__,
                min_duration=self.__min_duration__)
            # sort (by duration) or shuffle manifest
            if sort_by_duration:
                manifest.sort(key=lambda x: x["duration"])
            if shuffle:
                self.__random__.shuffle(manifest)
            # extract spectrogram feature
            for instance in manifest:
                spectrogram = self.__audio_featurize__(
                    instance["audio_filepath"])
                transcript = self.__text_featurize__(instance["text"])
                yield (spectrogram, transcript)

        return reader

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sort_by_duration=True,
                             shuffle=False):
        """
        Batch data reader creator for audio data. Creates a callable function
        to produce batches of data.

        Audio features will be padded with zeros so that each instance in the
        batch shares the same audio feature shape.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param batch_size: Instance number in a batch.
        :type batch_size: int
        :param padding_to: If set to -1, the maximum column number in the
                           batch will be used as the target size for padding.
                           Otherwise, `padding_to` will be the target size.
                           Default is -1.
        :type padding_to: int
        :param flatten: If set True, audio data will be flattened to a 1-dim
                        ndarray. Otherwise, 2-dim ndarray. Default is False.
        :type flatten: bool
        :param sort_by_duration: Sort the audio clips by duration if set True
                                 (for SortaGrad).
        :type sort_by_duration: bool
        :param shuffle: Shuffle the audio clips if set True.
        :type shuffle: bool
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
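
        A minimal usage sketch (the manifest path is hypothetical)::

            batch_reader = generator.batch_reader_creator(
                manifest_path='manifest.train', batch_size=16, shuffle=True)
            for batch in batch_reader():
                pass  # each batch is a list of (padded_audio, text) tuples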
        """

        def batch_reader():
            instance_reader = self.instance_reader_creator(
                manifest_path=manifest_path,
                sort_by_duration=sort_by_duration,
                shuffle=shuffle)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self.__padding_batch__(batch, padding_to, flatten)
                    batch = []
            if len(batch) > 0:
                yield self.__padding_batch__(batch, padding_to, flatten)

        return batch_reader

    def vocabulary_size(self):
        """
        Get vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self.__vocab_list__)

    def vocabulary_dict(self):
        """
        Get vocabulary as a dict.

        :return: Vocabulary in dict.
        :rtype: dict
        """
        return self.__vocab_dict__

    def vocabulary_list(self):
        """
        Get vocabulary as a list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self.__vocab_list__

    def data_name_feeding(self):
        """
        Get feeding (data field name and corresponding field index) dict.

        :return: Feeding dict.
        :rtype: dict
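
        The dict maps each data field's name to its position in the instance
        tuples produced by the readers, matching the `feeding` argument of a
        paddle.v2 trainer.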
        """
        feeding = {
            "audio_spectrogram": 0,
            "transcript_text": 1,
        }
        return feeding