PaddleSpeech/data_utils/featurizer/audio_featurizer.py

"""Contains the audio featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from data_utils import utils
from data_utils.audio import AudioSegment


class AudioFeaturizer(object):
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

    Currently, it only supports feature type of linear spectrogram.

    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned. None means half of the sample rate.
    :type max_freq: None|float
    :param target_sample_rate: Audio are resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibels before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        self._specgram_type = specgram_type
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
        self._target_sample_rate = target_sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB

    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
                  allow_upsampling=True,
                  allow_upsamplling=None):
        """Extract audio features from AudioSegment or SpeechSegment.

        Note that ``resample`` and ``normalize`` are invoked on the passed-in
        segment itself before the spectrogram is computed.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :param allow_downsampling: Whether to allow audio downsampling before
                                   featurizing.
        :type allow_downsampling: bool
        :param allow_upsampling: Whether to allow audio upsampling before
                                 featurizing.
        :type allow_upsampling: bool
        :param allow_upsamplling: Deprecated misspelled alias of
                                  `allow_upsampling`, kept so that existing
                                  keyword callers keep working. When not
                                  None, it overrides `allow_upsampling`.
        :type allow_upsamplling: None|bool
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        :raises ValueError: If audio sample rate is not supported.
        """
        # Honor the historical (misspelled) keyword if a caller still uses it.
        if allow_upsamplling is not None:
            allow_upsampling = allow_upsamplling
        target_rate = self._target_sample_rate
        source_rate = audio_segment.sample_rate
        # upsampling or downsampling
        if ((source_rate > target_rate and allow_downsampling) or
                (source_rate < target_rate and allow_upsampling)):
            audio_segment.resample(target_rate)
        if audio_segment.sample_rate != target_rate:
            raise ValueError("Audio sample rate is not supported. "
                             "Turn allow_downsampling or allow_upsampling on.")
        # decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
        return self._compute_specgram(audio_segment.samples,
                                      audio_segment.sample_rate)

    def _compute_specgram(self, samples, sample_rate):
        """Dispatch to the extractor selected by `specgram_type`.

        :raises ValueError: If `specgram_type` is not 'linear'.
        """
        if self._specgram_type == 'linear':
            return self._compute_linear_specgram(
                samples, sample_rate, self._stride_ms, self._window_ms,
                self._max_freq)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the log linear spectrogram from FFT energy.

        :param samples: 1-D audio samples.
        :type samples: ndarray
        :param sample_rate: Sample rate of `samples`.
        :type sample_rate: int|float
        :param stride_ms: Frame stride in milliseconds.
        :type stride_ms: float
        :param window_ms: Frame window length in milliseconds.
        :type window_ms: float
        :param max_freq: Highest frequency to keep; None means half of the
                         sample rate.
        :type max_freq: None|float
        :param eps: Small constant added before taking the log so that
                    zero-energy bins stay finite.
        :type eps: float
        :return: Log spectrogram with one row per kept frequency bin and one
                 column per frame.
        :rtype: ndarray
        :raises ValueError: If `max_freq` exceeds half of the sample rate, if
                            the stride is greater than the window, or if
                            either spans less than one sample.
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        # A zero-sample stride would otherwise surface later as an opaque
        # ZeroDivisionError while framing.
        if stride_size < 1 or window_size < 1:
            raise ValueError("Stride and window must each span at least one "
                             "sample.")
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        # Keep bins up to and including the last one not above max_freq.
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(specgram[:ind, :] + eps)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal.

        :param samples: 1-D audio samples.
        :type samples: ndarray
        :param window_size: Frame length in samples.
        :type window_size: int
        :param stride_size: Hop between consecutive frames in samples.
        :type stride_size: int
        :param sample_rate: Sample rate of `samples`.
        :type sample_rate: int|float
        :return: Tuple of (scaled squared-magnitude spectrogram with one
                 column per frame, frequency of each spectrogram row).
        :rtype: tuple
        :raises ValueError: If `samples` is shorter than one window.
        """
        num_samples = len(samples)
        if num_samples < window_size:
            raise ValueError("Audio is too short: got %d samples, but one "
                             "window needs %d samples." %
                             (num_samples, window_size))
        # extract strided windows; drop the tail that cannot fill a frame
        truncate_size = (num_samples - window_size) % stride_size
        samples = samples[:num_samples - truncate_size]
        num_windows = (len(samples) - window_size) // stride_size + 1
        nshape = (window_size, num_windows)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        # Sanity-check the strided view; a second column only exists when
        # the audio yields more than one frame.
        if num_windows > 1:
            assert np.all(
                windows[:, 1] == samples[stride_size:(stride_size +
                                                      window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)**2
        scale = np.sum(weighting**2) * sample_rate
        # Interior bins are doubled; the first and last bins are not.
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Contains the audio featurizer class."""`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`from __future__ import absolute_import`
			`from __future__ import division`
			`from __future__ import print_function`

			`import numpy as np`
			`from data_utils import utils`
			`from data_utils.audio import AudioSegment`


			`class AudioFeaturizer(object):`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Audio featurizer, for extracting features from audio contents of`
			`AudioSegment or SpeechSegment.`

			`Currently, it only supports feature type of linear spectrogram.`

			`:param specgram_type: Specgram feature type. Options: 'linear'.`
			`:type specgram_type: str`
			`:param stride_ms: Striding size (in milliseconds) for generating frames.`
			`:type stride_ms: float`
			`:param window_ms: Window size (in milliseconds) for generating frames.`
			`:type window_ms: float`
			`:param max_freq: Used when specgram_type is 'linear', only FFT bins`
			`corresponding to frequencies between [0, max_freq] are`
			`returned.`
			`:types max_freq: None\|float`
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`:param target_sample_rate: Audio are resampled (if upsampling or`
			`downsampling is allowed) to this before`
			`extracting spectrogram features.`
			`:type target_sample_rate: float`
			`:param use_dB_normalization: Whether to normalize the audio to a certain`
			`decibels before extracting the features.`
			`:type use_dB_normalization: bool`
			`:param target_dB: Target audio decibels for normalization.`
			`:type target_dB: float`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""`

Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`def __init__(self,`
			`specgram_type='linear',`
			`stride_ms=10.0,`
			`window_ms=20.0,`
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`max_freq=None,`
			`target_sample_rate=16000,`
			`use_dB_normalization=True,`
			`target_dB=-20):`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`self._specgram_type = specgram_type`
			`self._stride_ms = stride_ms`
			`self._window_ms = window_ms`
			`self._max_freq = max_freq`
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`self._target_sample_rate = target_sample_rate`
			`self._use_dB_normalization = use_dB_normalization`
			`self._target_dB = target_dB`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`def featurize(self,`
			`audio_segment,`
			`allow_downsampling=True,`
			`allow_upsamplling=True):`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Extract audio features from AudioSegment or SpeechSegment.`

			`:param audio_segment: Audio/speech segment to extract features from.`
			`:type audio_segment: AudioSegment\|SpeechSegment`
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`:param allow_downsampling: Whether to allow audio downsampling before`
			`featurizing.`
			`:type allow_downsampling: bool`
			`:param allow_upsampling: Whether to allow audio upsampling before`
			`featurizing.`
			`:type allow_upsampling: bool`
Add function, class and module docs for data parts in DS2. 8 years ago			`:return: Spectrogram audio feature in 2darray.`
			`:rtype: ndarray`
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`:raises ValueError: If audio sample rate is not supported.`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""`
Improve audio featurizer and add shift augmentor. 1. Improve audio featurizer. 2. Add shift augmentor. 3. Update default argument to be the current best seggestion. 4. Add checkpoints with pass id. 8 years ago			`# upsampling or downsampling`
			`if ((audio_segment.sample_rate > self._target_sample_rate and`
			`allow_downsampling) or`
			`(audio_segment.sample_rate < self._target_sample_rate and`
			`allow_upsampling)):`
			`audio_segment.resample(self._target_sample_rate)`
			`if audio_segment.sample_rate != self._target_sample_rate:`
			`raise ValueError("Audio sample rate is not supported. "`
			`"Turn allow_downsampling or allow up_sampling on.")`
			`# decibel normalization`
			`if self._use_dB_normalization:`
			`audio_segment.normalize(target_db=self._target_dB)`
			`# extract spectrogram`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`return self._compute_specgram(audio_segment.samples,`
			`audio_segment.sample_rate)`

			`def _compute_specgram(self, samples, sample_rate):`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Extract various audio features."""`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`if self._specgram_type == 'linear':`
			`return self._compute_linear_specgram(`
			`samples, sample_rate, self._stride_ms, self._window_ms,`
			`self._max_freq)`
			`else:`
			`raise ValueError("Unknown specgram_type %s. "`
			`"Supported values: linear." % self._specgram_type)`

			`def _compute_linear_specgram(self,`
			`samples,`
			`sample_rate,`
			`stride_ms=10.0,`
			`window_ms=20.0,`
			`max_freq=None,`
			`eps=1e-14):`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Compute the linear spectrogram from FFT energy."""`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`if max_freq is None:`
			`max_freq = sample_rate / 2`
			`if max_freq > sample_rate / 2:`
			`raise ValueError("max_freq must be greater than half of "`
			`"sample rate.")`
			`if stride_ms > window_ms:`
			`raise ValueError("Stride size must not be greater than "`
			`"window size.")`
			`stride_size = int(0.001 * sample_rate * stride_ms)`
			`window_size = int(0.001 * sample_rate * window_ms)`
			`specgram, freqs = self._specgram_real(`
			`samples,`
			`window_size=window_size,`
			`stride_size=stride_size,`
			`sample_rate=sample_rate)`
			`ind = np.where(freqs <= max_freq)[0][-1] + 1`
			`return np.log(specgram[:ind, :] + eps)`

			`def _specgram_real(self, samples, window_size, stride_size, sample_rate):`
Add function, class and module docs for data parts in DS2. 8 years ago			`"""Compute the spectrogram for samples from a real signal."""`
Refactor whole data preprocessor for DS2 (re-design classes, re-organize dir, add augmentaion interfaces etc.). 1. Refactor data preprocessor with new added class AudioSegment, SpeechSegment, TextFeaturizer, AudioFeaturizer, SpeechFeaturizer. 2. Add data augmentation interfaces and class AugmentorBase, AugmentationPipeline, VolumnPerturbAugmentor etc.. 3. Seperate normalizer's mean and std computing from training, by adding FeatureNormalizer and a seperate tool compute_mean_std.py. 4. Re-organize directory. 8 years ago			`# extract strided windows`
			`truncate_size = (len(samples) - window_size) % stride_size`
			`samples = samples[:len(samples) - truncate_size]`
			`nshape = (window_size, (len(samples) - window_size) // stride_size + 1)`
			`nstrides = (samples.strides[0], samples.strides[0] * stride_size)`
			`windows = np.lib.stride_tricks.as_strided(`
			`samples, shape=nshape, strides=nstrides)`
			`assert np.all(`
			`windows[:, 1] == samples[stride_size:(stride_size + window_size)])`
			`# window weighting, squared Fast Fourier Transform (fft), scaling`
			`weighting = np.hanning(window_size)[:, None]`
			`fft = np.fft.rfft(windows * weighting, axis=0)`
			`fft = np.absolute(fft)**2`
			`scale = np.sum(weighting**2) * sample_rate`
			`fft[1:-1, :] *= (2.0 / scale)`
			`fft[(0, -1), :] /= scale`
			`# prepare fft frequency list`
			`freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])`
			`return fft, freqs`