PaddleSpeech/data_utils/augmentor/speed_perturb.py

"""Speed perturbation module for making ASR robust to different voice
types (high pitched, low pitched, etc)
Samples uniformly between speed_min and speed_max
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from . import base


class SpeedPerturbatioAugmentor(base.AugmentorBase):
    """
    Instantiates a speed perturbation module.

    See reference paper here:

    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf

    :param speed_min: Lower bound on new rate to sample
    :type speed_min: func[int->scalar]
    :param speed_max: Upper bound on new rate to sample
    :type speed_max: func[int->scalar]
    """

    def __init__(self, rng, speed_min, speed_max):

        if (speed_min < 0.9):
            raise ValueError(
                "Sampling speed below 0.9 can cause unnatural effects")
        if (speed_min > 1.1):
            raise ValueError(
                "Sampling speed above 1.1 can cause unnatural effects")
        self.speed_min = speed_min
        self.speed_max = speed_max
        self.rng = rng

    def transform_audio(self, audio_segment):
        """
        Samples a new speed rate from the given range and
        changes the speed of the given audio clip.

        Note that this is an in-place transformation.

        :param audio_segment: input audio
        :type audio_segment: SpeechDLSegment
        """
        read_size = 0
        speed_min = self.speed_min(iteration)
        speed_max = self.speed_max(iteration)
        sampled_speed = rng.uniform(speed_min, speed_max)
        audio = audio.change_speed(sampled_speed)