speed perturb with sox

5 years ago · 9ad688c6aa
parent e55b5baf66
commit 9ad688c6aa
10 changed files with 398 additions and 48 deletions
--- a/.notebook/python_test.ipynb
+++ b/.notebook/python_test.ipynb
--- a/deepspeech/frontend/audio.py
+++ b/deepspeech/frontend/audio.py
@ -22,6 +22,7 @@ import resampy
 from scipy import signal
 import random
 import copy
 import sox
 class AudioSegment(object):
@ -323,11 +324,15 @@ class AudioSegment(object):
        """
        if speed_rate <= 0:
            raise ValueError("speed_rate should be greater than zero.")
-        old_length = self._samples.shape[0]
+        # old_length = self._samples.shape[0]
-        new_length = int(old_length / speed_rate)
+        # new_length = int(old_length / speed_rate)
-        old_indices = np.arange(old_length)
+        # old_indices = np.arange(old_length)
-        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
+        # new_indices = np.linspace(start=0, stop=old_length, num=new_length)
-        self._samples = np.interp(new_indices, old_indices, self._samples)
+        # self._samples = np.interp(new_indices, old_indices, self._samples)
        tfm = sox.Transformer()
        tfm.speed(speed_rate)
        self._samples = tfm.build_array(
            input_array=self._samples, sample_rate_in=self._sample_rate)
    def normalize(self, target_db=-20, max_gain_db=300.0):
        """Normalize audio to be of the desired RMS value in decibels.
--- a/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/deepspeech/frontend/augmentor/speed_perturb.py
@ -13,35 +13,71 @@
 # limitations under the License.
 """Contain the speech perturbation augmentation model."""
 import numpy as np
 from deepspeech.frontend.augmentor.base import AugmentorBase
 class SpeedPerturbAugmentor(AugmentorBase):
-    """Augmentation model for adding speed perturbation.
+    """Augmentation model for adding speed perturbation."""
    def __init__(self, rng, min_speed_rate=0.9, max_speed_rate=1.1,
                 num_rates=3):
        """speed perturbation.
        The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
        and sox-speed simply resamples the input,
        i.e. both pitch and tempo are changed.
        "Why use speed option instead of tempo -s in SoX for speed perturbation"
        https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8
        Sox speed:
        https://pysox.readthedocs.io/en/latest/api.html#sox.transform.Transformer
        See reference paper here:
        http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
-    :param rng: Random generator object.
+        Espnet:
-    :type rng: random.Random
+        https://espnet.github.io/espnet/_modules/espnet/transform/perturb.html
-    :param min_speed_rate: Lower bound of new speed rate to sample and should
+        
        Nemo:
        https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/perturb.py#L92
        Args:
            rng (random.Random): Random generator object.
            min_speed_rate (float): Lower bound of new speed rate to sample and should
                not be smaller than 0.9.
-    :type min_speed_rate: float
+            max_speed_rate (float): Upper bound of new speed rate to sample and should
    :param max_speed_rate: Upper bound of new speed rate to sample and should
                not be larger than 1.1.
-    :type max_speed_rate: float
+            num_rates (int, optional): Number of discrete rates to allow. 
-    """
+                Can be a positive or negative integer. Defaults to 3.
                If a positive integer greater than 0 is provided, the range of
                speed rates will be discretized into `num_rates` values.
                If a negative integer or 0 is provided, the full range of speed rates
                will be sampled uniformly.
                Note: If a positive integer is provided and the resultant discretized
                range of rates contains the value '1.0', then those samples with rate=1.0,
                will not be augmented at all and simply skipped. This is to avoid unnecessary
                augmentation and increased computation time. Effective augmentation chance
                in such a case is = `prob * ((num_rates - 1) / num_rates) * 100`% chance
                where `prob` is the global probability of a sample being augmented.
-    def __init__(self, rng, min_speed_rate, max_speed_rate):
+        Raises:
            ValueError: If min_speed_rate is below 0.9 or max_speed_rate is above 1.1.
        """
        if min_speed_rate < 0.9:
            raise ValueError(
                "Sampling speed below 0.9 can cause unnatural effects")
        if max_speed_rate > 1.1:
            raise ValueError(
                "Sampling speed above 1.1 can cause unnatural effects")
-        self._min_speed_rate = min_speed_rate
+        self._min_rate = min_speed_rate
-        self._max_speed_rate = max_speed_rate
+        self._max_rate = max_speed_rate
        self._rng = rng
        self._num_rates = num_rates
        if num_rates > 0:
            self._rates = np.linspace(
                self._min_rate, self._max_rate, self._num_rates, endpoint=True)
    def transform_audio(self, audio_segment):
        """Sample a new speed rate from the given range and
@ -52,6 +88,13 @@ class SpeedPerturbAugmentor(AugmentorBase):
        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
-        sampled_speed = self._rng.uniform(self._min_speed_rate,
+        if self._num_rates < 0:
-                                          self._max_speed_rate)
+            speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
-        audio_segment.change_speed(sampled_speed)
+        else:
            speed_rate = self._rng.choice(self._rates)
        # Skip perturbation in case of identity speed rate
        if speed_rate == 1.0:
            return
        audio_segment.change_speed(speed_rate)
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@ -25,7 +25,7 @@ python3 ${MAIN_ROOT}/utils/build_vocab.py \
 --unit_type="char" \
 --count_threshold=0 \
 --vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
+--manifest_paths "data/manifest.train.raw"
 if [ $? -ne 0 ]; then
    echo "Build vocabulary failed. Terminated."
--- a/examples/aishell/s1/conf/augmentation.config
+++ b/examples/aishell/s1/conf/augmentation.config
@ -1,8 +0,0 @@
 [
    {
        "type": "shift",
        "params": {"min_shift_ms": -5,
                   "max_shift_ms": 5},
        "prob": 1.0
    }
 ]
--- a/examples/aishell/s1/local/train.sh
+++ b/examples/aishell/s1/local/train.sh
@ -1,9 +1,5 @@
 #! /usr/bin/env bash
 # train model
 # if you wish to resume from an existing model, uncomment --init_from_pretrained_model
 export FLAGS_sync_nccl_allreduce=0
 ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
 echo "using $ngpu gpus..."
--- a/examples/aug_conf/augmentation.config.example
+++ b/examples/aug_conf/augmentation.config.example
@ -14,7 +14,8 @@
    {
        "type": "speed",
        "params": {"min_speed_rate": 0.95,
-                   "max_speed_rate": 1.05},
+                   "max_speed_rate": 1.05,
                   "num_rates": 3},
        "prob": 0.5
    },
    {
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@ -9,12 +9,12 @@ data:
  mean_std_filepath: ""
  augmentation_config: conf/augmentation.config
  batch_size: 4
-  max_input_len: 27.0
+  min_input_len: 0.5
-  min_input_len: 0.0
+  max_input_len: 20.0
  max_output_len: .INF
  min_output_len: 0.0
-  max_output_input_ratio: .INF
+  max_output_len: 400
-  min_output_input_ratio: 0.0
+  min_output_input_ratio: 0.05
  max_output_input_ratio: 10.0
  raw_wav: True  # use raw_wav or kaldi feature
  specgram_type: fbank #linear, mfcc, fbank
  feat_dim: 80
--- a/requirements.txt
+++ b/requirements.txt
@ -5,6 +5,7 @@ resampy==0.2.2
 scipy==1.2.1
 sentencepiece
 SoundFile==0.9.0.post1
 sox
 tensorboardX
 typeguard
 yacs
--- a/setup.sh
+++ b/setup.sh
@ -7,7 +7,7 @@ fi
 if [ -e /etc/lsb-release ];then
    #${SUDO} apt-get update
-    ${SUDO} apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
+    ${SUDO} apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
 fi
 # install python dependencies