speed perturb with sox

pull/578/head
Hui Zhang 5 years ago
parent e55b5baf66
commit 9ad688c6aa

File diff suppressed because one or more lines are too long

@ -22,6 +22,7 @@ import resampy
from scipy import signal from scipy import signal
import random import random
import copy import copy
import sox
class AudioSegment(object): class AudioSegment(object):
@ -323,11 +324,15 @@ class AudioSegment(object):
""" """
if speed_rate <= 0: if speed_rate <= 0:
raise ValueError("speed_rate should be greater than zero.") raise ValueError("speed_rate should be greater than zero.")
old_length = self._samples.shape[0] # old_length = self._samples.shape[0]
new_length = int(old_length / speed_rate) # new_length = int(old_length / speed_rate)
old_indices = np.arange(old_length) # old_indices = np.arange(old_length)
new_indices = np.linspace(start=0, stop=old_length, num=new_length) # new_indices = np.linspace(start=0, stop=old_length, num=new_length)
self._samples = np.interp(new_indices, old_indices, self._samples) # self._samples = np.interp(new_indices, old_indices, self._samples)
tfm = sox.Transformer()
tfm.speed(speed_rate)
self._samples = tfm.build_array(
input_array=self._samples, sample_rate_in=self._sample_rate)
def normalize(self, target_db=-20, max_gain_db=300.0): def normalize(self, target_db=-20, max_gain_db=300.0):
"""Normalize audio to be of the desired RMS value in decibels. """Normalize audio to be of the desired RMS value in decibels.

@ -13,35 +13,71 @@
# limitations under the License. # limitations under the License.
"""Contain the speech perturbation augmentation model.""" """Contain the speech perturbation augmentation model."""
import numpy as np
from deepspeech.frontend.augmentor.base import AugmentorBase from deepspeech.frontend.augmentor.base import AugmentorBase
class SpeedPerturbAugmentor(AugmentorBase): class SpeedPerturbAugmentor(AugmentorBase):
"""Augmentation model for adding speed perturbation. """Augmentation model for adding speed perturbation."""
def __init__(self, rng, min_speed_rate=0.9, max_speed_rate=1.1,
num_rates=3):
"""speed perturbation.
The speed perturbation in kaldi uses sox-speed instead of sox-tempo,
and sox-speed simply resamples the input,
i.e. both pitch and tempo are changed.
"Why use speed option instead of tempo -s in SoX for speed perturbation"
https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8
Sox speed:
https://pysox.readthedocs.io/en/latest/api.html#sox.transform.Transformer
See reference paper here: See reference paper here:
http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
:param rng: Random generator object. Espnet:
:type rng: random.Random https://espnet.github.io/espnet/_modules/espnet/transform/perturb.html
:param min_speed_rate: Lower bound of new speed rate to sample and should
Nemo:
https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/perturb.py#L92
Args:
rng (random.Random): Random generator object.
min_speed_rate (float): Lower bound of new speed rate to sample and should
not be smaller than 0.9. not be smaller than 0.9.
:type min_speed_rate: float max_speed_rate (float): Upper bound of new speed rate to sample and should
:param max_speed_rate: Upper bound of new speed rate to sample and should
not be larger than 1.1. not be larger than 1.1.
:type max_speed_rate: float num_rates (int, optional): Number of discrete rates to allow.
""" Can be a positive or negative integer. Defaults to 3.
If a positive integer greater than 0 is provided, the range of
speed rates will be discretized into `num_rates` values.
If a negative integer or 0 is provided, the full range of speed rates
will be sampled uniformly.
Note: If a positive integer is provided and the resultant discretized
range of rates contains the value '1.0', then those samples with rate=1.0,
will not be augmented at all and simply skipped. This is to avoid unnecessary
augmentation and increased computation time. Effective augmentation chance
in such a case is `prob * ((num_rates - 1) / num_rates) * 100`%,
where `prob` is the global probability of a sample being augmented.
def __init__(self, rng, min_speed_rate, max_speed_rate): Raises:
ValueError: if min_speed_rate is below 0.9 or max_speed_rate is above 1.1.
"""
if min_speed_rate < 0.9: if min_speed_rate < 0.9:
raise ValueError( raise ValueError(
"Sampling speed below 0.9 can cause unnatural effects") "Sampling speed below 0.9 can cause unnatural effects")
if max_speed_rate > 1.1: if max_speed_rate > 1.1:
raise ValueError( raise ValueError(
"Sampling speed above 1.1 can cause unnatural effects") "Sampling speed above 1.1 can cause unnatural effects")
self._min_speed_rate = min_speed_rate self._min_rate = min_speed_rate
self._max_speed_rate = max_speed_rate self._max_rate = max_speed_rate
self._rng = rng self._rng = rng
self._num_rates = num_rates
if num_rates > 0:
self._rates = np.linspace(
self._min_rate, self._max_rate, self._num_rates, endpoint=True)
def transform_audio(self, audio_segment): def transform_audio(self, audio_segment):
"""Sample a new speed rate from the given range and """Sample a new speed rate from the given range and
@ -52,6 +88,13 @@ class SpeedPerturbAugmentor(AugmentorBase):
:param audio_segment: Audio segment to add effects to. :param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment :type audio_segment: AudioSegment|SpeechSegment
""" """
sampled_speed = self._rng.uniform(self._min_speed_rate, if self._num_rates < 0:
self._max_speed_rate) speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
audio_segment.change_speed(sampled_speed) else:
speed_rate = self._rng.choice(self._rates)
# Skip perturbation in case of identity speed rate
if speed_rate == 1.0:
return
audio_segment.change_speed(speed_rate)

@ -25,7 +25,7 @@ python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type="char" \ --unit_type="char" \
--count_threshold=0 \ --count_threshold=0 \
--vocab_path="data/vocab.txt" \ --vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw" --manifest_paths "data/manifest.train.raw"
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated." echo "Build vocabulary failed. Terminated."

@ -1,8 +0,0 @@
[
{
"type": "shift",
"params": {"min_shift_ms": -5,
"max_shift_ms": 5},
"prob": 1.0
}
]

@ -1,9 +1,5 @@
#! /usr/bin/env bash #! /usr/bin/env bash
# train model
# if you wish to resume from an existing model, uncomment --init_from_pretrained_model
export FLAGS_sync_nccl_allreduce=0
ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));') ngpu=$(echo ${CUDA_VISIBLE_DEVICES} | python -c 'import sys; a = sys.stdin.read(); print(len(a.split(",")));')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."

@ -14,7 +14,8 @@
{ {
"type": "speed", "type": "speed",
"params": {"min_speed_rate": 0.95, "params": {"min_speed_rate": 0.95,
"max_speed_rate": 1.05}, "max_speed_rate": 1.05,
"num_rates": 3},
"prob": 0.5 "prob": 0.5
}, },
{ {

@ -9,12 +9,12 @@ data:
mean_std_filepath: "" mean_std_filepath: ""
augmentation_config: conf/augmentation.config augmentation_config: conf/augmentation.config
batch_size: 4 batch_size: 4
max_input_len: 27.0 min_input_len: 0.5
min_input_len: 0.0 max_input_len: 20.0
max_output_len: .INF
min_output_len: 0.0 min_output_len: 0.0
max_output_input_ratio: .INF max_output_len: 400
min_output_input_ratio: 0.0 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80

@ -5,6 +5,7 @@ resampy==0.2.2
scipy==1.2.1 scipy==1.2.1
sentencepiece sentencepiece
SoundFile==0.9.0.post1 SoundFile==0.9.0.post1
sox
tensorboardX tensorboardX
typeguard typeguard
yacs yacs

@ -7,7 +7,7 @@ fi
if [ -e /etc/lsb-release ];then if [ -e /etc/lsb-release ];then
#${SUDO} apt-get update #${SUDO} apt-get update
${SUDO} apt-get install -y pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev ${SUDO} apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
fi fi
# install python dependencies # install python dependencies

Loading…
Cancel
Save