PaddleSpeech/data_utils/augmentor/noise_speech.py

""" noise speech
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import bisect
import logging
import numpy as np
import os
from collections import defaultdict
from . import base
from . import audio_database
from data_utils.speech import SpeechSegment
TURK = "turk"
USE_AUDIO_DATABASE_SOURCES = frozenset(["freesound", "chime"])
HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0
FIND_NOISE_MAX_ATTEMPTS = 20
logger = logging.getLogger(__name__)
def get_first_smaller(items, value):
    """Find the rightmost item smaller than value."""
    index = bisect.bisect_left(items, value) - 1
    assert items[index] < value, \
        'get_first_smaller failed! %f %f' % (items[index], value)
    return items[index]
def get_first_larger(items, value):
    """Find the leftmost item greater than value."""
    index = bisect.bisect_right(items, value)
    assert index < len(items), \
        "no noise bin exists for this audio length (%f)" % value
    assert items[index] > value, \
        'get_first_larger failed! %f %f' % (items[index], value)
    return items[index]
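# Illustrative example (not part of the original module): with 2-second
# duration bins like those used below, a 5.3 s utterance maps to the
# next-smaller and next-larger bins as follows.
#
#     >>> bins = [0, 2, 4, 6, 8]
#     >>> get_first_smaller(bins, 5.3)
#     4
#     >>> get_first_larger(bins, 5.3)
#     6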
def _get_turk_noise_files(noise_dir, index_file):
    """Creates a map from duration bin => list of noise filenames.

    :param noise_dir: Directory of noise files which contains
                      "noise-samples-list".
    :type noise_dir: basestring
    :param index_file: Noise list.
    :type index_file: basestring
    :return: A map of bins to noise files. Each key is a duration bin
             (2 secs wide) and the value is a list of noise files binned
             to this duration.
    :rtype: defaultdict

    Note: noise-samples-list should contain one line per noise (wav) file
    along with its duration in milliseconds.
    """
noise_files = defaultdict(list)
if not os.path.exists(index_file):
logger.error('No noise files were found at {}'.format(index_file))
return noise_files
num_noise_files = 0
rounded_durations = list(range(0, 65, 2))
with open(index_file, 'r') as fl:
for line in fl:
fname = os.path.join(noise_dir, line.strip().split()[0])
duration = float(line.strip().split()[1]) / 1000
# bin the noise files into length bins rounded by 2 sec
bin_id = get_first_smaller(rounded_durations, duration)
noise_files[bin_id].append(fname)
num_noise_files += 1
logger.info('Loaded {} turk noise files'.format(num_noise_files))
return noise_files
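# Illustrative index-file contents (hypothetical filenames): each line holds
# "<wav filename> <duration in ms>", e.g.
#
#     crowd_01.wav 5300
#     street_02.wav 12800
#
# With the 2-second bins above, crowd_01.wav (5.3 s) is placed in bin 4 and
# street_02.wav (12.8 s) in bin 12.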
class NoiseSpeechAugmentor(base.AugmentorBase):
""" Noise addition block
:param snr_min: minimum signal-to-noise ratio
:type snr_min: float
:param snr_max: maximum signal-to-noise ratio
:type snr_max: float
    :param noise_dir: root directory where noise files are stored
    :type noise_dir: basestring
    :param index_file: index of noises of interest in noise_dir
    :type index_file: basestring
    :param source: select one of
        - turk
        - freesound
        - chime
        Note that this field is no longer required for the freesound
        and chime sources.
    :type source: string
:param tags: optional parameter for specifying what
particular noises we want to add. See above for the available tags.
:type tags: list
:param tag_distr: optional noise distribution
:type tag_distr: dict
"""
def __init__(self,
rng,
snr_min,
snr_max,
noise_dir,
source,
allow_downsampling=None,
index_file=None,
tags=None,
tag_distr=None):
# Define all required parameter maps here.
self.rng = rng
self.snr_min = snr_min
self.snr_max = snr_max
self.noise_dir = noise_dir
self.source = source
self.allow_downsampling = allow_downsampling
self.index_file = index_file
self.tags = tags
self.tag_distr = tag_distr
# When new noise sources are added, make sure to define the
# associated bookkeeping variables here.
self.turk_noise_files = []
self.turk_noise_dir = None
self.audio_index = audio_database.AudioIndex()
    def _init_data(self):
        """Preloads data from disk (e.g. the list of noise files) in an
        attempt to make later loading faster. If the data configuration
        remains the same, this function does nothing.
        """
noise_dir = self.noise_dir
index_file = self.index_file
source = self.source
if not index_file:
if source == TURK:
index_file = os.path.join(noise_dir, 'noise-samples-list')
logger.debug("index_file not provided; " + "defaulting to " +
index_file)
else:
if source != "":
assert source in USE_AUDIO_DATABASE_SOURCES, \
"{} not supported by audio_database".format(source)
index_file = os.path.join(noise_dir,
"audio_index_commercial.txt")
logger.debug("index_file not provided; " + "defaulting to " +
index_file)
if source == TURK:
if self.turk_noise_dir != noise_dir:
self.turk_noise_dir = noise_dir
self.turk_noise_files = _get_turk_noise_files(noise_dir,
index_file)
# elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
else:
if source != "":
assert source in USE_AUDIO_DATABASE_SOURCES, \
"{} not supported by audio_database".format(source)
self.audio_index.refresh_records_from_index_file(
self.noise_dir, index_file, self.tags)
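    # Illustrative default resolution (hypothetical noise_dir): with
    # source='turk' and noise_dir='/data/noise', index_file defaults to
    # '/data/noise/noise-samples-list'; for the audio_database sources it
    # defaults to '/data/noise/audio_index_commercial.txt'.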
def transform_audio(self, audio_segment):
"""Adds walla noise
:param audio_segment: Input audio
:type audio_segment: SpeechSegment
"""
# This handles the cases where the data source or directories change.
self._init_data
source = self.source
allow_downsampling = self.allow_downsampling
        if source == TURK:
            self._add_turk_noise(audio_segment, allow_downsampling)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            self._add_noise(audio_segment, allow_downsampling)
    def _sample_snr(self):
        """Returns a signal-to-noise ratio sampled uniformly from
        [`self.snr_min`, `self.snr_max`].
        """
snr_min = self.snr_min
snr_max = self.snr_max
sampled_snr = self.rng.uniform(snr_min, snr_max)
return sampled_snr
    def _add_turk_noise(self, audio_segment, allow_downsampling):
        """Adds a turk noise to the input audio.

        :param audio_segment: Input audio.
        :type audio_segment: SpeechSegment
        :param allow_downsampling: Indicates whether downsampling is allowed.
        :type allow_downsampling: boolean
        """
        read_size = 0
        if len(self.turk_noise_files) > 0:
            snr = self._sample_snr()
# Draw the noise file randomly from noise files that are
# slightly longer than the utterance
noise_bins = sorted(self.turk_noise_files.keys())
# note some bins can be empty, so we can't just round up
# to the nearest 2-sec interval
rounded_duration = get_first_larger(noise_bins,
audio_segment.duration)
noise_fname = \
self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0]
noise = SpeechSegment.from_wav_file(noise_fname)
logger.debug('noise_fname {}'.format(noise_fname))
logger.debug('snr {}'.format(snr))
read_size = len(noise) * 2
# May throw exceptions, but this is caught by
# AudioFeaturizer.get_audio_files.
audio_segment.add_noise(
noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
    def _add_noise(self, audio_segment, allow_downsampling):
        """Adds a noise indexed in audio_database.AudioIndex to the input
        audio segment. The segment is modified in place; nothing is returned.

        :param audio_segment: Input audio.
        :type audio_segment: SpeechSegment
        :param allow_downsampling: Indicates whether downsampling is allowed.
        :type allow_downsampling: boolean
        """
read_size = 0
tag_distr = self.tag_distr
if not self.audio_index.has_audio(tag_distr):
if tag_distr is None:
if not self.tags:
raise RuntimeError("The noise index does not have audio "
"files to sample from.")
else:
raise RuntimeError("The noise index does not have audio "
"files of the given tags to sample "
"from.")
else:
raise RuntimeError("The noise index does not have audio "
"files to match the target noise "
"distribution.")
else:
# Compute audio segment related statistics
audio_duration = audio_segment.duration
# Sample relevant augmentation parameters.
            snr = self._sample_snr()
            # We may not have a sufficiently long noise clip, so we need
            # to search iteratively.
min_duration = audio_duration + 0.25
for _ in range(FIND_NOISE_MAX_ATTEMPTS):
logger.debug("attempting to find noise of length "
"at least {}".format(min_duration))
success, record = \
self.audio_index.sample_audio(min_duration,
rng=self.rng,
distr=tag_distr)
if success is True:
noise_duration, read_size, noise_fname = record
# Assert after logging so we know
# what caused augmentation to fail.
logger.debug("noise_fname {}".format(noise_fname))
logger.debug("snr {}".format(snr))
assert noise_duration >= min_duration
break
                # Decrease the desired minimum duration linearly.
                # If the value becomes smaller than some threshold,
                # we halve the value instead.
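                # For example (illustrative numbers, not from the original
                # code): starting from min_duration = 10.25 s, successive
                # attempts look for noise of at least 10.25, 8.25, 6.25,
                # 4.25, 2.25, 1.125, ... seconds.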
if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
min_duration -= 2.0
else:
min_duration *= 0.5
if success is False:
logger.info("Failed to find a noise file")
return
diff_duration = audio_duration + 0.25 - noise_duration
if diff_duration >= 0.0:
# Here, the noise is shorter than the audio file, so
# we pad with zeros to make sure the noise sound is applied
# with a uniformly random shift.
noise = SpeechSegment.from_file(noise_fname)
noise = noise.pad_silence(diff_duration, sides="both")
else:
                # The noise clip is at least ~0.25 s longer than the audio
                # segment here.
diff_duration = int(noise_duration * audio_segment.sample_rate) - \
int(audio_duration * audio_segment.sample_rate) - \
int(0.02 * audio_segment.sample_rate)
                start = float(self.rng.randint(0, diff_duration)) / \
                    audio_segment.sample_rate
finish = min(start + audio_duration + 0.2, noise_duration)
noise = SpeechSegment.slice_from_file(noise_fname, start,
finish)
if len(noise) < len(audio_segment):
# This is to ensure that the noise clip is at least as
# long as the audio segment.
num_samples_to_pad = len(audio_segment) - len(noise)
# Padding this amount of silence on both ends ensures that
# the placement of the noise clip is uniformly random.
silence = SpeechSegment(
np.zeros(num_samples_to_pad), audio_segment.sample_rate)
noise = SpeechSegment.concatenate(silence, noise, silence)
audio_segment.add_noise(
noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
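

# Illustrative usage sketch (not part of the original module). The noise
# directory, index file, and WAV path below are hypothetical placeholders.
#
#     import random
#     from data_utils.speech import SpeechSegment
#     from data_utils.augmentor.noise_speech import NoiseSpeechAugmentor
#
#     rng = random.Random(0)
#     augmentor = NoiseSpeechAugmentor(
#         rng=rng,
#         snr_min=10.0,
#         snr_max=20.0,
#         noise_dir='/path/to/noise',  # hypothetical
#         source='turk',
#         index_file='/path/to/noise/noise-samples-list')  # hypothetical
#     audio = SpeechSegment.from_wav_file('/path/to/utterance.wav')  # hypothetical
#     augmentor.transform_audio(audio)  # noise is added to `audio` in place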