You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
319 lines
13 KiB
319 lines
13 KiB
"""Noise augmentation for speech: mixes noise clips into speech segments."""
|
|
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import bisect
import logging
import os
from collections import defaultdict

import numpy as np

from data_utils.speech import SpeechSegment

from . import base
from . import audio_database
|
|
|
|
# Identifier of the noise source whose files are listed in a plain
# "<filename> <duration>" index and binned by duration.
TURK = "turk"

# Noise sources that are looked up through audio_database.AudioIndex.
USE_AUDIO_DATABASE_SOURCES = frozenset(("freesound", "chime"))

# While the requested minimum noise duration is above this threshold the
# search shortens it by a fixed step; at or below it, the duration is halved.
HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0

# Upper bound on the number of sampling attempts made when searching for a
# sufficiently long noise clip.
FIND_NOISE_MAX_ATTEMPTS = 20

# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def get_first_smaller(items, value):
    """Return the largest element of sorted ``items`` strictly below ``value``.

    :param items: values sorted in ascending order
    :type items: list
    :param value: exclusive upper bound for the returned element
    :type value: int|float
    :return: the rightmost element of ``items`` that is smaller than
             ``value``
    :raises AssertionError: if no element of ``items`` is smaller than
                            ``value`` (this includes an empty ``items``)
    """
    # bisect_left yields the first position whose element is >= value, so the
    # element right before it is the largest one strictly below value.
    index = bisect.bisect_left(items, value) - 1
    # An index of -1 would silently wrap around to the last element. The
    # check is raised explicitly (rather than with `assert`) so that it still
    # runs under `python -O`; the exception type is kept for existing callers.
    if index < 0:
        raise AssertionError(
            'get_first_smaller failed! no item smaller than %s' % (value,))
    return items[index]
|
|
|
|
|
|
def get_first_larger(items, value):
    """Return the smallest element of sorted ``items`` strictly above ``value``.

    :param items: values sorted in ascending order
    :type items: list
    :param value: exclusive lower bound for the returned element
    :type value: int|float
    :return: the leftmost element of ``items`` that is greater than ``value``
    :raises AssertionError: if no element of ``items`` is greater than
                            ``value`` (this includes an empty ``items``)
    """
    # bisect_right yields the first position whose element is > value.
    index = bisect.bisect_right(items, value)
    # Raised explicitly (rather than with `assert`) so that the checks still
    # run under `python -O`; the exception type is kept for existing callers.
    if index >= len(items):
        raise AssertionError(
            "no noise bin exists for this audio length (%f)" % value)
    # Only reachable when `items` is not sorted; %s avoids truncating floats.
    if not items[index] > value:
        raise AssertionError(
            'get_first_larger failed! %s %s' % (items[index], value))
    return items[index]
|
|
|
|
|
|
def _get_turk_noise_files(noise_dir, index_file):
    """ Creates a map from duration bin => a list of noise filenames

    :param noise_dir: Directory of noise files; each filename read from the
                      index is joined onto this directory
    :type noise_dir: basestring
    :param index_file: Path of the noise list. It should contain one line
                       per noise (wav) file: the filename followed by its
                       duration in milliseconds, separated by whitespace.
                       Blank lines are ignored.
    :type index_file: basestring

    :return: A map of bins to noise files. Each key is an even number of
             seconds in [0, 64] and its value is the list of files whose
             duration lies in (key, key + 2]; anything longer than 64
             seconds lands in the 64 bin. The map is empty when
             ``index_file`` does not exist.
    :rtype: defaultdict
    :raises AssertionError: if a listed duration is not greater than zero
                            (raised by ``get_first_smaller``)
    """
    noise_files = defaultdict(list)
    if not os.path.exists(index_file):
        logger.error('No noise files were found at {}'.format(index_file))
        return noise_files
    num_noise_files = 0
    rounded_durations = list(range(0, 65, 2))
    with open(index_file, 'r') as fl:
        for line in fl:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at the end
                # of the index) instead of failing on fields[0].
                continue
            fname = os.path.join(noise_dir, fields[0])
            # The index stores milliseconds; bins are expressed in seconds.
            duration = float(fields[1]) / 1000
            # bin the noise files into length bins rounded by 2 sec
            bin_id = get_first_smaller(rounded_durations, duration)
            noise_files[bin_id].append(fname)
            num_noise_files += 1
    logger.info('Loaded {} turk noise files'.format(num_noise_files))
    return noise_files
|
|
|
|
|
|
class NoiseSpeechAugmentor(base.AugmentorBase):
    """ Noise addition block

    Mixes a randomly selected noise clip into a speech segment at a randomly
    sampled signal-to-noise ratio.

    :param rng: random number generator used for all sampling. This class
                calls ``uniform``, ``sample`` and ``randint`` on it
                (presumably a ``random.Random`` -- confirm against callers).
    :param snr_min: minimum signal-to-noise ratio
    :type snr_min: float
    :param snr_max: maximum signal-to-noise ratio
    :type snr_max: float
    :param noise_dir: root of where noise files are stored
    :type noise_dir: basestring
    :param source: select one from
                   - turk
                   - freesound
                   - chime
                   An empty string is also accepted and is handled through
                   the audio database, like freesound and chime.
    :type source: string
    :param allow_downsampling: forwarded unchanged to
                               ``audio_segment.add_noise``
    :param index_file: index of noises of interest in noise_dir. When not
                       given, it defaults to ``noise-samples-list`` (turk)
                       or ``audio_index_commercial.txt`` (other sources)
                       inside ``noise_dir``.
    :type index_file: basestring
    :param tags: optional parameter for specifying what
                 particular noises we want to add.
    :type tags: list
    :param tag_distr: optional noise distribution
    :type tag_distr: dict
    """

    def __init__(self,
                 rng,
                 snr_min,
                 snr_max,
                 noise_dir,
                 source,
                 allow_downsampling=None,
                 index_file=None,
                 tags=None,
                 tag_distr=None):
        # Define all required parameter maps here.
        self.rng = rng
        self.snr_min = snr_min
        self.snr_max = snr_max
        self.noise_dir = noise_dir
        self.source = source

        self.allow_downsampling = allow_downsampling
        self.index_file = index_file
        self.tags = tags
        self.tag_distr = tag_distr

        # When new noise sources are added, make sure to define the
        # associated bookkeeping variables here.
        # Same mapping type that _get_turk_noise_files returns, so the
        # attribute can be used as a dict even before data is loaded.
        self.turk_noise_files = defaultdict(list)
        self.turk_noise_dir = None
        self.audio_index = audio_database.AudioIndex()

    def _init_data(self):
        """ Loads the noise file listing for the configured source.

        For the turk source the index is re-read only when ``noise_dir``
        differs from the directory loaded last time. For every other source
        the audio index is asked to refresh its records on each call.

        :raises AssertionError: if ``source`` is neither turk, the empty
                                string, nor one of
                                USE_AUDIO_DATABASE_SOURCES
        """
        noise_dir = self.noise_dir
        index_file = self.index_file
        source = self.source
        is_turk = source == TURK

        # Raised explicitly (rather than with `assert`) so the validation
        # still runs under `python -O`.
        if (not is_turk and source != "" and
                source not in USE_AUDIO_DATABASE_SOURCES):
            raise AssertionError(
                "{} not supported by audio_database".format(source))

        if not index_file:
            if is_turk:
                index_file = os.path.join(noise_dir, 'noise-samples-list')
            else:
                index_file = os.path.join(noise_dir,
                                          "audio_index_commercial.txt")
            logger.debug("index_file not provided; " + "defaulting to " +
                         index_file)

        if is_turk:
            if self.turk_noise_dir != noise_dir:
                self.turk_noise_dir = noise_dir
                self.turk_noise_files = _get_turk_noise_files(noise_dir,
                                                              index_file)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            self.audio_index.refresh_records_from_index_file(
                self.noise_dir, index_file, self.tags)

    def transform_audio(self, audio_segment):
        """ Adds noise from the configured source to the input audio.

        The result of ``audio_segment.add_noise`` is not captured, so the
        segment is assumed to be modified in place.

        :param audio_segment: Input audio
        :type audio_segment: SpeechSegment
        """
        # This handles the cases where the data source or directories change.
        self._init_data()
        if self.source == TURK:
            self._add_turk_noise(audio_segment, self.allow_downsampling)
        # elif source == TODO_SUPPORT_NON_AUDIO_DATABASE_BASED_SOURCES:
        else:
            self._add_noise(audio_segment, self.allow_downsampling)

    def _sample_snr(self):
        """ Returns a float drawn with ``self.rng.uniform`` between
        ``self.snr_min`` and ``self.snr_max``.
        """
        return self.rng.uniform(self.snr_min, self.snr_max)

    def _add_turk_noise(self, audio_segment, allow_downsampling):
        """ Adds a turk noise to the input audio.

        Does nothing when no turk noise files have been loaded.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        :param allow_downsampling: indicates whether downsampling
                                   is allowed
        :type allow_downsampling: boolean
        :raises AssertionError: if no noise bin is longer than the audio
                                (raised by ``get_first_larger``)
        """
        if not self.turk_noise_files:
            return

        snr = self._sample_snr()
        # Draw the noise file randomly from noise files that are
        # slightly longer than the utterance
        noise_bins = sorted(self.turk_noise_files)
        # note some bins can be empty, so we can't just round up
        # to the nearest 2-sec interval
        rounded_duration = get_first_larger(noise_bins,
                                            audio_segment.duration)
        noise_fname = \
            self.rng.sample(self.turk_noise_files[rounded_duration], 1)[0]
        # NOTE(review): _add_noise loads clips with SpeechSegment.from_file;
        # confirm that from_wav_file exists on SpeechSegment.
        noise = SpeechSegment.from_wav_file(noise_fname)
        logger.debug('noise_fname {}'.format(noise_fname))
        logger.debug('snr {}'.format(snr))
        # May throw exceptions, but this is caught by
        # AudioFeaturizer.get_audio_files.
        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)

    def _add_noise(self, audio_segment, allow_downsampling):
        """ Adds a noise indexed in audio_database.AudioIndex.

        Nothing is returned. If no noise clip can be sampled within
        FIND_NOISE_MAX_ATTEMPTS attempts, the audio is left untouched.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        :param allow_downsampling: indicates whether downsampling
                                   is allowed
        :type allow_downsampling: boolean
        :raises RuntimeError: if the noise index has no audio matching the
                              configured tags / tag distribution
        """
        tag_distr = self.tag_distr
        if not self.audio_index.has_audio(tag_distr):
            if tag_distr is not None:
                raise RuntimeError("The noise index does not have audio "
                                   "files to match the target noise "
                                   "distribution.")
            if self.tags:
                raise RuntimeError("The noise index does not have audio "
                                   "files of the given tags to sample "
                                   "from.")
            raise RuntimeError("The noise index does not have audio "
                               "files to sample from.")

        # Compute audio segment related statistics
        audio_duration = audio_segment.duration
        # Preferred noise length: the audio length plus a small margin.
        target_duration = audio_duration + 0.25

        # Sample relevant augmentation parameters.
        snr = self._sample_snr()

        # Perhaps, we may not have a sufficiently long noise, so we need
        # to search iteratively.
        min_duration = target_duration
        success = False
        noise_duration = None
        noise_fname = None
        for _ in range(FIND_NOISE_MAX_ATTEMPTS):
            logger.debug("attempting to find noise of length "
                         "at least {}".format(min_duration))

            success, record = \
                self.audio_index.sample_audio(min_duration,
                                              rng=self.rng,
                                              distr=tag_distr)

            if success:
                # The middle field of the record is not needed here.
                noise_duration, _, noise_fname = record

                # Assert after logging so we know
                # what caused augmentation to fail.
                logger.debug("noise_fname {}".format(noise_fname))
                logger.debug("snr {}".format(snr))
                assert noise_duration >= min_duration
                break

            # Decrease the desired minimum duration linearly.
            # If the value becomes smaller than some threshold,
            # we half the value instead.
            if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
                min_duration -= 2.0
            else:
                min_duration *= 0.5

        if not success:
            logger.info("Failed to find a noise file")
            return

        diff_duration = target_duration - noise_duration
        if diff_duration >= 0.0:
            # Here, the noise is no longer than the audio plus its margin,
            # so we pad with zeros to make sure the noise sound is applied
            # with a uniformly random shift.
            noise = SpeechSegment.from_file(noise_fname)
            noise = noise.pad_silence(diff_duration, sides="both")
        else:
            # The noise clip is longer than the audio segment plus its
            # margin here, so a random window is cut out of it.
            sample_rate = audio_segment.sample_rate
            max_start_sample = int(noise_duration * sample_rate) - \
                int(audio_duration * sample_rate) - \
                int(0.02 * sample_rate)
            start = float(self.rng.randint(0, max_start_sample)) / \
                sample_rate
            finish = min(start + audio_duration + 0.2, noise_duration)
            noise = SpeechSegment.slice_from_file(noise_fname, start,
                                                  finish)

        if len(noise) < len(audio_segment):
            # This is to ensure that the noise clip is at least as
            # long as the audio segment.
            num_samples_to_pad = len(audio_segment) - len(noise)
            # Padding this amount of silence on both ends ensures that
            # the placement of the noise clip is uniformly random.
            silence = SpeechSegment(
                np.zeros(num_samples_to_pad), audio_segment.sample_rate)
            noise = SpeechSegment.concatenate(silence, noise, silence)

        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)
|