PaddleSpeech/data_utils/augmentor/noise_speech.py

"""Noise augmentation: mixes recorded noise clips into speech segments.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import bisect
import logging
import os
from collections import defaultdict

import numpy as np

from . import base
from . import audio_database
from data_utils.speech import SpeechSegment

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Value of ``source`` that selects the turk noise files.
TURK = "turk"
# Sources whose noises are looked up through audio_database.AudioIndex.
USE_AUDIO_DATABASE_SOURCES = frozenset(("freesound", "chime"))
# While the requested minimum noise duration is above this value, a failed
# search lowers it by 2.0; at or below this value the request is halved.
HALF_NOISE_LENGTH_MIN_THRESHOLD = 3.0
# Maximum number of sampling attempts made when searching for a noise clip.
FIND_NOISE_MAX_ATTEMPTS = 20


def get_first_smaller(items, value):
    """Return the largest element of sorted ``items`` strictly below ``value``.

    :param items: values sorted in ascending order
    :type items: list
    :param value: exclusive upper bound for the returned element
    :type value: int|float
    :return: the rightmost element of ``items`` that is smaller than ``value``
    :raises AssertionError: if no element of ``items`` is smaller than
        ``value`` (this includes an empty ``items``)
    """
    index = bisect.bisect_left(items, value) - 1
    # index == -1 means nothing lies below ``value``. Without this guard
    # items[-1] would wrap around to the largest element. The checks raise
    # explicitly so that they are not stripped when Python runs with -O.
    if index < 0:
        raise AssertionError(
            'get_first_smaller failed! no item is smaller than %s' % (value, ))
    if not items[index] < value:
        raise AssertionError(
            'get_first_smaller failed! %d %d' % (items[index], value))
    return items[index]


def get_first_larger(items, value):
    """Return the smallest element of sorted ``items`` strictly above ``value``.

    :param items: values sorted in ascending order
    :type items: list
    :param value: exclusive lower bound for the returned element
    :type value: int|float
    :return: the leftmost element of ``items`` that is greater than ``value``
    :raises AssertionError: if no element of ``items`` is greater than
        ``value`` (this includes an empty ``items``)
    """
    index = bisect.bisect_right(items, value)
    # Raise explicitly instead of using ``assert`` so the checks survive
    # running Python with -O (otherwise items[index] raises a bare IndexError).
    if index >= len(items):
        raise AssertionError(
            "no noise bin exists for this audio length (%f)" % value)
    if not items[index] > value:
        raise AssertionError(
            'get_first_larger failed! %d %d' % (items[index], value))
    return items[index]


def _get_turk_noise_files(noise_dir, index_file):
    """ Creates a map from duration bin => a list of noise filenames

    :param noise_dir: Directory that the file names listed in the index
        are joined onto
    :type noise_dir: basestring
    :param index_file: Path of the noise list
    :type index_file: basestring

    returns:noise_files (defaultdict): A map of bins to noise files.
        Each key is a bin edge in seconds taken from 0, 2, ..., 64 (the
        result of get_first_smaller for the file's duration), and the
        value is the list of noise files binned to it. An empty map is
        returned, and an error is logged, when index_file does not exist.

    Note: the index should contain one line per noise (wav) file: the file
        name followed by its duration in milliseconds, separated by
        whitespace. Blank lines are ignored.
    """
    noise_files = defaultdict(list)
    if not os.path.exists(index_file):
        logger.error('No noise files were found at {}'.format(index_file))
        return noise_files
    num_noise_files = 0
    rounded_durations = list(range(0, 65, 2))
    with open(index_file, 'r') as fl:
        for line in fl:
            # Split once; str.split() already ignores surrounding whitespace.
            fields = line.split()
            if not fields:
                # A blank line carries no entry; skip it rather than fail
                # with an IndexError.
                continue
            fname = os.path.join(noise_dir, fields[0])
            # The index stores milliseconds; bins are expressed in seconds.
            duration = float(fields[1]) / 1000
            # bin the noise files into length bins rounded by 2 sec
            bin_id = get_first_smaller(rounded_durations, duration)
            noise_files[bin_id].append(fname)
            num_noise_files += 1
    logger.info('Loaded {} turk noise files'.format(num_noise_files))
    return noise_files


class NoiseSpeechAugmentor(base.AugmentorBase):
    """ Noise addition block

    Mixes a noise clip into a speech segment at a signal-to-noise ratio
    drawn uniformly between `snr_min` and `snr_max`.

    :param rng: random generator; `uniform`, `sample` and `randint` are
        called on it (presumably a `random.Random` instance - confirm
        against the caller)
    :type rng: random.Random
    :param snr_min: minimum signal-to-noise ratio
    :type snr_min: float
    :param snr_max: maximum signal-to-noise ratio
    :type snr_max: float
    :param noise_dir: root of where noise files are stored
    :type noise_dir: basestring
    :param source: select one from
        - turk
        - freesound
        - chime
        An empty string is also accepted and is handled through the
        audio database, like freesound and chime.
    :type source: string
    :param allow_downsampling: forwarded unchanged to
        `audio_segment.add_noise`
    :type allow_downsampling: boolean
    :param index_file: index of noises of interest in noise_dir. When not
        given, defaults to "noise-samples-list" (turk) or
        "audio_index_commercial.txt" (other sources) inside noise_dir.
    :type index_file: basestring
    :param tags: optional parameter for specifying what
        particular noises we want to add; passed to the audio index when
        its records are refreshed.
    :type tags: list
    :param tag_distr: optional noise distribution; passed to the audio
        index when checking for and sampling noises.
    :type tag_distr: dict
    """

    def __init__(self,
                 rng,
                 snr_min,
                 snr_max,
                 noise_dir,
                 source,
                 allow_downsampling=None,
                 index_file=None,
                 tags=None,
                 tag_distr=None):
        # Define all required parameter maps here.
        self.rng = rng
        self.snr_min = snr_min
        self.snr_max = snr_max
        self.noise_dir = noise_dir
        self.source = source

        self.allow_downsampling = allow_downsampling
        self.index_file = index_file
        self.tags = tags
        self.tag_distr = tag_distr

        # When new noise sources are added, make sure to define the
        # associated bookkeeping variables here.
        # Same mapping type as _get_turk_noise_files returns, so that
        # .keys() is valid even before anything has been loaded.
        self.turk_noise_files = defaultdict(list)
        self.turk_noise_dir = None
        self.audio_index = audio_database.AudioIndex()

    def _init_data(self):
        """ Preloads stuff from disk (e.g. list of files, etc) in an attempt
        to make later loading faster.

        For the turk source the file list is reloaded only when `noise_dir`
        differs from the directory loaded last time. For every other source
        the audio index is asked to refresh its records on each call.

        :raises AssertionError: if `source` is neither turk, empty, nor one
            of USE_AUDIO_DATABASE_SOURCES
        """
        noise_dir = self.noise_dir
        index_file = self.index_file
        source = self.source
        use_turk = source == TURK

        # TODO: support sources that are not audio_database based.
        if not use_turk and source != "":
            assert source in USE_AUDIO_DATABASE_SOURCES, \
                "{} not supported by audio_database".format(source)

        if not index_file:
            if use_turk:
                default_name = 'noise-samples-list'
            else:
                default_name = "audio_index_commercial.txt"
            index_file = os.path.join(noise_dir, default_name)
            logger.debug("index_file not provided; " + "defaulting to " +
                         index_file)

        if use_turk:
            if self.turk_noise_dir != noise_dir:
                # Record the directory only after the load succeeded, so a
                # failed load is retried on the next call.
                self.turk_noise_files = _get_turk_noise_files(noise_dir,
                                                              index_file)
                self.turk_noise_dir = noise_dir
        else:
            self.audio_index.refresh_records_from_index_file(
                self.noise_dir, index_file, self.tags)

    def transform_audio(self, audio_segment):
        """Adds noise from the configured source to the given audio.

        The noise is applied by calling `audio_segment.add_noise`; this
        method itself returns nothing.

        :param audio_segment: Input audio
        :type audio_segment: SpeechSegment
        """
        # This handles the cases where the data source or directories change.
        self._init_data()
        allow_downsampling = self.allow_downsampling
        # TODO: support sources that are not audio_database based.
        if self.source == TURK:
            self._add_turk_noise(audio_segment, allow_downsampling)
        else:
            self._add_noise(audio_segment, allow_downsampling)

    def _sample_snr(self):
        """ Returns a float sampled by `self.rng.uniform` between
        `self.snr_min` and `self.snr_max`.
        """
        return self.rng.uniform(self.snr_min, self.snr_max)

    def _add_turk_noise(self, audio_segment, allow_downsampling):
        """ Adds a turk noise to the input audio.

        Does nothing when no turk noise files have been loaded.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        :param allow_downsampling: indicates whether downsampling
            is allowed
        :type allow_downsampling: boolean
        :raises AssertionError: if no noise bin lies above the audio
            duration (raised by get_first_larger)
        """
        if not self.turk_noise_files:
            return

        snr = self._sample_snr()
        # Draw the noise file randomly from noise files that are
        # slightly longer than the utterance
        noise_bins = sorted(self.turk_noise_files.keys())
        # note some bins can be empty, so we can't just round up
        # to the nearest 2-sec interval
        rounded_duration = get_first_larger(noise_bins,
                                            audio_segment.duration)
        candidates = self.turk_noise_files[rounded_duration]
        noise_fname = self.rng.sample(candidates, 1)[0]
        # Log before loading so a failing file can be identified.
        logger.debug('noise_fname {}'.format(noise_fname))
        logger.debug('snr {}'.format(snr))
        # NOTE(review): _add_noise loads clips with SpeechSegment.from_file;
        # confirm that from_wav_file exists on SpeechSegment.
        noise = SpeechSegment.from_wav_file(noise_fname)
        # May throw exceptions; per the original note these are caught by
        # AudioFeaturizer.get_audio_files.
        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)

    def _add_noise(self, audio_segment, allow_downsampling):
        """ Adds a noise indexed in audio_database.AudioIndex.

        The noise is applied by calling `audio_segment.add_noise`; nothing
        is returned. If no noise clip can be sampled within
        FIND_NOISE_MAX_ATTEMPTS attempts, a message is logged and the audio
        is left untouched.

        :param audio_segment: input audio
        :type audio_segment: SpeechSegment
        :param allow_downsampling: indicates whether downsampling
            is allowed
        :type allow_downsampling: boolean
        :raises RuntimeError: if the noise index has no audio matching the
            configured tags / tag distribution
        """
        tag_distr = self.tag_distr
        if not self.audio_index.has_audio(tag_distr):
            if tag_distr is not None:
                raise RuntimeError("The noise index does not have audio "
                                   "files to match the target noise "
                                   "distribution.")
            if self.tags:
                raise RuntimeError("The noise index does not have audio "
                                   "files of the given tags to sample "
                                   "from.")
            raise RuntimeError("The noise index does not have audio "
                               "files to sample from.")

        # Compute audio segment related statistics
        audio_duration = audio_segment.duration
        sample_rate = audio_segment.sample_rate

        # Sample relevant augmentation parameters.
        snr = self._sample_snr()

        # Ideally the noise is a little longer than the audio.
        target_duration = audio_duration + 0.25

        # Perhaps, we may not have a sufficiently long noise, so we need
        # to search iteratively.
        min_duration = target_duration
        found = False
        record = None
        for _ in range(FIND_NOISE_MAX_ATTEMPTS):
            logger.debug("attempting to find noise of length "
                         "at least {}".format(min_duration))

            success, record = \
                self.audio_index.sample_audio(min_duration,
                                              rng=self.rng,
                                              distr=tag_distr)
            # Truthiness rather than `is True`, so that any truthy flag
            # counts as success.
            if success:
                found = True
                break

            # Decrease the desired minimum duration linearly.
            # If the value becomes smaller than some threshold,
            # we half the value instead.
            if min_duration > HALF_NOISE_LENGTH_MIN_THRESHOLD:
                min_duration -= 2.0
            else:
                min_duration *= 0.5

        if not found:
            logger.info("Failed to find a noise file")
            return

        # The middle entry of the record is not needed here.
        noise_duration, _, noise_fname = record

        # Assert after logging so we know
        # what caused augmentation to fail.
        logger.debug("noise_fname {}".format(noise_fname))
        logger.debug("snr {}".format(snr))
        assert noise_duration >= min_duration

        diff_duration = target_duration - noise_duration
        if diff_duration >= 0.0:
            # Here, the noise is not longer than the target duration, so
            # we pad with zeros to make sure the noise sound is applied
            # with a uniformly random shift.
            noise = SpeechSegment.from_file(noise_fname)
            noise = noise.pad_silence(diff_duration, sides="both")
        else:
            # The noise clip is more than 0.25 longer than the audio
            # segment here, so cut a randomly positioned window out of it.
            max_start_sample = int(noise_duration * sample_rate) - \
                int(audio_duration * sample_rate) - \
                int(0.02 * sample_rate)
            start = float(self.rng.randint(0, max_start_sample)) / \
                sample_rate
            finish = min(start + audio_duration + 0.2, noise_duration)
            noise = SpeechSegment.slice_from_file(noise_fname, start,
                                                  finish)

        if len(noise) < len(audio_segment):
            # This is to ensure that the noise clip is at least as
            # long as the audio segment.
            num_samples_to_pad = len(audio_segment) - len(noise)
            # Padding this amount of silence on both ends ensures that
            # the placement of the noise clip is uniformly random.
            silence = SpeechSegment(
                np.zeros(num_samples_to_pad), sample_rate)
            noise = SpeechSegment.concatenate(silence, noise, silence)

        audio_segment.add_noise(
            noise, snr, rng=self.rng, allow_downsampling=allow_downsampling)