PaddleSpeech/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py

# This file includes routines for basic signal processing including framing and computing power spectra.
# Author: James Lyons 2012
import decimal

import numpy
import math
import logging


def round_half_up(number):
    """Round a number to the nearest integer, with ties going away from zero.

    Unlike the built-in ``round`` (which rounds ties to the nearest even
    integer), 2.5 becomes 3 and -2.5 becomes -3.

    :param number: the value to round. Anything accepted by ``decimal.Decimal``
        is supported, as well as NumPy scalar types (e.g. ``numpy.int64``,
        ``numpy.float32``).
    :returns: the rounded value as a Python ``int``.
    """
    try:
        value = decimal.Decimal(number)
    except TypeError:
        # decimal.Decimal rejects NumPy scalars that do not subclass a built-in
        # numeric type (int64, float32, ...). Convert them to the equivalent
        # native Python number first; anything else still raises TypeError.
        native = number.item() if hasattr(number, "item") else float(number)
        value = decimal.Decimal(native)
    return int(value.quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))


def rolling_window(a, window, step=1):
    """Return sliding windows over the last axis of an array without copying.

    See http://ellisvalentiner.com/post/2017-03-21-np-strides-trick

    :param a: input NumPy array; windows are taken along its last axis.
    :param window: number of samples in each window.
    :param step: number of samples between the starts of consecutive windows.
    :returns: a strided view of shape ``a.shape[:-1] + (num_windows, window)``.
        The view shares memory with ``a``, so overlapping windows alias the
        same samples.
    """
    # One extra trailing axis of length `window` that advances by a single
    # element of the last axis: entry [..., i, j] maps to a[..., i + j].
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    windows = numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
    # Subsample along the window-position axis (second to last). For 1-D input
    # this is the leading axis, so the result matches plain `[::step]`; for
    # N-D input it keeps the leading axes intact instead of striding over them.
    return windows[..., ::step, :]


def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
    """Split a signal into overlapping, windowed frames.

    The signal is zero-padded at the end so that the last frame is complete.

    :param sig: the audio signal to frame.
    :param frame_len: length of each frame measured in samples (rounded half up to an integer).
    :param frame_step: number of samples between the starts of consecutive frames (rounded half up to an integer).
    :param winfunc: callable that takes the frame length and returns the analysis window. By default no window is applied.
    :param stride_trick: if true, build the frames with the strided rolling window; otherwise gather them with an index matrix.
    :returns: an array of frames. Size is NUMFRAMES by frame_len.
    """
    n_samples = len(sig)
    frame_len = int(round_half_up(frame_len))
    frame_step = int(round_half_up(frame_step))

    # A signal that fits in one frame yields exactly one (zero-padded) frame.
    numframes = 1
    if n_samples > frame_len:
        numframes += int(math.ceil((1.0 * n_samples - frame_len) / frame_step))

    # Pad with zeros up to the end of the last frame.
    total_len = int((numframes - 1) * frame_step + frame_len)
    padded = numpy.concatenate((sig, numpy.zeros((total_len - n_samples,))))

    if stride_trick:
        window = winfunc(frame_len)
        frames = rolling_window(padded, window=frame_len, step=frame_step)
        return frames * window

    # Index matrix: row i holds the sample positions of frame i.
    sample_offsets = numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
    frame_starts = numpy.tile(numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
    indices = numpy.array(sample_offsets + frame_starts, dtype=numpy.int32)
    frames = padded[indices]
    window = numpy.tile(winfunc(frame_len), (numframes, 1))
    return frames * window


def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
    """Rebuild a 1-D signal from frames by overlap-add, undoing framesig.

    Each output sample is the sum of the frame values that cover it, divided by
    the sum of the window values (plus a tiny constant) that cover it.

    :param frames: the array of frames, one frame per row.
    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
    :param frame_len: length of each frame measured in samples (rounded half up to an integer).
    :param frame_step: number of samples between the starts of consecutive frames (rounded half up to an integer).
    :param winfunc: callable that takes the frame length and returns the analysis window. By default no window is applied.
    :returns: a 1-D signal.
    """
    frame_len = round_half_up(frame_len)
    frame_step = round_half_up(frame_step)
    numframes = numpy.shape(frames)[0]
    frame_width = numpy.shape(frames)[1]
    assert frame_width == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'

    # Row i of `indices` lists the output positions covered by frame i.
    frame_starts = numpy.arange(0, numframes * frame_step, frame_step)
    sample_offsets = numpy.arange(0, frame_len)
    indices = (frame_starts[:, None] + sample_offsets[None, :]).astype(numpy.int32)

    padlen = (numframes - 1) * frame_step + frame_len
    out_len = padlen if siglen <= 0 else siglen

    win = winfunc(frame_len)
    rec_signal = numpy.zeros((padlen,))
    window_correction = numpy.zeros((padlen,))

    for row, positions in enumerate(indices):
        # The small constant keeps the divisor from ever being exactly zero.
        window_correction[positions] = window_correction[positions] + win + 1e-15
        rec_signal[positions] = rec_signal[positions] + frames[row, :]

    return (rec_signal / window_correction)[0:out_len]


def magspec(frames, NFFT):
    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
        If NFFT < frame_len, the frames are truncated to NFFT samples and a warning is logged.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
    """
    frame_len = numpy.shape(frames)[1]
    if frame_len > NFFT:
        # logging.warn is a deprecated alias that emits a DeprecationWarning;
        # logging.warning is the supported spelling.
        logging.warning(
            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
            frame_len, NFFT)
    complex_spec = numpy.fft.rfft(frames, NFFT)
    return numpy.absolute(complex_spec)


def powspec(frames, NFFT):
    """Compute the power spectrum of each frame: the squared magnitude spectrum scaled by 1/NFFT.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use; passed through to magspec and used as the normalisation factor.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
    """
    magnitudes = magspec(frames, NFFT)
    scale = 1.0 / NFFT
    return scale * numpy.square(magnitudes)


def logpowspec(frames, NFFT, norm=1):
    """Compute the log power spectrum (10 * log10 of the power spectrum) of each frame.

    Power values at or below 1e-30 are raised to 1e-30 before taking the logarithm.

    :param frames: the array of frames. Each row is a frame.
    :param NFFT: the FFT length to use; passed through to powspec.
    :param norm: if truthy, the maximum over the whole result is subtracted so that the max value (across all frames) is 0.
    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
    """
    power = powspec(frames, NFFT)
    # Floor tiny values so the logarithm stays finite.
    too_small = power <= 1e-30
    power[too_small] = 1e-30
    log_power = 10 * numpy.log10(power)
    if not norm:
        return log_power
    return log_power - numpy.max(log_power)


def preemphasis(signal, coeff=0.95):
    """Perform preemphasis on the input signal.

    The first sample is kept unchanged; every later sample becomes
    ``signal[n] - coeff * signal[n - 1]``.

    :param signal: The signal to filter. May be a NumPy array or any sequence
        that ``numpy.asarray`` can convert (e.g. a list of numbers).
    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
    :returns: the filtered signal as a NumPy array. An empty input yields an
        empty array.
    """
    # Coerce sequences such as lists so the slice arithmetic below is
    # element-wise; this is a no-op for inputs that are already arrays.
    signal = numpy.asarray(signal)
    if signal.size == 0:
        # Nothing to filter, and signal[0] below would raise IndexError.
        return signal.copy()
    return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
E2E/Streaming Transformer/Conformer ASR (#578) * add cmvn and label smoothing loss layer * add layer for transformer * add glu and conformer conv * add torch compatiable hack, mask funcs * not hack size since it exists * add test; attention * add attention, common utils, hack paddle * add audio utils * conformer batch padding mask bug fix #223 * fix typo, python infer fix rnn mem opt name error and batchnorm1d, will be available at 2.0.2 * fix ci * fix ci * add encoder * refactor egs * add decoder * refactor ctc, add ctc align, refactor ckpt, add warmup lr scheduler, cmvn utils * refactor docs * add fix * fix readme * fix bugs, refactor collator, add pad_sequence, fix ckpt bugs * fix docstring * refactor data feed order * add u2 model * refactor cmvn, test * add utils * add u2 config * fix bugs * fix bugs * fix autograd maybe has problem when using inplace operation * refactor data, build vocab; add format data * fix text featurizer * refactor build vocab * add fbank, refactor feature of speech * refactor audio feat * refactor data preprare * refactor data * model init from config * add u2 bins * flake8 * can train * fix bugs, add coverage, add scripts * test can run * fix data * speed perturb with sox * add spec aug * fix for train * fix train logitc * fix logger * log valid loss, time dataset process * using np for speed perturb, remove some debug log of grad clip * fix logger * fix build vocab * fix logger name * using module logger as default * fix * fix install * reorder imports * fix board logger * fix logger * kaldi fbank and mfcc * fix cmvn and print prarams * fix add_eos_sos and cmvn * fix cmvn compute * fix logger and cmvn * fix subsampling, label smoothing loss, remove useless * add notebook test * fix log * fix tb logger * multi gpu valid * fix log * fix log * fix config * fix compute cmvn, need paddle 2.1 * add cmvn notebook * fix layer tools * fix compute cmvn * add rtf * fix decoding * fix layer tools * fix log, add avg script * more avg and test 
info * fix dataset pickle problem; using 2.1 paddle; num_workers can > 0; ckpt save in exp dir;fix setup.sh; * add vimrc * refactor tiny script, add transformer and stream conf * spm demo; librisppech scripts and confs * fix log * add librispeech scripts * refactor data pipe; fix conf; fix u2 default params * fix bugs * refactor aishell scripts * fix test * fix cmvn * fix s0 scripts * fix ds2 scripts and bugs * fix dev & test dataset filter * fix dataset filter * filter dev * fix ckpt path * filter test, since librispeech will cause OOM, but all test wer will be worse, since mismatch train with test * add comment * add syllable doc * fix ds2 configs * add doc * add pypinyin tools * fix decoder using blank_id=0 * mmseg with pybind11 * format code 3 years ago			`# This file includes routines for basic signal processing including framing and computing power spectra.`
			`# Author: James Lyons 2012`
			`import decimal`

			`import numpy`
			`import math`
			`import logging`


			`def round_half_up(number):`
			`return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))`


			`def rolling_window(a, window, step=1):`
			`# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick`
			`shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)`
			`strides = a.strides + (a.strides[-1],)`
			`return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]`


			`def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):`
			`"""Frame a signal into overlapping frames.`

			`:param sig: the audio signal to frame.`
			`:param frame_len: length of each frame measured in samples.`
			`:param frame_step: number of samples after the start of the previous frame that the next frame should begin.`
			`:param winfunc: the analysis window to apply to each frame. By default no window is applied.`
			`:param stride_trick: use stride trick to compute the rolling window and window multiplication faster`
			`:returns: an array of frames. Size is NUMFRAMES by frame_len.`
			`"""`
			`slen = len(sig)`
			`frame_len = int(round_half_up(frame_len))`
			`frame_step = int(round_half_up(frame_step))`
			`if slen <= frame_len:`
			`numframes = 1`
			`else:`
			`numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))`

			`padlen = int((numframes - 1) * frame_step + frame_len)`

			`zeros = numpy.zeros((padlen - slen,))`
			`padsignal = numpy.concatenate((sig, zeros))`
			`if stride_trick:`
			`win = winfunc(frame_len)`
			`frames = rolling_window(padsignal, window=frame_len, step=frame_step)`
			`else:`
			`indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(`
			`numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T`
			`indices = numpy.array(indices, dtype=numpy.int32)`
			`frames = padsignal[indices]`
			`win = numpy.tile(winfunc(frame_len), (numframes, 1))`

			`return frames * win`


			`def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):`
			`"""Does overlap-add procedure to undo the action of framesig.`

			`:param frames: the array of frames.`
			`:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.`
			`:param frame_len: length of each frame measured in samples.`
			`:param frame_step: number of samples after the start of the previous frame that the next frame should begin.`
			`:param winfunc: the analysis window to apply to each frame. By default no window is applied.`
			`:returns: a 1-D signal.`
			`"""`
			`frame_len = round_half_up(frame_len)`
			`frame_step = round_half_up(frame_step)`
			`numframes = numpy.shape(frames)[0]`
			`assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'`

			`indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(`
			`numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T`
			`indices = numpy.array(indices, dtype=numpy.int32)`
			`padlen = (numframes - 1) * frame_step + frame_len`

			`if siglen <= 0: siglen = padlen`

			`rec_signal = numpy.zeros((padlen,))`
			`window_correction = numpy.zeros((padlen,))`
			`win = winfunc(frame_len)`

			`for i in range(0, numframes):`
			`window_correction[indices[i, :]] = window_correction[`
			`indices[i, :]] + win + 1e-15 # add a little bit so it is never zero`
			`rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]`

			`rec_signal = rec_signal / window_correction`
			`return rec_signal[0:siglen]`


			`def magspec(frames, NFFT):`
			`"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).`

			`:param frames: the array of frames. Each row is a frame.`
			`:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.`
			`:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.`
			`"""`
			`if numpy.shape(frames)[1] > NFFT:`
			`logging.warn(`
			`'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',`
			`numpy.shape(frames)[1], NFFT)`
			`complex_spec = numpy.fft.rfft(frames, NFFT)`
			`return numpy.absolute(complex_spec)`


			`def powspec(frames, NFFT):`
			`"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).`

			`:param frames: the array of frames. Each row is a frame.`
			`:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.`
			`:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.`
			`"""`
			`return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))`


			`def logpowspec(frames, NFFT, norm=1):`
			`"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).`

			`:param frames: the array of frames. Each row is a frame.`
			`:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.`
			`:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.`
			`:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.`
			`"""`
			`ps = powspec(frames, NFFT);`
			`ps[ps <= 1e-30] = 1e-30`
			`lps = 10 * numpy.log10(ps)`
			`if norm:`
			`return lps - numpy.max(lps)`
			`else:`
			`return lps`


			`def preemphasis(signal, coeff=0.95):`
			`"""perform preemphasis on the input signal.`

			`:param signal: The signal to filter.`
			`:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.`
			`:returns: the filtered signal.`
			`"""`
			`return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])`