commit
d0bca1982e
@ -1 +1,5 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
Date: 2022-2-25, Author: Hui Zhang.
|
||||||
|
- Refactor architecture.
|
||||||
|
- DTW distance and MCD-style DTW.
|
||||||
|
@ -1,170 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from numpy import ndarray as array
|
|
||||||
|
|
||||||
from ..backends import depth_convert
|
|
||||||
from ..utils import ParameterError
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
'depth_augment',
|
|
||||||
'spect_augment',
|
|
||||||
'random_crop1d',
|
|
||||||
'random_crop2d',
|
|
||||||
'adaptive_spect_augment',
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def randint(high: int) -> int:
    """Draw one random integer from the half-open range [0, high).

    Helper for the random data augmentation routines. The value comes from
    ``np.random.randint`` and is converted to a built-in ``int``.
    """
    sample = np.random.randint(0, high=high)
    return int(sample)
|
|
||||||
|
|
||||||
|
|
||||||
def rand() -> float:
    """Draw one random floating-point number from the half-open range [0, 1).

    Helper for the random data augmentation routines.

    Returns:
        A built-in ``float`` sampled with ``np.random.rand``.
    """
    # Ask NumPy for a scalar instead of a 1-element array: calling float() on
    # an array with ndim > 0 is deprecated since NumPy 1.25.
    return float(np.random.rand())
|
|
||||||
|
|
||||||
|
|
||||||
def depth_augment(y: array,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> array:
    """Audio depth augmentation.

    Simulates the distortion brought by quantization: one entry of ``choices``
    is drawn at random (weighted by ``probs``), ``y`` is converted to that
    depth with ``depth_convert`` and then converted back to its own dtype.

    Parameters:
        y: the input signal; ``y.dtype`` is the depth that is restored.
        choices: candidate intermediate depths.
        probs: sampling probability of each entry of ``choices``.
    Returns:
        The signal after the depth round trip.
    """
    num_choices, num_probs = len(choices), len(probs)
    assert num_probs == num_choices, (
        'number of choices {} must be equal to size of probs {}'.format(
            num_choices, num_probs))

    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype
    quantized = depth_convert(y, target_depth)
    return depth_convert(quantized, original_depth)
|
|
||||||
|
|
||||||
|
|
||||||
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                           level: float=0.1) -> array:
    """Do adaptive spectrogram augmentation.

    Zeroes random bands of ``spect`` along the time and frequency axes. Both
    the number of bands (``int(10 * level)`` per axis) and their width
    (``int(size * level * 0.5)``) are governed by ``level``, which ranges from
    0 to 1, with 0 meaning no augmentation.

    Parameters:
        spect: the 2d spectrogram; it is modified in place.
        tempo_axis: 0 if the first axis is time, any other value if the
            second axis is time.
        level: the strength of the augmentation.
    Returns:
        The same ``spect`` object, after masking.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    time_first = tempo_axis == 0
    rows, cols = spect.shape
    nt, nf = (rows, cols) if time_first else (cols, rows)

    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)

    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    # Time bands are drawn first, then frequency bands.
    for _ in range(num_time_mask):
        begin = randint(nt - time_mask_width)
        band = slice(begin, begin + time_mask_width)
        if time_first:
            spect[band, :] = 0
        else:
            spect[:, band] = 0

    for _ in range(num_freq_mask):
        begin = randint(nf - freq_mask_width)
        band = slice(begin, begin + freq_mask_width)
        if time_first:
            spect[:, band] = 0
        else:
            spect[band, :] = 0

    return spect
|
|
||||||
|
|
||||||
|
|
||||||
def spect_augment(spect: array,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> array:
    """Do spectrogram augmentation in both time and freq axis.

    A random number of bands with a random width is zeroed along the time
    axis and along the frequency axis. The mask counts are drawn from
    [0, max_*_mask) and the widths from [0, max_*_mask_width).

    Reference:
        The masking scheme resembles SpecAugment (Park et al., 2019) --
        NOTE(review): the original docstring left this entry empty; confirm.

    Parameters:
        spect: the 2d spectrogram; it is modified in place.
        tempo_axis: 0 if the first axis is time, any other value if the
            second axis is time.
        max_time_mask: exclusive upper bound on the number of time masks.
        max_freq_mask: exclusive upper bound on the number of freq masks.
        max_time_mask_width: exclusive upper bound on the time mask width.
        max_freq_mask_width: exclusive upper bound on the freq mask width.
    Returns:
        The same ``spect`` object, after masking.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    time_first = tempo_axis == 0
    if time_first:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)

    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)

    # A drawn width that reaches the axis length would make
    # ``randint(size - width)`` fail with a non-positive upper bound, so short
    # spectrograms crashed at random. Clamp after drawing, which leaves the
    # random stream untouched whenever the width already fits.
    time_mask_width = min(time_mask_width, max(nt - 1, 0))
    freq_mask_width = min(freq_mask_width, max(nf - 1, 0))
    # Nothing can be masked along an empty axis.
    if nt == 0:
        num_time_mask = 0
    if nf == 0:
        num_freq_mask = 0

    for _ in range(num_time_mask):
        start = randint(nt - time_mask_width)
        band = slice(start, start + time_mask_width)
        if time_first:
            spect[band, :] = 0
        else:
            spect[:, band] = 0

    for _ in range(num_freq_mask):
        start = randint(nf - freq_mask_width)
        band = slice(start, start + freq_mask_width)
        if time_first:
            spect[:, band] = 0
        else:
            spect[band, :] = 0

    return spect
|
|
||||||
|
|
||||||
|
|
||||||
def random_crop1d(y: array, crop_len: int) -> array:
    """Do random cropping on 1d input signal.

    The input is a 1d signal, typically a sound waveform.

    Parameters:
        y: the 1d input signal.
        crop_len: the number of samples to keep.
    Returns:
        A slice of ``y`` of length ``crop_len`` starting at a random offset.
    Raises:
        ValueError: if ``y`` is not one-dimensional.
    """
    if y.ndim != 1:
        # The original check only evaluated this message as a bare string
        # expression and never rejected the input.
        raise ValueError('only accept 1d tensor or numpy array')
    n = len(y)
    if n == crop_len:
        # Only one crop exists; ``randint(0)`` would raise on this boundary.
        return y[0:crop_len]
    idx = randint(n - crop_len)
    return y[idx:idx + crop_len]
|
|
||||||
|
|
||||||
|
|
||||||
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
    """Do random cropping for 2D array, typically a spectrogram.

    The cropping is done in temporal direction on the time-freq input signal;
    all other axes are kept whole.

    Parameters:
        s: the input array.
        crop_len: the number of entries to keep along ``tempo_axis``.
        tempo_axis: the axis along which to crop.
    Returns:
        The cropped slice of ``s``.
    Raises:
        ParameterError: if ``tempo_axis`` is not smaller than ``s.ndim``.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    n = s.shape[tempo_axis]
    # When the crop covers the whole axis there is exactly one valid offset;
    # ``randint(high=0)`` would raise on this boundary.
    idx = 0 if n == crop_len else randint(high=n - crop_len)
    window = [slice(None)] * s.ndim
    window[tempo_axis] = slice(idx, idx + crop_len)
    return s[tuple(window)]
|
|
@ -1,461 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import math
|
|
||||||
from functools import partial
|
|
||||||
from typing import Optional
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
import paddle
|
|
||||||
import paddle.nn as nn
|
|
||||||
|
|
||||||
from .window import get_window
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
'Spectrogram',
|
|
||||||
'MelSpectrogram',
|
|
||||||
'LogMelSpectrogram',
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool=False) -> Union[paddle.Tensor, float]:
    """Convert Hz to Mels.

    Parameters:
        freq: the input tensor of arbitrary shape, or a single floating point number.
        htk: use HTK formula to do the conversion.
            The default value is False.
    Returns:
        The frequencies represented in Mel-scale.
    """
    is_tensor = isinstance(freq, paddle.Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise scale: linear below min_log_hz, log above.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    linear = (freq - f_min) / f_sp

    if is_tensor:
        # 1e-10 keeps the log finite for zero-valued entries.
        log_part = min_log_mel + paddle.log(
            freq / min_log_hz + 1e-10) / logstep
        in_log_region = (freq > min_log_hz).astype(freq.dtype)
        # Blend by mask: log formula where the mask is 1, linear elsewhere.
        return log_part * in_log_region + linear * (1 - in_log_region)

    if freq >= min_log_hz:
        return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
    return linear
|
|
||||||
|
|
||||||
|
|
||||||
def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool=False) -> Union[float, paddle.Tensor]:
    """Convert mel bin numbers to frequencies.

    Parameters:
        mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
        htk: use HTK formula to do the conversion.
    Returns:
        The frequencies represented in hz.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise scale: linear below min_log_mel, exponential above.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    linear = f_min + f_sp * mel

    if isinstance(mel, paddle.Tensor):
        exp_part = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
        in_log_region = (mel > min_log_mel).astype(mel.dtype)
        # Blend by mask: exponential formula where the mask is 1, linear elsewhere.
        return exp_part * in_log_region + linear * (1 - in_log_region)

    if mel >= min_log_mel:
        return min_log_hz * math.exp(logstep * (mel - min_log_mel))
    return linear
|
|
||||||
|
|
||||||
|
|
||||||
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute mel frequencies.

    ``n_mels`` points are spaced uniformly on the mel scale between the mel
    values of ``f_min`` and ``f_max`` and then converted back with
    ``mel_to_hz``.

    Parameters:
        n_mels(int): number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
        htk(bool): whether to use htk formula.
        dtype(str): the datatype of the return frequencies.
    Returns:
        The result of ``mel_to_hz`` applied to the uniformly spaced mel points.
    """
    mel_low = hz_to_mel(f_min, htk=htk)
    mel_high = hz_to_mel(f_max, htk=htk)
    mel_grid = paddle.linspace(mel_low, mel_high, n_mels, dtype=dtype)
    return mel_to_hz(mel_grid, htk=htk)
|
|
||||||
|
|
||||||
|
|
||||||
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute fourier frequencies.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        dtype(str): the datatype of the return frequencies.
    Returns:
        ``1 + n_fft // 2`` frequencies in hz, evenly spaced from 0 to ``sr / 2``.
    """
    nyquist = float(sr) / 2
    num_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, nyquist, num_bins, dtype=dtype)
|
|
||||||
|
|
||||||
|
|
||||||
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute fbank matrix.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
            If None, ``sr / 2`` is used.
        htk: whether to use htk formula.
        norm(str|float): 'slaney' scales every filter by 2 / (width of its band);
            an int or float applies p-norm normalization along the last axis;
            any other value leaves the weights unnormalized.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2

    # Start from an all-zero filter bank.
    num_bins = int(1 + n_fft // 2)
    weights = paddle.zeros((n_mels, num_bins), dtype=dtype)

    # Center freqs of each FFT bin
    bin_freqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # n_mels + 2 points: filter i uses points i, i + 1 and i + 2.
    band_edges = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    edge_gaps = band_edges[1:] - band_edges[:-1]
    # Pairwise difference between every band edge and every FFT bin frequency.
    ramps = band_edges.unsqueeze(1) - bin_freqs.unsqueeze(0)

    for band in range(n_mels):
        # lower and upper slopes for all bins
        rising = -ramps[band] / edge_gaps[band]
        falling = ramps[band + 2] / edge_gaps[band + 1]

        # .. then intersect them with each other and zero
        weights[band] = paddle.maximum(
            paddle.zeros_like(rising), paddle.minimum(rising, falling))

    # Slaney-style mel is scaled to be approx constant energy per channel
    if norm == 'slaney':
        enorm = 2.0 / (band_edges[2:n_mels + 2] - band_edges[:n_mels])
        weights *= enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
|
|
||||||
|
|
||||||
|
|
||||||
def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
    stable way.

    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value. If smaller than 1.0, the db level
            of the signal will be pulled up accordingly. Otherwise, the db level
            is pushed down.
        amin(float): the minimum value of input magnitude, below which the input
            magnitude is clipped(to amin).
        top_db(float): if given, values of the result lower than
            ``max(result) - top_db`` are raised to that floor.
    Returns:
        The spectrogram in log-scale.
    Raises:
        Exception: if ``amin`` or ``ref_value`` is not strictly positive, or
            ``top_db`` is negative.
    shape:
        input: any shape
        output: same as input
    """
    if amin <= 0:
        raise Exception("amin must be strictly positive")

    if ref_value <= 0:
        raise Exception("ref_value must be strictly positive")

    ones = paddle.ones_like(magnitude)
    # Floor the input at amin so the logarithm stays finite.
    floored = paddle.maximum(ones * amin, magnitude)
    log_spec = 10.0 * paddle.log10(floored)
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise Exception("top_db must be non-negative")
    floor_db = log_spec.max() - top_db
    return paddle.maximum(log_spec, ones * floor_db)
|
|
||||||
|
|
||||||
|
|
||||||
class Spectrogram(nn.Layer):
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
        """Compute spectrogram of a given signal, typically an audio waveform.

        The spectrogram is computed as the squared magnitude of the short-time
        Fourier transformation.

        Parameters:
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT, passed through to
                ``paddle.signal.stft``. The default value is None.
                NOTE(review): the previous docs stated None means win_length//4 -- confirm
                against the stft documentation.
            win_length: the window length of the short time FFT. If None, it is set to same as n_fft.
                The default value is None.
            window(str): the name of the window function applied to the signal before the
                Fourier transform; it is resolved with ``get_window``.
                The default value is 'hann'.
            center(bool): passed through to ``paddle.signal.stft``.
                The default value is True.
            pad_mode(str): passed through to ``paddle.signal.stft``.
                The default value is 'reflect'.
            dtype(str): the data type of the window.
        """
        super().__init__()

        win_length = n_fft if win_length is None else win_length
        window_tensor = get_window(
            window, win_length, fftbins=True, dtype=dtype)

        # Bind every STFT setting once so forward() only supplies the signal.
        stft_settings = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window_tensor,
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_settings)

    def forward(self, x):
        """Return the squared magnitude of the STFT of ``x``."""
        magnitude = paddle.abs(self._stft(x))
        return paddle.square(magnitude)
|
|
||||||
|
|
||||||
|
|
||||||
class MelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str=paddle.float32):
        """Compute the melspectrogram of a given signal, typically an audio waveform.

        The melspectrogram is also known as filterbank or fbank feature in audio community.
        It is computed by multiplying spectrogram with Mel filter bank matrix.

        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT, forwarded to ``Spectrogram``.
                The default value is None.
            win_length: the window length of the short time FFT, forwarded to ``Spectrogram``.
                The default value is None.
            window(str): the name of the window function, forwarded to ``Spectrogram``.
                The default value is 'hann'.
            center(bool): forwarded to ``Spectrogram``.
                The default value is True.
            pad_mode(str): forwarded to ``Spectrogram``.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zero.
                If None, ``sr // 2`` is used for the fbank matrix.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            dtype(str): the datatype of the window and of the fbank matrix.
        """
        super().__init__()

        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)

        # The attributes keep the arguments exactly as passed; in particular
        # self.f_max stays None when no upper cut-off was given.
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max
        self.htk = htk
        self.norm = norm

        if f_max is None:
            # NOTE(review): integer division, while compute_fbank_matrix itself
            # defaults to float(sr) / 2 -- the two differ for odd sample rates.
            f_max = sr // 2

        self.fbank_matrix = compute_fbank_matrix(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)  # float64 for better numerical results
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x):
        """Return the fbank matrix multiplied with the spectrogram of ``x``."""
        power_spect = self._spectrogram(x)
        return paddle.matmul(self.fbank_matrix, power_spect)
|
|
||||||
|
|
||||||
|
|
||||||
class LogMelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
        typically an audio waveform.

        The mel spectrogram is computed by ``MelSpectrogram`` and converted to
        decibels by ``power_to_db``.

        Parameters:
            sr(int): the audio sample rate.
                The default value is 22050.
            n_fft(int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length(int|None): the hop length of the short time FFT, forwarded to ``MelSpectrogram``.
                The default value is None.
            win_length: the window length of the short time FFT, forwarded to ``MelSpectrogram``.
                The default value is None.
            window(str): the name of the window function, forwarded to ``MelSpectrogram``.
                The default value is 'hann'.
            center(bool): forwarded to ``MelSpectrogram``.
                The default value is True.
            pad_mode(str): forwarded to ``MelSpectrogram``.
                The default value is 'reflect'.
            n_mels(int): the mel bins.
            f_min(float): the lower cut-off frequency, below which the filter response is zero.
            f_max(float): the upper cut-off frequency, above which the filter response is zero.
            htk(bool): whether to use HTK formula in computing fbank matrix.
            norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
            ref_value(float): the reference value. If smaller than 1.0, the db level
                of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
            amin(float): the minimum value of input magnitude, below which the input
                magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db(float): forwarded to ``power_to_db``; if given, values lower than
                the maximum minus top_db are raised to that floor.
            dtype(str): the datatype forwarded to ``MelSpectrogram``.
        """
        super().__init__()

        self._melspectrogram = MelSpectrogram(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)

        # Settings of the decibel conversion applied in forward().
        self.ref_value = ref_value
        self.amin = amin
        self.top_db = top_db

    def forward(self, x):
        """Return the mel spectrogram of ``x`` converted with ``power_to_db``."""
        mel_power = self._melspectrogram(x)
        return power_to_db(
            mel_power,
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
|
|
@ -0,0 +1,22 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from . import compliance
|
||||||
|
from . import datasets
|
||||||
|
from . import features
|
||||||
|
from . import functional
|
||||||
|
from . import io
|
||||||
|
from . import metric
|
||||||
|
from . import sox_effects
|
||||||
|
from .backends import load
|
||||||
|
from .backends import save
|
@ -0,0 +1,19 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .soundfile_backend import depth_convert
|
||||||
|
from .soundfile_backend import load
|
||||||
|
from .soundfile_backend import normalize
|
||||||
|
from .soundfile_backend import resample
|
||||||
|
from .soundfile_backend import save
|
||||||
|
from .soundfile_backend import to_mono
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,638 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from torchaudio(https://github.com/pytorch/audio)
|
||||||
|
import math
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import Tensor
|
||||||
|
|
||||||
|
from ..functional import create_dct
|
||||||
|
from ..functional.window import get_window
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'spectrogram',
|
||||||
|
'fbank',
|
||||||
|
'mfcc',
|
||||||
|
]
|
||||||
|
|
||||||
|
# window types
|
||||||
|
HANNING = 'hann'
|
||||||
|
HAMMING = 'hamming'
|
||||||
|
POVEY = 'povey'
|
||||||
|
RECTANGULAR = 'rect'
|
||||||
|
BLACKMAN = 'blackman'
|
||||||
|
|
||||||
|
|
||||||
|
def _get_epsilon(dtype):
    """Return the constant 1e-07 as a paddle tensor of the given ``dtype``."""
    epsilon = paddle.to_tensor(1e-07, dtype=dtype)
    return epsilon
|
||||||
|
|
||||||
|
|
||||||
|
def _next_power_of_2(x: int) -> int:
|
||||||
|
return 1 if x == 0 else 2**(x - 1).bit_length()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_strided(waveform: Tensor,
                 window_size: int,
                 window_shift: int,
                 snip_edges: bool) -> Tensor:
    """Cut a 1-D waveform into overlapping frames.

    Args:
        waveform (Tensor): A 1-D waveform tensor.
        window_size (int): Number of samples per frame.
        window_shift (int): Number of samples between frame starts.
        snip_edges (bool): If True, only frames that fit entirely inside the
            waveform are kept. If False, the waveform is extended on both
            sides with mirrored samples before framing.

    Returns:
        Tensor: Frames with shape (m, window_size), or an empty (0, 0) tensor
            when `snip_edges` is True and the waveform is shorter than one
            window.
    """
    assert waveform.dim() == 1
    total = waveform.shape[0]

    if not snip_edges:
        mirrored = paddle.flip(waveform, [0])
        num_frames = (total + (window_shift // 2)) // window_shift
        pad = window_size // 2 - window_shift // 2
        if pad > 0:
            # Prepend the last `pad` samples of the flipped signal, i.e. a
            # mirror of the waveform's beginning.
            head = mirrored[-pad:]
            waveform = paddle.concat((head, waveform, mirrored), axis=0)
        else:
            # pad <= 0: drop the first -pad samples instead of padding.
            waveform = paddle.concat((waveform[-pad:], mirrored), axis=0)
    elif total < window_size:
        return paddle.empty((0, 0), dtype=waveform.dtype)
    else:
        num_frames = 1 + (total - window_size) // window_shift

    frames = paddle.signal.frame(waveform, window_size, window_shift)
    return frames[:, :num_frames].T
|
||||||
|
|
||||||
|
|
||||||
|
def _feature_window_function(
        window_type: str,
        window_size: int,
        blackman_coeff: float,
        dtype: int, ) -> Tensor:
    """Build the analysis window applied to every frame.

    Args:
        window_type (str): One of HANNING, HAMMING, POVEY, RECTANGULAR or
            BLACKMAN.
        window_size (int): Length of the window in samples.
        blackman_coeff (float): Coefficient used only by the Blackman window.
        dtype: Data type of the returned tensor.

    Returns:
        Tensor: The window, with `window_size` elements.

    Raises:
        Exception: If `window_type` is not one of the supported names.
    """
    if window_type == HANNING:
        return get_window('hann', window_size, fftbins=False, dtype=dtype)
    if window_type == HAMMING:
        return get_window('hamming', window_size, fftbins=False, dtype=dtype)
    if window_type == POVEY:
        # Povey window: a Hann window raised to the power 0.85.
        hann = get_window('hann', window_size, fftbins=False, dtype=dtype)
        return hann.pow(0.85)
    if window_type == RECTANGULAR:
        return paddle.ones([window_size], dtype=dtype)
    if window_type == BLACKMAN:
        step = 2 * math.pi / (window_size - 1)
        positions = paddle.arange(window_size, dtype=dtype)
        first_term = 0.5 * paddle.cos(step * positions)
        second_term = (0.5 - blackman_coeff) * paddle.cos(2 * step * positions)
        return (blackman_coeff - first_term + second_term).astype(dtype)
    raise Exception('Invalid window type ' + window_type)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_log_energy(strided_input: Tensor, epsilon: Tensor,
                    energy_floor: float) -> Tensor:
    """Compute the log energy of each frame.

    Args:
        strided_input (Tensor): Frames; energy is summed over axis 1.
        epsilon (Tensor): Lower bound applied to the energy before the log.
        energy_floor (float): If non-zero, the log energy is additionally
            floored at log(energy_floor).

    Returns:
        Tensor: Log energy, one value per frame.
    """
    frame_energy = strided_input.pow(2).sum(1)
    log_energy = paddle.maximum(frame_energy, epsilon).log()
    if energy_floor != 0.0:
        floor = paddle.to_tensor(
            math.log(energy_floor), dtype=strided_input.dtype)
        log_energy = paddle.maximum(log_energy, floor)
    return log_energy
|
||||||
|
|
||||||
|
|
||||||
|
def _get_waveform_and_window_properties(
        waveform: Tensor,
        channel: int,
        sr: int,
        frame_shift: float,
        frame_length: float,
        round_to_power_of_two: bool,
        preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]:
    """Select one channel and derive the framing sizes in samples.

    Args:
        waveform (Tensor): Waveform indexed as [channel, sample].
        channel (int): Channel to extract; negative values select channel 0.
        sr (int): Sample rate of the waveform.
        frame_shift (float): Frame shift in milliseconds.
        frame_length (float): Frame length in milliseconds.
        round_to_power_of_two (bool): Whether the padded window size is
            rounded up to the next power of two.
        preemphasis_coefficient (float): Only validated here; must be in
            [0, 1].

    Returns:
        Tuple[Tensor, int, int, int]: The selected 1-D waveform, the window
            shift, the window size and the padded window size (in samples).
    """
    if channel < 0:
        channel = 0
    num_channels = waveform.shape[0]
    assert channel < num_channels, (
        'Invalid channel {} for size {}'.format(channel, num_channels))
    waveform = waveform[channel, :]  # size (n)

    # frame_shift and frame_length are given in milliseconds
    window_shift = int(sr * frame_shift * 0.001)
    window_size = int(sr * frame_length * 0.001)
    if round_to_power_of_two:
        padded_window_size = _next_power_of_2(window_size)
    else:
        padded_window_size = window_size

    num_samples = len(waveform)
    assert 2 <= window_size <= num_samples, (
        'choose a window size {} that is [2, {}]'.format(window_size,
                                                         num_samples))
    assert 0 < window_shift, '`window_shift` must be greater than 0'
    assert padded_window_size % 2 == 0, (
        'the padded `window_size` must be divisible by two.'
        ' use `round_to_power_of_two` or change `frame_length`')
    assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]'
    assert sr > 0, '`sr` must be greater than zero'
    return waveform, window_shift, window_size, padded_window_size
|
||||||
|
|
||||||
|
|
||||||
|
def _get_window(waveform: Tensor,
                padded_window_size: int,
                window_size: int,
                window_shift: int,
                window_type: str,
                blackman_coeff: float,
                snip_edges: bool,
                raw_energy: bool,
                energy_floor: float,
                dither: float,
                remove_dc_offset: bool,
                preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]:
    """Frame a waveform and apply dither, DC removal, pre-emphasis and windowing.

    Args:
        waveform (Tensor): A 1-D waveform tensor.
        padded_window_size (int): Frame length after zero-padding on the right.
        window_size (int): Frame length in samples before padding.
        window_shift (int): Number of samples between frame starts.
        window_type (str): Name of the window function.
        blackman_coeff (float): Coefficient for the Blackman window.
        snip_edges (bool): Passed to the framing step; see `_get_strided`.
        raw_energy (bool): If True, log energy is computed before
            pre-emphasis and windowing; otherwise after windowing and padding.
        energy_floor (float): Floor applied to the energy (0.0 disables it).
        dither (float): Scale of the Gaussian noise added to the frames
            (0.0 disables dithering).
        remove_dc_offset (bool): Whether to subtract each frame's mean.
        preemphasis_coefficient (float): Pre-emphasis coefficient
            (0.0 disables pre-emphasis).

    Returns:
        Tuple[Tensor, Tensor]: The processed frames with shape
            (m, padded_window_size) and the log energy of each frame, shape (m).
    """
    dtype = waveform.dtype
    epsilon = _get_epsilon(dtype)

    # (m, window_size)
    strided_input = _get_strided(waveform, window_size, window_shift,
                                 snip_edges)

    if dither != 0.0:
        # Sample standard-normal noise directly. A Box-Muller transform needs
        # two independent uniform samples; reusing one sample for both the
        # radius and the angle does not yield a Gaussian distribution.
        rand_gauss = paddle.randn(strided_input.shape, dtype=dtype)
        strided_input = strided_input + rand_gauss * dither

    if remove_dc_offset:
        row_means = paddle.mean(strided_input, axis=1).unsqueeze(1)  # (m, 1)
        strided_input = strided_input - row_means

    if raw_energy:
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # (m)

    if preemphasis_coefficient != 0.0:
        # Replicate the first sample of each frame so that every sample has a
        # predecessor: (m, window_size + 1)
        offset_strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (1, 0),
            data_format='NCL',
            mode='replicate').squeeze(0)
        strided_input = (strided_input - preemphasis_coefficient *
                         offset_strided_input[:, :-1])

    window_function = _feature_window_function(
        window_type, window_size, blackman_coeff,
        dtype).unsqueeze(0)  # (1, window_size)
    strided_input = strided_input * window_function  # (m, window_size)

    # (m, padded_window_size)
    if padded_window_size != window_size:
        padding_right = padded_window_size - window_size
        strided_input = paddle.nn.functional.pad(
            strided_input.unsqueeze(0), (0, padding_right),
            data_format='NCL',
            mode='constant',
            value=0).squeeze(0)

    if not raw_energy:
        signal_log_energy = _get_log_energy(strided_input, epsilon,
                                            energy_floor)  # (m)

    return strided_input, signal_log_energy
|
||||||
|
|
||||||
|
|
||||||
|
def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
    """Optionally remove the mean over axis 0 from every column.

    Args:
        tensor (Tensor): Input features.
        subtract_mean (bool): If False, the input is returned unchanged.

    Returns:
        Tensor: The input, with the mean over axis 0 subtracted if requested.
    """
    if not subtract_mean:
        return tensor
    return tensor - paddle.mean(tensor, axis=0).unsqueeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
def spectrogram(waveform: Tensor,
                blackman_coeff: float=0.42,
                channel: int=-1,
                dither: float=0.0,
                energy_floor: float=1.0,
                frame_length: float=25.0,
                frame_shift: float=10.0,
                preemphasis_coefficient: float=0.97,
                raw_energy: bool=True,
                remove_dc_offset: bool=True,
                round_to_power_of_two: bool=True,
                sr: int=16000,
                snip_edges: bool=True,
                subtract_mean: bool=False,
                window_type: str=POVEY) -> Tensor:
    """Compute a Kaldi-style log power spectrogram from a waveform.

    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of the waveform to use; negative values select channel 0.
            Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the energy of the output spectrogram. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): Whether energy is computed before pre-emphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad each frame to the next power of two
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop trailing samples that cannot fill a whole frame.
            Otherwise the waveform is padded with mirrored samples. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output.
            Defaults to False.
        window_type (str, optional): Type of window applied before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1), where m is the
            number of frames. Column 0 holds the frame log energy.
    """
    epsilon = _get_epsilon(waveform.dtype)

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # Complex spectrum: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames)

    # Log power, floored at epsilon: (m, padded_window_size // 2 + 1)
    squared_magnitude = spectrum.abs().pow(2.)
    power_spectrum = paddle.maximum(squared_magnitude, epsilon).log()
    # The first bin is overwritten with the frame log energy.
    power_spectrum[:, 0] = log_energy

    return _subtract_column_mean(power_spectrum, subtract_mean)
|
||||||
|
|
||||||
|
|
||||||
|
def _inverse_mel_scale_scalar(mel_freq: float) -> float:
    """Convert a single mel value back to a frequency in Hz."""
    growth = math.exp(mel_freq / 1127.0)
    return 700.0 * (growth - 1.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _inverse_mel_scale(mel_freq: Tensor) -> Tensor:
    """Convert a tensor of mel values back to frequencies in Hz."""
    scaled = mel_freq / 1127.0
    return 700.0 * (scaled.exp() - 1.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _mel_scale_scalar(freq: float) -> float:
    """Convert a single frequency in Hz to the mel scale."""
    ratio = 1.0 + freq / 700.0
    return 1127.0 * math.log(ratio)
|
||||||
|
|
||||||
|
|
||||||
|
def _mel_scale(freq: Tensor) -> Tensor:
    """Convert a tensor of frequencies in Hz to the mel scale."""
    ratio = 1.0 + freq / 700.0
    return 1127.0 * ratio.log()
|
||||||
|
|
||||||
|
|
||||||
|
def _vtln_warp_freq(vtln_low_cutoff: float,
                    vtln_high_cutoff: float,
                    low_freq: float,
                    high_freq: float,
                    vtln_warp_factor: float,
                    freq: Tensor) -> Tensor:
    """Apply the piecewise-linear VTLN warping function to frequencies.

    The warp has three linear segments joined at two knee points ``l`` and
    ``h``. Frequencies below ``low_freq`` or above ``high_freq`` are returned
    unchanged.

    Args:
        vtln_low_cutoff (float): Lower inflection point; must exceed `low_freq`.
        vtln_high_cutoff (float): Upper inflection point; must be below `high_freq`.
        low_freq (float): Lower cut-off frequency of the mel banks.
        high_freq (float): Upper cut-off frequency of the mel banks.
        vtln_warp_factor (float): VTLN warp factor.
        freq (Tensor): Frequencies to warp.

    Returns:
        Tensor: Warped frequencies, same shape as `freq`.
    """
    assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq'
    assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]'

    lower_knee = vtln_low_cutoff * max(1.0, vtln_warp_factor)
    upper_knee = vtln_high_cutoff * min(1.0, vtln_warp_factor)
    inv_warp = 1.0 / vtln_warp_factor
    warped_lower = inv_warp * lower_knee
    warped_upper = inv_warp * upper_knee
    assert lower_knee > low_freq and upper_knee < high_freq

    # Slopes of the outer segments, chosen so that the warp maps low_freq and
    # high_freq onto themselves.
    slope_left = (warped_lower - low_freq) / (lower_knee - low_freq)
    slope_right = (high_freq - warped_upper) / (high_freq - upper_knee)

    warped = paddle.empty_like(freq)

    below_low = paddle.less_than(freq, paddle.to_tensor(low_freq))
    above_high = paddle.greater_than(freq, paddle.to_tensor(high_freq))
    out_of_band = below_low | above_high
    in_left = paddle.less_than(freq, paddle.to_tensor(lower_knee))
    in_middle = paddle.less_than(freq, paddle.to_tensor(upper_knee))
    in_right = paddle.greater_equal(freq, paddle.to_tensor(upper_knee))

    # The masks overlap, so the order matters: later assignments override
    # earlier ones.
    warped[in_right] = high_freq + slope_right * (freq[in_right] - high_freq)
    warped[in_middle] = inv_warp * freq[in_middle]
    warped[in_left] = low_freq + slope_left * (freq[in_left] - low_freq)
    warped[out_of_band] = freq[out_of_band]

    return warped
|
||||||
|
|
||||||
|
|
||||||
|
def _vtln_warp_mel_freq(vtln_low_cutoff: float,
                        vtln_high_cutoff: float,
                        low_freq,
                        high_freq: float,
                        vtln_warp_factor: float,
                        mel_freq: Tensor) -> Tensor:
    """Apply VTLN warping to frequencies expressed on the mel scale.

    The mel values are converted to Hz, warped by `_vtln_warp_freq`, and
    converted back to mel.

    Args:
        vtln_low_cutoff (float): Lower inflection point of the warp.
        vtln_high_cutoff (float): Upper inflection point of the warp.
        low_freq: Lower cut-off frequency of the mel banks.
        high_freq (float): Upper cut-off frequency of the mel banks.
        vtln_warp_factor (float): VTLN warp factor.
        mel_freq (Tensor): Mel-scale frequencies to warp.

    Returns:
        Tensor: Warped mel-scale frequencies.
    """
    hertz = _inverse_mel_scale(mel_freq)
    warped_hertz = _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff,
                                   low_freq, high_freq, vtln_warp_factor,
                                   hertz)
    return _mel_scale(warped_hertz)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_mel_banks(num_bins: int,
                   window_length_padded: int,
                   sample_freq: float,
                   low_freq: float,
                   high_freq: float,
                   vtln_low: float,
                   vtln_high: float,
                   vtln_warp_factor: float) -> Tuple[Tensor, Tensor]:
    """Build the triangular mel filter bank.

    Args:
        num_bins (int): Number of mel bins; at least 3.
        window_length_padded (int): Padded frame length (must be even).
        sample_freq (float): Sample rate of the signal.
        low_freq (float): Lower cut-off frequency.
        high_freq (float): Upper cut-off frequency; if <= 0 it is taken
            relative to the Nyquist frequency.
        vtln_low (float): Lower inflection point of the VTLN warp.
        vtln_high (float): Upper inflection point of the VTLN warp; if
            negative it is taken relative to the Nyquist frequency.
        vtln_warp_factor (float): VTLN warp factor; 1.0 disables warping.

    Returns:
        Tuple[Tensor, Tensor]: The filter weights with shape
            (num_bins, window_length_padded / 2) and the center frequency of
            each bin in Hz.
    """
    # The message has always promised "at least 3", so exactly 3 bins must be
    # accepted as well.
    assert num_bins >= 3, 'Must have at least 3 mel bins'
    assert window_length_padded % 2 == 0
    # NOTE(review): true division yields a float, so paddle.arange(num_fft_bins)
    # below presumably returns a floating-point tensor; confirm before
    # switching to integer division.
    num_fft_bins = window_length_padded / 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \
        ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist))

    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = _mel_scale_scalar(low_freq)
    mel_high_freq = _mel_scale_scalar(high_freq)

    # Spacing between the edges of neighbouring triangles on the mel scale.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and
                                       (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \
        ('Bad values in options: vtln-low {} and vtln-high {}, versus '
         'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq))

    bin_idx = paddle.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin_idx * mel_freq_delta  # (num_bins, 1)
    center_mel = mel_low_freq + (bin_idx + 1.0) * mel_freq_delta  # (num_bins, 1)
    right_mel = mel_low_freq + (bin_idx + 2.0) * mel_freq_delta  # (num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq,
                                       vtln_warp_factor, left_mel)
        center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                         high_freq, vtln_warp_factor,
                                         center_mel)
        right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq,
                                        high_freq, vtln_warp_factor, right_mel)

    center_freqs = _inverse_mel_scale(center_mel)
    # (1, num_fft_bins)
    mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0)

    # (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # Without warping the triangle is min(up, down) clipped at zero.
        bins = paddle.maximum(
            paddle.zeros([1]), paddle.minimum(up_slope, down_slope))
    else:
        # With warping, fill the rising and falling sides separately.
        bins = paddle.zeros_like(up_slope)
        up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than(
            mel, center_mel)
        down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than(
            mel, right_mel)
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
|
||||||
|
|
||||||
|
|
||||||
|
def fbank(waveform: Tensor,
          blackman_coeff: float=0.42,
          channel: int=-1,
          dither: float=0.0,
          energy_floor: float=1.0,
          frame_length: float=25.0,
          frame_shift: float=10.0,
          high_freq: float=0.0,
          htk_compat: bool=False,
          low_freq: float=20.0,
          n_mels: int=23,
          preemphasis_coefficient: float=0.97,
          raw_energy: bool=True,
          remove_dc_offset: bool=True,
          round_to_power_of_two: bool=True,
          sr: int=16000,
          snip_edges: bool=True,
          subtract_mean: bool=False,
          use_energy: bool=False,
          use_log_fbank: bool=True,
          use_power: bool=True,
          vtln_high: float=-500.0,
          vtln_low: float=100.0,
          vtln_warp: float=1.0,
          window_type: str=POVEY) -> Tensor:
    """Compute Kaldi-style mel filter bank features from a waveform.

    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        channel (int, optional): Channel of the waveform to use; negative values select channel 0.
            Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the frame energy. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency; values <= 0 are relative to the
            Nyquist frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True, the energy column is placed last instead of first.
            Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mels (int, optional): Number of output mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): Whether energy is computed before pre-emphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad each frame to the next power of two
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop trailing samples that cannot fill a whole frame.
            Otherwise the waveform is padded with mirrored samples. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output.
            Defaults to False.
        use_energy (bool, optional): Add a column with the frame log energy to the output.
            Defaults to False.
        use_log_fbank (bool, optional): Return log filter bank energies. Defaults to True.
        use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True.
        vtln_high (float, optional): High inflection point of the piecewise-linear VTLN warp.
            Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise-linear VTLN warp.
            Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Type of window applied before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A filter bank tensor with shape (m, n_mels), or (m, n_mels + 1) when
            `use_energy` is True.
    """
    dtype = waveform.dtype

    (waveform, window_shift, window_size,
     padded_window_size) = _get_waveform_and_window_properties(
         waveform, channel, sr, frame_shift, frame_length,
         round_to_power_of_two, preemphasis_coefficient)

    frames, log_energy = _get_window(
        waveform, padded_window_size, window_size, window_shift, window_type,
        blackman_coeff, snip_edges, raw_energy, energy_floor, dither,
        remove_dc_offset, preemphasis_coefficient)

    # Magnitude (or power) spectrum: (m, padded_window_size // 2 + 1)
    spectrum = paddle.fft.rfft(frames).abs()
    if use_power:
        spectrum = spectrum.pow(2.)

    # Filter weights: (n_mels, padded_window_size // 2)
    mel_filters, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq,
                                    high_freq, vtln_low, vtln_high, vtln_warp)
    mel_filters = mel_filters.astype(dtype)

    # Append a zero column so the filters line up with the spectrum:
    # (n_mels, padded_window_size // 2 + 1)
    mel_filters = paddle.nn.functional.pad(
        mel_filters.unsqueeze(0), (0, 1),
        data_format='NCL',
        mode='constant',
        value=0).squeeze(0)

    # (m, n_mels)
    features = paddle.mm(spectrum, mel_filters.T)
    if use_log_fbank:
        features = paddle.maximum(features, _get_epsilon(dtype)).log()

    if use_energy:
        # (m, n_mels + 1): energy goes last for HTK layout, first otherwise.
        energy_column = log_energy.unsqueeze(1)
        if htk_compat:
            pieces = (features, energy_column)
        else:
            pieces = (energy_column, features)
        features = paddle.concat(pieces, axis=1)

    return _subtract_column_mean(features, subtract_mean)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor:
    """Build the DCT matrix that maps mel energies to cepstral coefficients.

    Args:
        n_mfcc (int): Number of cepstral coefficients to keep.
        n_mels (int): Number of mel bins.

    Returns:
        Tensor: A matrix with shape (n_mels, n_mfcc).
    """
    full_dct = create_dct(n_mels, n_mels, 'ortho')
    # Column 0 is overwritten with the constant sqrt(1 / n_mels).
    full_dct[:, 0] = math.sqrt(1 / float(n_mels))
    return full_dct[:, :n_mfcc]  # (n_mels, n_mfcc)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor:
    """Compute the cepstral liftering weights.

    Args:
        n_mfcc (int): Number of cepstral coefficients.
        cepstral_lifter (float): Liftering constant; must be non-zero.

    Returns:
        Tensor: `n_mfcc` weights, 1 + 0.5 * L * sin(pi * i / L) for index i.
    """
    indices = paddle.arange(n_mfcc)
    sine = paddle.sin(math.pi * indices / cepstral_lifter)
    return 1.0 + 0.5 * cepstral_lifter * sine
|
||||||
|
|
||||||
|
|
||||||
|
def mfcc(waveform: Tensor,
         blackman_coeff: float=0.42,
         cepstral_lifter: float=22.0,
         channel: int=-1,
         dither: float=0.0,
         energy_floor: float=1.0,
         frame_length: float=25.0,
         frame_shift: float=10.0,
         high_freq: float=0.0,
         htk_compat: bool=False,
         low_freq: float=20.0,
         n_mfcc: int=13,
         n_mels: int=23,
         preemphasis_coefficient: float=0.97,
         raw_energy: bool=True,
         remove_dc_offset: bool=True,
         round_to_power_of_two: bool=True,
         sr: int=16000,
         snip_edges: bool=True,
         subtract_mean: bool=False,
         use_energy: bool=False,
         vtln_high: float=-500.0,
         vtln_low: float=100.0,
         vtln_warp: float=1.0,
         window_type: str=POVEY) -> Tensor:
    """Compute Kaldi-style mel frequency cepstral coefficients from a waveform.

    Args:
        waveform (Tensor): A waveform tensor with shape [C, T].
        blackman_coeff (float, optional): Coefficient for the Blackman window. Defaults to 0.42.
        cepstral_lifter (float, optional): Liftering constant that scales the output MFCCs;
            0.0 disables liftering. Defaults to 22.0.
        channel (int, optional): Channel of the waveform to use; negative values select channel 0.
            Defaults to -1.
        dither (float, optional): Dithering constant. Defaults to 0.0.
        energy_floor (float, optional): Floor on the frame energy. Defaults to 1.0.
        frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0.
        frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0.
        high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0.
        htk_compat (bool, optional): If True, the energy column is placed last instead of first.
            Defaults to False.
        low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0.
        n_mfcc (int, optional): Number of cepstra in the output; must not exceed `n_mels`.
            Defaults to 13.
        n_mels (int, optional): Number of mel bins. Defaults to 23.
        preemphasis_coefficient (float, optional): Pre-emphasis coefficient. Defaults to 0.97.
        raw_energy (bool, optional): Whether energy is computed before pre-emphasis and windowing.
            Defaults to True.
        remove_dc_offset (bool, optional): Whether to subtract the mean of each frame. Defaults to True.
        round_to_power_of_two (bool, optional): If True, zero-pad each frame to the next power of two
            before the FFT. Defaults to True.
        sr (int, optional): Sample rate of the input waveform. Defaults to 16000.
        snip_edges (bool, optional): If True, drop trailing samples that cannot fill a whole frame.
            Otherwise the waveform is padded with mirrored samples. Defaults to True.
        subtract_mean (bool, optional): Whether to subtract the per-column mean of the output.
            Defaults to False.
        use_energy (bool, optional): Replace the first coefficient with the frame log energy.
            Defaults to False.
        vtln_high (float, optional): High inflection point of the piecewise-linear VTLN warp.
            Defaults to -500.0.
        vtln_low (float, optional): Low inflection point of the piecewise-linear VTLN warp.
            Defaults to 100.0.
        vtln_warp (float, optional): VTLN warp factor. Defaults to 1.0.
        window_type (str, optional): Type of window applied before the FFT. Defaults to POVEY.

    Returns:
        Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc).
    """
    assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
        n_mfcc, n_mels)

    dtype = waveform.dtype

    # Log mel energies computed on the power spectrum; mean subtraction is
    # deferred to the end of this function: (m, n_mels + use_energy)
    feature = fbank(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        n_mels=n_mels,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sr=sr,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type)

    if use_energy:
        # Split the energy column (last for HTK layout, first otherwise) from
        # the mel energies.
        energy_col = n_mels if htk_compat else 0
        first_mel = 0 if htk_compat else 1
        signal_log_energy = feature[:, energy_col]  # (m)
        feature = feature[:, first_mel:(n_mels + first_mel)]

    # (n_mels, n_mfcc)
    dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype)

    # (m, n_mfcc)
    feature = feature.matmul(dct_matrix)

    if cepstral_lifter != 0.0:
        # (1, n_mfcc)
        lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0)
        feature *= lifter_coeffs.astype(dtype=dtype)

    if use_energy:
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # Move the first coefficient to the last column.
        energy = feature[:, 0].unsqueeze(1)  # (m, 1)
        feature = feature[:, 1:]  # (m, n_mfcc - 1)
        if not use_energy:
            energy *= math.sqrt(2)
        feature = paddle.concat((feature, energy), axis=1)

    return _subtract_column_mean(feature, subtract_mean)
|
@ -0,0 +1,344 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from functools import partial
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import paddle.nn as nn
|
||||||
|
|
||||||
|
from ..functional import compute_fbank_matrix
|
||||||
|
from ..functional import create_dct
|
||||||
|
from ..functional import power_to_db
|
||||||
|
from ..functional.window import get_window
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'Spectrogram',
|
||||||
|
'MelSpectrogram',
|
||||||
|
'LogMelSpectrogram',
|
||||||
|
'MFCC',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class Spectrogram(nn.Layer):
    def __init__(self,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
        """Compute the spectrogram of a given signal, typically an audio waveform.

        The spectrogram is the squared magnitude of the short-time Fourier
        transform, which is computed by `paddle.signal.stft`.

        Parameters:
            n_fft (int): the number of frequency components of the discrete Fourier transform.
                The default value is 512.
            hop_length (int|None): the hop length of the short-time FFT. If None, the default of
                `paddle.signal.stft` is used. The default value is None.
            win_length (int|None): the window length of the short-time FFT. If None, it is set to
                `n_fft`. The default value is None.
            window (str): the name of the window function applied to the signal before the Fourier
                transform; it is passed to `get_window`. The original documentation lists
                'hamming','hann','kaiser','gaussian','exponential','triang','bohman','blackman',
                'cosine','tukey','taylor' as supported names. The default value is 'hann'.
            center (bool): if True, the signal is padded so that frame t is centered at
                x[t * hop_length]. If False, frame t begins at x[t * hop_length].
                The default value is True.
            pad_mode (str): the mode used to pad the signal if necessary. The supported modes are
                'reflect' and 'constant'. The default value is 'reflect'.
            dtype (str): the data type of the input and the window.
        """
        super().__init__()

        if win_length is None:
            win_length = n_fft

        # Assign first, then register under the same name, so the window ends
        # up as a registered buffer of the layer.
        self.fft_window = get_window(
            window, win_length, fftbins=True, dtype=dtype)
        stft_options = dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.fft_window,
            center=center,
            pad_mode=pad_mode)
        self._stft = partial(paddle.signal.stft, **stft_options)
        self.register_buffer('fft_window', self.fft_window)

    def forward(self, x):
        """Return the squared magnitude of the STFT of `x`."""
        magnitude = paddle.abs(self._stft(x))
        return paddle.square(magnitude)
|
||||||
|
|
||||||
|
|
||||||
|
class MelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 dtype: str=paddle.float32):
        """Mel spectrogram layer (a.k.a. filterbank / fbank feature).

        The wrapped ``Spectrogram`` layer produces a power spectrogram which is
        then multiplied by a mel filter bank built with ``compute_fbank_matrix``.

        Parameters:
            sr (int): the audio sample rate.
                The default value is 22050.
            n_fft (int): the number of frequency components of the discrete
                Fourier transform. The default value is 512.
            hop_length (int|None): the hop length of the short time FFT,
                forwarded to ``Spectrogram``. The default value is None.
            win_length (int|None): the window length of the short time FFT,
                forwarded to ``Spectrogram``. The default value is None.
            window (str): the name of the window function applied to the signal
                before the Fourier transform. The default value is 'hann'.
            center (bool): forwarded to ``Spectrogram``; selects whether frames
                are centered on, or start at, multiples of the hop length.
                The default value is True.
            pad_mode (str): the padding mode forwarded to ``Spectrogram``.
                The default value is 'reflect'.
            n_mels (int): the number of mel bins. The default value is 64.
            f_min (float): the lower cut-off frequency of the filter bank.
                The default value is 50.0.
            f_max (float|None): the upper cut-off frequency of the filter bank.
                When None, ``sr // 2`` is used to build the filter bank.
            htk (bool): whether to use the HTK formula for the mel scale.
            norm (str|float): normalization of the filter bank; 'slaney' by
                default, or a number selecting a p-norm.
            dtype (str): data type handed to both ``Spectrogram`` and
                ``compute_fbank_matrix``.

        Notes:
            ``self.f_max`` keeps the value given by the caller (possibly None);
            the resolved cut-off is only used while building the filter bank.
        """
        super().__init__()

        # Sub-layer computing the power spectrogram.
        self._spectrogram = Spectrogram(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)

        # Plain configuration attributes, stored exactly as received.
        self.n_mels, self.f_min, self.f_max = n_mels, f_min, f_max
        self.htk, self.norm = htk, norm

        # Fall back to the (integer) Nyquist frequency when no upper limit is given.
        upper_hz = sr // 2 if f_max is None else f_max
        filter_cfg = dict(
            sr=sr,
            n_fft=n_fft,
            n_mels=n_mels,
            f_min=f_min,
            f_max=upper_hz,
            htk=htk,
            norm=norm)
        self.fbank_matrix = compute_fbank_matrix(dtype=dtype, **filter_cfg)
        self.register_buffer('fbank_matrix', self.fbank_matrix)

    def forward(self, x):
        """Return the mel spectrogram of ``x``.

        NOTE(review): the matmul expects the spectrogram's second-to-last axis
        to be the frequency axis matching the filter bank columns - confirm
        against ``Spectrogram``.
        """
        return paddle.matmul(self.fbank_matrix, self._spectrogram(x))
|
||||||
|
|
||||||
|
|
||||||
|
class LogMelSpectrogram(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Log-mel spectrogram layer (a.k.a. LogFBank feature).

        A ``MelSpectrogram`` sub-layer computes the mel spectrogram, which is
        then converted to decibels with ``power_to_db``.

        Parameters:
            sr (int): the audio sample rate.
                The default value is 22050.
            n_fft (int): the number of frequency components of the discrete
                Fourier transform. The default value is 512.
            hop_length (int|None): the hop length of the short time FFT.
                The default value is None.
            win_length (int|None): the window length of the short time FFT.
                The default value is None.
            window (str): the name of the window function applied to the signal
                before the Fourier transform. The default value is 'hann'.
            center (bool): whether frames are centered on, or start at,
                multiples of the hop length. The default value is True.
            pad_mode (str): the padding mode of the signal.
                The default value is 'reflect'.
            n_mels (int): the number of mel bins.
            f_min (float): the lower cut-off frequency of the filter bank.
            f_max (float|None): the upper cut-off frequency of the filter bank.
            htk (bool): whether to use the HTK formula for the mel scale.
            norm (str|float): normalization of the filter bank; 'slaney' by
                default, or a number selecting a p-norm.
            ref_value (float): the reference value passed to ``power_to_db``.
                If smaller than 1.0, the db level of the signal is pulled up
                accordingly. Otherwise, the db level is pushed down.
            amin (float): the minimum value of input magnitude passed to
                ``power_to_db``, below which the input magnitude is clipped
                (to amin). For numerical stability, set amin to a larger
                value, e.g., 1e-3.
            top_db (float|None): the dynamic-range limit passed to
                ``power_to_db``. The default value is None.
            dtype (str): data type forwarded to ``MelSpectrogram``.
        """
        super().__init__()

        # All spectral parameters are forwarded untouched to the mel layer.
        mel_kwargs = dict(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            dtype=dtype)
        self._melspectrogram = MelSpectrogram(**mel_kwargs)

        # Parameters of the power -> decibel conversion done in forward().
        self.ref_value, self.amin, self.top_db = ref_value, amin, top_db

    def forward(self, x):
        """Return the mel spectrogram of ``x`` converted to decibels."""
        return power_to_db(
            self._melspectrogram(x),
            ref_value=self.ref_value,
            amin=self.amin,
            top_db=self.top_db)
|
||||||
|
|
||||||
|
|
||||||
|
class MFCC(nn.Layer):
    def __init__(self,
                 sr: int=22050,
                 n_mfcc: int=40,
                 n_fft: int=512,
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
                 f_min: float=50.0,
                 f_max: Optional[float]=None,
                 htk: bool=False,
                 norm: Union[str, float]='slaney',
                 ref_value: float=1.0,
                 amin: float=1e-10,
                 top_db: Optional[float]=None,
                 dtype: str=paddle.float32):
        """Mel frequency cepstral coefficients (MFCC) layer.

        A ``LogMelSpectrogram`` sub-layer produces log-mel features, which are
        projected onto a DCT basis created with ``create_dct``.

        Parameters:
            sr (int): the audio sample rate.
                The default value is 22050.
            n_mfcc (int, optional): number of cepstra in MFCC; must not exceed
                ``n_mels``. Defaults to 40.
            n_fft (int): the number of frequency components of the discrete
                Fourier transform. The default value is 512.
            hop_length (int|None): the hop length of the short time FFT.
                The default value is None.
            win_length (int|None): the window length of the short time FFT.
                The default value is None.
            window (str): the name of the window function applied to the signal
                before the Fourier transform. The default value is 'hann'.
            center (bool): whether frames are centered on, or start at,
                multiples of the hop length. The default value is True.
            pad_mode (str): the padding mode of the signal.
                The default value is 'reflect'.
            n_mels (int): the number of mel bins.
            f_min (float): the lower cut-off frequency of the filter bank.
            f_max (float|None): the upper cut-off frequency of the filter bank.
            htk (bool): whether to use the HTK formula for the mel scale.
            norm (str|float): normalization of the filter bank; 'slaney' by
                default, or a number selecting a p-norm.
            ref_value (float): the reference value of the decibel conversion.
                If smaller than 1.0, the db level of the signal is pulled up
                accordingly. Otherwise, the db level is pushed down.
            amin (float): the minimum value of input magnitude, below which the
                input magnitude is clipped (to amin). For numerical stability,
                set amin to a larger value, e.g., 1e-3.
            top_db (float|None): the dynamic-range limit of the decibel
                conversion. The default value is None.
            dtype (str): data type forwarded to ``LogMelSpectrogram`` and
                ``create_dct``.

        Raises:
            AssertionError: if ``n_mfcc`` is larger than ``n_mels``.
        """
        super().__init__()
        assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % (
            n_mfcc, n_mels)

        log_mel_kwargs = dict(
            sr=sr,
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
            f_min=f_min,
            f_max=f_max,
            htk=htk,
            norm=norm,
            ref_value=ref_value,
            amin=amin,
            top_db=top_db,
            dtype=dtype)
        self._log_melspectrogram = LogMelSpectrogram(**log_mel_kwargs)

        # DCT basis used to turn log-mel bands into cepstral coefficients.
        self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype)
        self.register_buffer('dct_matrix', self.dct_matrix)

    def forward(self, x):
        """Return the MFCCs of ``x``.

        The log-mel features are treated as a rank-3 tensor whose last two
        axes are swapped before and after the DCT projection, so the
        coefficient axis ends up where the mel axis was.
        """
        swap_last_two = (0, 2, 1)
        log_mel = self._log_melspectrogram(x)
        # presumably (B, n_mels, L) -> (B, L, n_mels) @ dct -> (B, L, n_mfcc)
        projected = paddle.matmul(
            log_mel.transpose(swap_last_two), self.dct_matrix)
        return projected.transpose(swap_last_two)  # presumably (B, n_mfcc, L)
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .functional import compute_fbank_matrix
|
||||||
|
from .functional import create_dct
|
||||||
|
from .functional import fft_frequencies
|
||||||
|
from .functional import hz_to_mel
|
||||||
|
from .functional import mel_frequencies
|
||||||
|
from .functional import mel_to_hz
|
||||||
|
from .functional import power_to_db
|
@ -0,0 +1,265 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from librosa(https://github.com/librosa/librosa)
|
||||||
|
import math
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'hz_to_mel',
|
||||||
|
'mel_to_hz',
|
||||||
|
'mel_frequencies',
|
||||||
|
'fft_frequencies',
|
||||||
|
'compute_fbank_matrix',
|
||||||
|
'power_to_db',
|
||||||
|
'create_dct',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def hz_to_mel(freq: Union[paddle.Tensor, float],
              htk: bool=False) -> Union[paddle.Tensor, float]:
    """Convert frequencies from Hz to the mel scale.

    Parameters:
        freq: a tensor of arbitrary shape, or a single number, in Hz.
        htk: use the HTK formula ``2595 * log10(1 + f / 700)``. Otherwise a
            piecewise scale is used: linear up to 1000 Hz, logarithmic above.
            The default value is False.
    Returns:
        The frequencies represented in mel scale, with the same kind
        (tensor or number) as the input.
    """
    is_tensor = isinstance(freq, paddle.Tensor)

    if htk:
        log10 = paddle.log10 if is_tensor else math.log10
        return 2595.0 * log10(1.0 + freq / 700.0)

    # Constants of the piecewise (non-HTK) mel scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below the log region.
    linear = (freq - f_min) / f_sp

    if not is_tensor:
        if freq >= min_log_hz:
            return min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
        return linear

    # Tensor input: compute both branches and blend them with a 0/1 mask.
    # The 1e-10 offset keeps the logarithm finite when freq is 0.
    log_part = min_log_mel + paddle.log(
        freq / min_log_hz + 1e-10) / logstep
    in_log_region = (freq > min_log_hz).astype(freq.dtype)
    return log_part * in_log_region + linear * (1 - in_log_region)
|
||||||
|
|
||||||
|
|
||||||
|
def mel_to_hz(mel: Union[float, paddle.Tensor],
              htk: bool=False) -> Union[float, paddle.Tensor]:
    """Convert mel-scale values back to frequencies in Hz.

    Parameters:
        mel: a tensor of arbitrary shape, or a single number, in mels.
        htk: use the HTK formula ``700 * (10 ** (m / 2595) - 1)``. Otherwise
            the piecewise scale is inverted: linear below the mel value that
            corresponds to 1000 Hz, exponential above.
    Returns:
        The frequencies represented in Hz, with the same kind
        (tensor or number) as the input.
    """
    if htk:
        return 700.0 * (10.0**(mel / 2595.0) - 1.0)

    # Constants of the piecewise (non-HTK) mel scale.
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    logstep = math.log(6.4) / 27.0  # step size for log region

    # Linear part, valid below the log region.
    linear = f_min + f_sp * mel

    if not isinstance(mel, paddle.Tensor):
        if mel >= min_log_mel:
            return min_log_hz * math.exp(logstep * (mel - min_log_mel))
        return linear

    # Tensor input: compute both branches and blend them with a 0/1 mask.
    exp_part = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
    in_log_region = (mel > min_log_mel).astype(mel.dtype)
    return exp_part * in_log_region + linear * (1 - in_log_region)
|
||||||
|
|
||||||
|
|
||||||
|
def mel_frequencies(n_mels: int=64,
                    f_min: float=0.0,
                    f_max: float=11025.0,
                    htk: bool=False,
                    dtype: str=paddle.float32):
    """Compute frequencies that are evenly spaced on the mel scale.

    Parameters:
        n_mels(int): number of points to generate.
        f_min(float): the lowest frequency, in Hz.
        f_max(float): the highest frequency, in Hz.
        htk(bool): whether to use htk formula.
        dtype(str): the datatype of the return frequencies.
    Returns:
        A tensor of ``n_mels`` frequencies in Hz: the end points are converted
        to mels, interpolated linearly there, and converted back to Hz.
    """
    mel_lo = hz_to_mel(f_min, htk=htk)
    mel_hi = hz_to_mel(f_max, htk=htk)
    # Uniform grid in the mel domain between the two limits.
    mel_grid = paddle.linspace(mel_lo, mel_hi, n_mels, dtype=dtype)
    return mel_to_hz(mel_grid, htk=htk)
|
||||||
|
|
||||||
|
|
||||||
|
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
    """Compute the center frequency of every non-negative FFT bin.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the FFT size.
        dtype(str): the datatype of the return frequencies.
    Returns:
        A tensor of ``1 + n_fft // 2`` frequencies in Hz, evenly spaced from 0
        to ``sr / 2`` inclusive.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return paddle.linspace(0, nyquist, n_bins, dtype=dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=64,
                         f_min: float=0.0,
                         f_max: Optional[float]=None,
                         htk: bool=False,
                         norm: Union[str, float]='slaney',
                         dtype: str=paddle.float32):
    """Compute a mel filter bank (fbank) matrix of triangular filters.

    Parameters:
        sr(int): the audio sample rate.
        n_fft(int): the number of fft bins.
        n_mels(int): the number of Mel bins.
        f_min(float): the lower cut-off frequency, below which the filter response is zero.
        f_max(float): the upper cut-off frequency, above which the filter response is zero.
            When None, ``sr / 2`` is used.
        htk: whether to use htk formula.
        norm(str|float): 'slaney' scales every filter by ``2 / bandwidth``; a
            number selects p-norm normalization of every filter; any other
            value leaves the filters unnormalized.
        dtype(str): the datatype of the returned fbank matrix.
    Returns:
        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
    Shape:
        output: (n_mels, int(1+n_fft//2))
    """
    if f_max is None:
        f_max = float(sr) / 2

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)

    # 'Center freqs' of mel bands - uniformly spaced between limits.
    # Two extra points provide the outer edges of the first and last filter.
    mel_f = mel_frequencies(
        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)

    fdiff = mel_f[1:] - mel_f[:-1]  # width between neighbouring band edges
    # ramps[j, k] = mel_f[j] - fftfreqs[k]
    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)

    # Rising and falling slopes of all filters at once: filter i rises from
    # edge i to edge i + 1 and falls from edge i + 1 to edge i + 2.
    lower = -ramps[:-2] / fdiff[:-1].unsqueeze(1)
    upper = ramps[2:] / fdiff[1:].unsqueeze(1)

    # .. then intersect them with each other and zero
    weights = paddle.maximum(
        paddle.zeros_like(lower), paddle.minimum(lower, upper))

    if norm == 'slaney':
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights = weights * enorm.unsqueeze(1)
    elif isinstance(norm, (int, float)):
        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)

    return weights
|
||||||
|
|
||||||
|
|
||||||
|
def power_to_db(magnitude: paddle.Tensor,
                ref_value: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=None) -> paddle.Tensor:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
    stable way.

    Parameters:
        magnitude(Tensor): the input magnitude tensor of any shape.
        ref_value(float): the reference value. If smaller than 1.0, the db level
            of the signal will be pulled up accordingly. Otherwise, the db level
            is pushed down.
        amin(float): the minimum value of input magnitude, below which the input
            magnitude is clipped(to amin). It also acts as a lower bound for
            ``ref_value`` in the reference term.
        top_db(float|None): the allowed dynamic range in dB. When given, values
            lower than ``max(output) - top_db`` are raised to that floor.
    Returns:
        The spectrogram in log-scale.
    Raises:
        ValueError: if ``amin`` or ``ref_value`` is not strictly positive, or
            if ``top_db`` is negative.
    shape:
        input: any shape
        output: same as input
    """
    # Validate every argument before doing any tensor work.
    if amin <= 0:
        raise ValueError("amin must be strictly positive")

    if ref_value <= 0:
        raise ValueError("ref_value must be strictly positive")

    if top_db is not None and top_db < 0:
        raise ValueError("top_db must be non-negative")

    ones = paddle.ones_like(magnitude)
    # Clamp from below so the logarithm never sees zero or negative input.
    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
    log_spec -= 10.0 * math.log10(max(ref_value, amin))

    if top_db is not None:
        # Limit the dynamic range relative to the loudest value.
        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))

    return log_spec
|
||||||
|
|
||||||
|
|
||||||
|
def create_dct(n_mfcc: int,
               n_mels: int,
               norm: Optional[str]='ortho',
               dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
    """Build a discrete cosine transform (DCT) matrix.

    Parameters:
        n_mfcc (int): Number of mel frequency cepstral coefficients.
        n_mels (int): Number of mel filterbanks.
        norm (str, optional): Normalization type, either 'ortho' or None.
            Defaults to 'ortho'.
        dtype (str, optional): Data type of the index tensors the matrix is
            computed from. Defaults to paddle.float32.
    Returns:
        Tensor: The DCT matrix with shape (n_mels, n_mfcc).
    Raises:
        AssertionError: if ``norm`` is neither None nor 'ortho'.
    """
    band_idx = paddle.arange(n_mels, dtype=dtype)
    coef_idx = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
    # basis[k, n] = cos(pi / n_mels * (n + 0.5) * k), size (n_mfcc, n_mels)
    basis = paddle.cos(math.pi / float(n_mels) * (band_idx + 0.5) * coef_idx)

    if norm is None:
        basis *= 2.0
        return basis.T

    assert norm == "ortho"
    # The first (constant) row gets an extra 1/sqrt(2) factor, then every row
    # is scaled by sqrt(2 / n_mels).
    basis[0] *= 1.0 / math.sqrt(2.0)
    basis *= math.sqrt(2.0 / float(n_mels))
    return basis.T
|
@ -0,0 +1,42 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import numpy as np
|
||||||
|
from dtaidistance import dtw_ndim
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'dtw_distance',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
    """Dynamic Time Warping (DTW) distance between two sequences.

    Thin wrapper that delegates to ``dtaidistance.dtw_ndim.distance``.

    NOTE(review): per the library's documentation the distance is presumably
    computed with a compact dynamic-programming matrix following

        wps[i, j] = (s1[i]-s2[j])**2 + min(
                        wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
                        wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
                        wps[i-1, j-1])            // diagonal   / match
        dtw = sqrt(wps[-1, -1])

    - confirm against the installed dtaidistance version.

    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]

    Returns:
        float: dtw distance
    """
    distance = dtw_ndim.distance(xs, ys)
    return distance
|
@ -0,0 +1,48 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import mcd.metrics_fast as mt
|
||||||
|
import numpy as np
|
||||||
|
from mcd import dtw
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'mcd_distance',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist):
    """Mel cepstral distortion (MCD) accumulated along a DTW alignment.

    Thin wrapper around ``mcd.dtw.dtw``: the two sequences are aligned with
    dynamic time warping using ``cost_fn`` as the frame-to-frame cost, and the
    minimum total cost is returned (the alignment path is discarded).

    NOTE(review): the recurrence is presumably

        wps[i, j] = cost_fn(xs[i], ys[j]) + min(
                        wps[i-1, j  ],  // vertical   / insertion / expansion
                        wps[i  , j-1],  // horizontal / deletion  / compression
                        wps[i-1, j-1])  // diagonal   / match

    - confirm against the installed mcd package.

    Cost Function (default, as documented by the mcd package):
        logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
        def logSpecDbDist(x, y):
            diff = x - y
            return logSpecDbConst * math.sqrt(np.inner(diff, diff))

    Args:
        xs (np.ndarray): ref sequence, [T,D]
        ys (np.ndarray): hyp sequence, [T,D]
        cost_fn: callable giving the cost between one frame of ``xs`` and one
            frame of ``ys``.

    Returns:
        float: dtw distance
    """
    min_cost, _path = dtw.dtw(xs, ys, cost_fn)
    return min_cost
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,25 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .download import decompress
|
||||||
|
from .download import download_and_decompress
|
||||||
|
from .download import load_state_dict_from_url
|
||||||
|
from .env import DATA_HOME
|
||||||
|
from .env import MODEL_HOME
|
||||||
|
from .env import PPAUDIO_HOME
|
||||||
|
from .env import USER_HOME
|
||||||
|
from .error import ParameterError
|
||||||
|
from .log import Logger
|
||||||
|
from .log import logger
|
||||||
|
from .time import seconds_to_hms
|
||||||
|
from .time import Timer
|
Loading…
Reference in new issue