Feature alignment.

pull/1548/head
KP 3 years ago
parent 67dcff2f3f
commit f657ee64dc

@ -37,6 +37,7 @@ class Spectrogram(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
dtype: str=paddle.float32): dtype: str=paddle.float32):
@ -54,6 +55,7 @@ class Spectrogram(nn.Layer):
The following window names are supported: 'hamming','hann','kaiser','gaussian', The following window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. 'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann' The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length] If False, frame t begins at x[t * hop_length]
The default value is True The default value is True
@ -68,6 +70,9 @@ class Spectrogram(nn.Layer):
""" """
super(Spectrogram, self).__init__() super(Spectrogram, self).__init__()
assert power > 0, 'Power of spectrogram must be > 0.'
self.power = power
if win_length is None: if win_length is None:
win_length = n_fft win_length = n_fft
@ -85,7 +90,7 @@ class Spectrogram(nn.Layer):
def forward(self, x): def forward(self, x):
stft = self._stft(x) stft = self._stft(x)
spectrogram = paddle.square(paddle.abs(stft)) spectrogram = paddle.pow(paddle.abs(stft), self.power)
return spectrogram return spectrogram
@ -96,6 +101,7 @@ class MelSpectrogram(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
n_mels: int=64, n_mels: int=64,
@ -120,6 +126,7 @@ class MelSpectrogram(nn.Layer):
The following window names are supported: 'hamming','hann','kaiser','gaussian', The following window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. 'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann' The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length] If False, frame t begins at x[t * hop_length]
The default value is True The default value is True
@ -142,6 +149,7 @@ class MelSpectrogram(nn.Layer):
hop_length=hop_length, hop_length=hop_length,
win_length=win_length, win_length=win_length,
window=window, window=window,
power=power,
center=center, center=center,
pad_mode=pad_mode, pad_mode=pad_mode,
dtype=dtype) dtype=dtype)
@ -176,6 +184,7 @@ class LogMelSpectrogram(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
n_mels: int=64, n_mels: int=64,
@ -214,11 +223,8 @@ class LogMelSpectrogram(nn.Layer):
htk (bool): whether to use HTK formula in computing fbank matrix. htk (bool): whether to use HTK formula in computing fbank matrix.
norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization. You can specify norm=1.0/2.0 to use customized p-norm normalization.
ref_value (float): the reference value. If smaller than 1.0, the db level ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
of the signal will be pulled up accordingly. amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
Otherwise, the db level is pushed down.
amin (float): the minimum value of input magnitude, below which the input
magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
e.g., 1e-3.
top_db (float): the maximum db value of resulting spectrum, above which the top_db (float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db). spectrum is clipped(to top_db).
dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
@ -232,6 +238,7 @@ class LogMelSpectrogram(nn.Layer):
hop_length=hop_length, hop_length=hop_length,
win_length=win_length, win_length=win_length,
window=window, window=window,
power=power,
center=center, center=center,
pad_mode=pad_mode, pad_mode=pad_mode,
n_mels=n_mels, n_mels=n_mels,
@ -246,7 +253,6 @@ class LogMelSpectrogram(nn.Layer):
self.top_db = top_db self.top_db = top_db
def forward(self, x): def forward(self, x):
# import ipdb; ipdb.set_trace()
mel_feature = self._melspectrogram(x) mel_feature = self._melspectrogram(x)
log_mel_feature = power_to_db( log_mel_feature = power_to_db(
mel_feature, mel_feature,
@ -264,6 +270,7 @@ class MFCC(nn.Layer):
hop_length: Optional[int]=None, hop_length: Optional[int]=None,
win_length: Optional[int]=None, win_length: Optional[int]=None,
window: str='hann', window: str='hann',
power: float=2.0,
center: bool=True, center: bool=True,
pad_mode: str='reflect', pad_mode: str='reflect',
n_mels: int=64, n_mels: int=64,
@ -291,6 +298,7 @@ class MFCC(nn.Layer):
The following window names are supported: 'hamming','hann','kaiser','gaussian', The following window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'. 'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann' The default value is 'hann'
power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length] If False, frame t begins at x[t * hop_length]
The default value is True The default value is True
@ -303,11 +311,8 @@ class MFCC(nn.Layer):
htk (bool): whether to use HTK formula in computing fbank matrix. htk (bool): whether to use HTK formula in computing fbank matrix.
norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization. You can specify norm=1.0/2.0 to use customized p-norm normalization.
ref_value (float): the reference value. If smaller than 1.0, the db level ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
of the signal will be pulled up accordingly. amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
Otherwise, the db level is pushed down.
amin (float): the minimum value of input magnitude, below which the input
magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
e.g., 1e-3.
top_db (float): the maximum db value of resulting spectrum, above which the top_db (float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db). spectrum is clipped(to top_db).
dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
@ -322,6 +327,7 @@ class MFCC(nn.Layer):
hop_length=hop_length, hop_length=hop_length,
win_length=win_length, win_length=win_length,
window=window, window=window,
power=power,
center=center, center=center,
pad_mode=pad_mode, pad_mode=pad_mode,
n_mels=n_mels, n_mels=n_mels,

Loading…
Cancel
Save