Feature alignment.

4 years ago · f657ee64dc
parent 67dcff2f3f
commit f657ee64dc
1 changed file with 18 additions and 12 deletions
--- a/paddleaudio/paddleaudio/features/layers.py
+++ b/paddleaudio/paddleaudio/features/layers.py
@ -37,6 +37,7 @@ class Spectrogram(nn.Layer):
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 dtype: str=paddle.float32):
@ -54,6 +55,7 @@ class Spectrogram(nn.Layer):
                The following window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
@ -68,6 +70,9 @@ class Spectrogram(nn.Layer):
        """
        super(Spectrogram, self).__init__()
        assert power > 0, 'Power of spectrogram must be > 0.'
        self.power = power
        if win_length is None:
            win_length = n_fft
@ -85,7 +90,7 @@ class Spectrogram(nn.Layer):
    def forward(self, x):
        stft = self._stft(x)
-        spectrogram = paddle.square(paddle.abs(stft))
+        spectrogram = paddle.pow(paddle.abs(stft), self.power)
        return spectrogram
@ -96,6 +101,7 @@ class MelSpectrogram(nn.Layer):
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
@ -120,6 +126,7 @@ class MelSpectrogram(nn.Layer):
                The following window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
@ -142,6 +149,7 @@ class MelSpectrogram(nn.Layer):
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            dtype=dtype)
@ -176,6 +184,7 @@ class LogMelSpectrogram(nn.Layer):
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
@ -214,11 +223,8 @@ class LogMelSpectrogram(nn.Layer):
            htk (bool): whether to use HTK formula in computing fbank matrix.
            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
-            ref_value (float): the reference value. If smaller than 1.0, the db level
+            ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
-            amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
+            amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
                For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db (float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
@ -232,6 +238,7 @@ class LogMelSpectrogram(nn.Layer):
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,
@ -246,7 +253,6 @@ class LogMelSpectrogram(nn.Layer):
        self.top_db = top_db
    def forward(self, x):
        # import ipdb; ipdb.set_trace()
        mel_feature = self._melspectrogram(x)
        log_mel_feature = power_to_db(
            mel_feature,
@ -264,6 +270,7 @@ class MFCC(nn.Layer):
                 hop_length: Optional[int]=None,
                 win_length: Optional[int]=None,
                 window: str='hann',
                 power: float=2.0,
                 center: bool=True,
                 pad_mode: str='reflect',
                 n_mels: int=64,
@ -291,6 +298,7 @@ class MFCC(nn.Layer):
                The following window names are supported: 'hamming','hann','kaiser','gaussian',
                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
                The default value is 'hann'
            power (float): Exponent for the magnitude spectrogram. The default value is 2.0.
            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
                If False, frame t begins at x[t * hop_length]
                The default value is True
@ -303,11 +311,8 @@ class MFCC(nn.Layer):
            htk (bool): whether to use HTK formula in computing fbank matrix.
            norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
                You can specify norm=1.0/2.0 to use customized p-norm normalization.
-            ref_value (float): the reference value. If smaller than 1.0, the db level
+            ref_value (float): the reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down.
-            amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
+            amin (float): the minimum value of input magnitude, below which the input magnitude is clipped(to amin).
                For numerical stability, set amin to a larger value,
                e.g., 1e-3.
            top_db (float): the maximum db value of resulting spectrum, above which the
                spectrum is clipped(to top_db).
            dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
@ -322,6 +327,7 @@ class MFCC(nn.Layer):
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            power=power,
            center=center,
            pad_mode=pad_mode,
            n_mels=n_mels,