You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
282 lines
9.3 KiB
282 lines
9.3 KiB
3 years ago
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||
|
#
|
||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
# you may not use this file except in compliance with the License.
|
||
|
# You may obtain a copy of the License at
|
||
|
#
|
||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||
|
#
|
||
|
# Unless required by applicable law or agreed to in writing, software
|
||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
# See the License for the specific language governing permissions and
|
||
|
# limitations under the License.
|
||
|
import unittest
|
||
|
|
||
|
import librosa
|
||
|
import numpy as np
|
||
|
import paddle
|
||
|
|
||
2 years ago
|
import paddlespeech.audio
|
||
3 years ago
|
from .base import FeatTest
|
||
2 years ago
|
from paddlespeech.audio.functional.window import get_window
|
||
3 years ago
|
|
||
3 years ago
|
|
||
|
class TestLibrosa(FeatTest):
    """Compare paddle / paddlespeech audio features with librosa references.

    ``self.waveform`` and ``self.sr`` are read by the tests but never
    assigned in this class; presumably the ``FeatTest`` base class provides
    them -- confirm against ``.base``.
    """

    def initParmas(self):
        """Set the feature-extraction parameters shared by every test.

        NOTE(review): the method name is kept exactly as spelled; presumably
        ``FeatTest`` calls it by this name -- confirm before renaming.
        """
        # Framing / windowing parameters.
        self.n_fft = 512
        self.hop_length = 128
        self.window_str = 'hann'
        self.pad_mode = 'reflect'
        # Mel filter bank / MFCC parameters.
        self.n_mels = 40
        self.n_mfcc = 20
        self.fmin = 0.0
        self.top_db = 80.0

    def _drop_channel_axis(self):
        """Squeeze axis 0 of ``self.waveform`` when it is 2-D, then return it.

        The squeezed waveform is stored back on ``self.waveform``, so later
        reads inside the same test see the squeezed version.
        """
        if len(self.waveform.shape) == 2:  # (C, T)
            self.waveform = self.waveform.squeeze(0)
        return self.waveform

    def test_stft(self):
        """``paddle.signal.stft`` must agree with ``librosa.core.stft``."""
        wav = self._drop_channel_axis()

        # Framing arguments passed identically to both implementations.
        framing = dict(
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=None,
            center=True,
            pad_mode=self.pad_mode)

        expected = librosa.core.stft(
            y=wav, window=self.window_str, dtype=None, **framing)

        # The paddle call receives a leading batch axis and an explicit
        # window tensor instead of a window name.
        batch = paddle.to_tensor(wav).unsqueeze(0)
        win = get_window(self.window_str, self.n_fft, dtype=batch.dtype)
        spectrum = paddle.signal.stft(
            x=batch, window=win, normalized=False, onesided=True, **framing)
        actual = spectrum.squeeze(0)

        np.testing.assert_array_almost_equal(expected, actual, decimal=5)

    def test_istft(self):
        """``paddle.signal.istft`` must agree with ``librosa.core.istft``."""
        wav = self._drop_channel_axis()

        # Both inverse transforms start from the same librosa spectrogram.
        stft_matrix = librosa.core.stft(
            y=wav,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=None,
            window=self.window_str,
            center=True,
            pad_mode=self.pad_mode)

        # Arguments passed identically to both inverse transforms.
        inverse_args = dict(
            hop_length=self.hop_length,
            win_length=None,
            center=True,
            length=None)

        expected = librosa.core.istft(
            stft_matrix=stft_matrix,
            window=self.window_str,
            dtype=None,
            **inverse_args)

        batch = paddle.to_tensor(stft_matrix).unsqueeze(0)
        # The window takes the dtype of the waveform tensor.
        win_dtype = paddle.to_tensor(self.waveform).dtype
        win = get_window(self.window_str, self.n_fft, dtype=win_dtype)
        restored = paddle.signal.istft(
            x=batch,
            n_fft=self.n_fft,
            window=win,
            normalized=False,
            onesided=True,
            return_complex=False,
            **inverse_args)
        actual = restored.squeeze(0)

        np.testing.assert_array_almost_equal(expected, actual, decimal=5)

    def test_mel(self):
        """Both paddlespeech mel filter banks must match ``librosa.filters.mel``."""
        # Arguments spelled the same way in all three APIs.
        bank_args = dict(
            sr=self.sr,
            n_fft=self.n_fft,
            n_mels=self.n_mels,
            htk=False,
            norm='slaney')

        expected = librosa.filters.mel(
            fmin=self.fmin,
            fmax=None,
            dtype=self.waveform.dtype,
            **bank_args)

        from_compliance = (
            paddlespeech.audio.compliance.librosa.compute_fbank_matrix(
                fmin=self.fmin,
                fmax=None,
                dtype=self.waveform.dtype,
                **bank_args))

        # The functional API spells the frequency bounds f_min / f_max and is
        # given the dtype of the waveform converted to a paddle tensor.
        tensor = paddle.to_tensor(self.waveform)
        from_functional = paddlespeech.audio.functional.compute_fbank_matrix(
            f_min=self.fmin, f_max=None, dtype=tensor.dtype, **bank_args)

        np.testing.assert_array_almost_equal(expected, from_compliance)
        np.testing.assert_array_almost_equal(expected, from_functional)

    def test_melspect(self):
        """Mel spectrograms (no dB conversion) must match librosa."""
        wav = self._drop_channel_axis()

        # Reference: librosa.
        expected = librosa.feature.melspectrogram(
            y=wav,
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin)

        # paddlespeech.audio.compliance.librosa, with dB conversion disabled.
        from_compliance = paddlespeech.audio.compliance.librosa.melspectrogram(
            x=wav,
            sr=self.sr,
            window_size=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin,
            to_db=False)

        # paddlespeech.audio.features layer, fed a float64 batch of size one.
        batch = paddle.to_tensor(wav, dtype=paddle.float64).unsqueeze(0)
        extractor = paddlespeech.audio.features.MelSpectrogram(
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            f_min=self.fmin,
            dtype=batch.dtype)
        from_layer = extractor(batch).squeeze(0).numpy()

        np.testing.assert_array_almost_equal(
            expected, from_compliance, decimal=5)
        np.testing.assert_array_almost_equal(expected, from_layer, decimal=5)

    def test_log_melspect(self):
        """Log-mel spectrograms must match librosa's ``power_to_db`` output."""
        wav = self._drop_channel_axis()

        # Reference: librosa mel spectrogram converted to dB with top_db=None.
        power = librosa.feature.melspectrogram(
            y=wav,
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin)
        expected = librosa.power_to_db(power, top_db=None)

        # paddlespeech.audio.compliance.librosa, called without `to_db`.
        from_compliance = paddlespeech.audio.compliance.librosa.melspectrogram(
            x=wav,
            sr=self.sr,
            window_size=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin)

        # paddlespeech.audio.features layer, fed a float64 batch of size one.
        batch = paddle.to_tensor(wav, dtype=paddle.float64).unsqueeze(0)
        extractor = paddlespeech.audio.features.LogMelSpectrogram(
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            f_min=self.fmin,
            dtype=batch.dtype)
        from_layer = extractor(batch).squeeze(0).numpy()

        np.testing.assert_array_almost_equal(
            expected, from_compliance, decimal=5)
        # The layer output is compared with a looser tolerance (4 decimals).
        np.testing.assert_array_almost_equal(expected, from_layer, decimal=4)

    def test_mfcc(self):
        """MFCCs from both paddlespeech APIs must match ``librosa.feature.mfcc``."""
        wav = self._drop_channel_axis()

        # DCT settings passed identically to librosa and the compliance API.
        dct_args = dict(
            n_mfcc=self.n_mfcc, dct_type=2, norm='ortho', lifter=0)

        # Reference: librosa.
        expected = librosa.feature.mfcc(
            y=wav,
            sr=self.sr,
            S=None,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin,
            **dct_args)

        # paddlespeech.audio.compliance.librosa.
        from_compliance = paddlespeech.audio.compliance.librosa.mfcc(
            x=wav,
            sr=self.sr,
            window_size=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            fmin=self.fmin,
            top_db=self.top_db,
            **dct_args)

        # paddlespeech.audio.features layer, fed a float64 batch of size one.
        batch = paddle.to_tensor(wav, dtype=paddle.float64).unsqueeze(0)
        extractor = paddlespeech.audio.features.MFCC(
            sr=self.sr,
            n_mfcc=self.n_mfcc,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
            f_min=self.fmin,
            top_db=self.top_db,
            dtype=batch.dtype)
        from_layer = extractor(batch).squeeze(0).numpy()

        np.testing.assert_array_almost_equal(
            expected, from_compliance, decimal=4)
        np.testing.assert_array_almost_equal(expected, from_layer, decimal=4)
|
||
|
|
||
3 years ago
|
|
||
|
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|