|
|
@ -13,8 +13,9 @@
|
|
|
|
# limitations under the License.
|
|
|
|
# limitations under the License.
|
|
|
|
"""Contains the audio featurizer class."""
|
|
|
|
"""Contains the audio featurizer class."""
|
|
|
|
import numpy as np
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
import paddle
|
|
|
|
|
|
|
|
import paddleaudio.compliance.kaldi as kaldi
|
|
|
|
from python_speech_features import delta
|
|
|
|
from python_speech_features import delta
|
|
|
|
from python_speech_features import logfbank
|
|
|
|
|
|
|
|
from python_speech_features import mfcc
|
|
|
|
from python_speech_features import mfcc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -345,19 +346,17 @@ class AudioFeaturizer():
|
|
|
|
raise ValueError("Stride size must not be greater than "
|
|
|
|
raise ValueError("Stride size must not be greater than "
|
|
|
|
"window size.")
|
|
|
|
"window size.")
|
|
|
|
# (T, D)
|
|
|
|
# (T, D)
|
|
|
|
fbank_feat = logfbank(
|
|
|
|
waveform = paddle.to_tensor(
|
|
|
|
signal=samples,
|
|
|
|
np.expand_dims(samples, 0), dtype=paddle.float32)
|
|
|
|
samplerate=sample_rate,
|
|
|
|
mat = kaldi.fbank(
|
|
|
|
winlen=0.001 * window_ms,
|
|
|
|
waveform,
|
|
|
|
winstep=0.001 * stride_ms,
|
|
|
|
n_mels=feat_dim,
|
|
|
|
nfilt=feat_dim,
|
|
|
|
frame_length=window_ms, # default : 25
|
|
|
|
nfft=512,
|
|
|
|
frame_shift=stride_ms, # default : 10
|
|
|
|
lowfreq=20,
|
|
|
|
|
|
|
|
highfreq=max_freq,
|
|
|
|
|
|
|
|
dither=dither,
|
|
|
|
dither=dither,
|
|
|
|
remove_dc_offset=True,
|
|
|
|
energy_floor=0.0,
|
|
|
|
preemph=0.97,
|
|
|
|
sr=sample_rate)
|
|
|
|
wintype='povey')
|
|
|
|
fbank_feat = np.squeeze(mat.numpy())
|
|
|
|
if delta_delta:
|
|
|
|
if delta_delta:
|
|
|
|
fbank_feat = self._concat_delta_delta(fbank_feat)
|
|
|
|
fbank_feat = self._concat_delta_delta(fbank_feat)
|
|
|
|
return fbank_feat
|
|
|
|
return fbank_feat
|
|
|
|