From 0ffe1f91143b0489fd38be90747afcbb5e61fedc Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 28 Mar 2022 03:35:55 +0000 Subject: [PATCH 1/3] replace kaidi_fbank with paddleaudio --- examples/aishell/asr1/conf/preprocess.yaml | 9 ++-- paddlespeech/s2t/transform/spectrogram.py | 45 ++++++++++++++++++++ paddlespeech/s2t/transform/transformation.py | 1 + 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index f7f4c58d..a20ff2ab 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -3,8 +3,9 @@ process: - type: fbank_kaldi fs: 16000 n_mels: 80 - n_shift: 160 - win_length: 400 + n_frame_length: 25 + n_frame_shift: 10 + energy_floor: 0.0 dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json @@ -23,7 +24,3 @@ process: n_mask: 2 inplace: true replace_with_zero: false - - - - diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 889cd349..f779b07d 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,8 +14,11 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +import paddle from python_speech_features import logfbank +import paddleaudio.compliance.kaldi as kaldi + def stft(x, n_fft, @@ -309,6 +312,48 @@ class IStft(): class LogMelSpectrogramKaldi(): + def __init__(self, + fs=16000, + n_mels=80, + n_frame_length=25, + n_frame_shift=10, + energy_floor=0.0, + dither=0.1): + self.fs = fs + self.n_mels = n_mels + self.n_frame_length = n_frame_length + self.n_frame_shift = n_frame_shift + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def __call__(self, x, train): + dither = self.dither if train else 0.0 + waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) + mat = kaldi.fbank( + waveform, + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sr=self.fs) + mat = np.squeeze(mat.numpy()) + return mat + + +class LogMelSpectrogramKaldi_decay(): def __init__( self, fs=16000, diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 381b0cdc..3b433cb0 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -31,6 +31,7 @@ import_alias = dict( freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask", spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment", speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation", + speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox", volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation", noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection", bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation", From ed490b66cb052c1308117e5e9703d94d8e43239a Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 29 Mar 2022 03:20:07 +0000 Subject: [PATCH 2/3] update spectrogram, test=asr --- examples/aishell/asr1/conf/preprocess.yaml | 5 ++-- paddlespeech/s2t/transform/spectrogram.py | 34 ++++++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index a20ff2ab..d3992cb9 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -3,9 +3,8 @@ process: - type: fbank_kaldi fs: 16000 n_mels: 80 - n_frame_length: 25 - n_frame_shift: 10 - energy_floor: 0.0 + n_shift: 160 + win_length: 400 dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index f779b07d..75787d92 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -312,17 +312,33 @@ class IStft(): class LogMelSpectrogramKaldi(): - def __init__(self, - fs=16000, - n_mels=80, - n_frame_length=25, - n_frame_shift=10, - energy_floor=0.0, - dither=0.1): + def __init__( + self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.1): + """ + The Kaldi implementation of LogMelSpectrogram + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant + + Returns: + LogMelSpectrogramKaldi + """ + self.fs = fs self.n_mels = n_mels - self.n_frame_length = n_frame_length - self.n_frame_shift = n_frame_shift + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms self.energy_floor = energy_floor self.dither = dither From f47146af494f510428e9d14702f4b735c88843aa Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 29 Mar 2022 12:09:54 +0000 Subject: [PATCH 3/3] add docstring, test=asr --- paddlespeech/s2t/transform/spectrogram.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 75787d92..4a65548f 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -355,7 +355,20 @@ class LogMelSpectrogramKaldi(): dither=self.dither, )) def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) mat = kaldi.fbank( waveform,