From 0ffe1f91143b0489fd38be90747afcbb5e61fedc Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Mon, 28 Mar 2022 03:35:55 +0000
Subject: [PATCH 1/3] replace kaldi_fbank with paddleaudio

---
 examples/aishell/asr1/conf/preprocess.yaml   |  9 ++--
 paddlespeech/s2t/transform/spectrogram.py    | 45 ++++++++++++++++++++
 paddlespeech/s2t/transform/transformation.py |  1 +
 3 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
index f7f4c58d..a20ff2ab 100644
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@@ -3,8 +3,9 @@ process:
   - type: fbank_kaldi
     fs: 16000
     n_mels: 80
-    n_shift: 160
-    win_length: 400
+    n_frame_length: 25
+    n_frame_shift: 10
+    energy_floor: 0.0
     dither: 0.1
   - type: cmvn_json
     cmvn_path: data/mean_std.json
@@ -23,7 +24,3 @@ process:
     n_mask: 2
     inplace: true
     replace_with_zero: false
-
-
-
-
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 889cd349..f779b07d 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -14,8 +14,11 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 import librosa
 import numpy as np
+import paddle
 from python_speech_features import logfbank
 
+import paddleaudio.compliance.kaldi as kaldi
+
 
 def stft(x,
          n_fft,
@@ -309,6 +312,48 @@ class IStft():
 
 
 class LogMelSpectrogramKaldi():
+    def __init__(self,
+                 fs=16000,
+                 n_mels=80,
+                 n_frame_length=25,
+                 n_frame_shift=10,
+                 energy_floor=0.0,
+                 dither=0.1):
+        self.fs = fs
+        self.n_mels = n_mels
+        self.n_frame_length = n_frame_length
+        self.n_frame_shift = n_frame_shift
+        self.energy_floor = energy_floor
+        self.dither = dither
+
+    def __repr__(self):  # summarize the configured parameters as "ClassName(key=value, ...)"
+        return (
+            "{name}(fs={fs}, n_mels={n_mels}, "
+            "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, "
+            "dither={dither})".format(  # exactly one ")" to close the "{name}(" opened above
+                name=self.__class__.__name__,
+                fs=self.fs,
+                n_mels=self.n_mels,
+                n_frame_shift=self.n_frame_shift,
+                n_frame_length=self.n_frame_length,
+                dither=self.dither, ))
+
+    def __call__(self, x, train):
+        dither = self.dither if train else 0.0
+        waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32)
+        mat = kaldi.fbank(
+            waveform,
+            n_mels=self.n_mels,
+            frame_length=self.n_frame_length,  # presumably milliseconds (default 25) - confirm against kaldi.fbank
+            frame_shift=self.n_frame_shift,  # presumably milliseconds (default 10) - confirm against kaldi.fbank
+            dither=dither,  # 0.0 unless train is truthy, so non-training features are not dithered
+            energy_floor=self.energy_floor,
+            sr=self.fs)
+        mat = np.squeeze(mat.numpy())  # back to NumPy; squeeze() removes every length-1 axis
+        return mat
+
+
+class LogMelSpectrogramKaldi_decay():
     def __init__(
             self,
             fs=16000,
diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py
index 381b0cdc..3b433cb0 100644
--- a/paddlespeech/s2t/transform/transformation.py
+++ b/paddlespeech/s2t/transform/transformation.py
@@ -31,6 +31,7 @@ import_alias = dict(
     freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask",
     spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment",
     speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation",
+    speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox",
     volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation",
     noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection",
     bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation",

From ed490b66cb052c1308117e5e9703d94d8e43239a Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Tue, 29 Mar 2022 03:20:07 +0000
Subject: [PATCH 2/3] update spectrogram, test=asr

---
 examples/aishell/asr1/conf/preprocess.yaml |  5 ++--
 paddlespeech/s2t/transform/spectrogram.py  | 34 ++++++++++++++++------
 2 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
index a20ff2ab..d3992cb9 100644
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@@ -3,9 +3,8 @@ process:
   - type: fbank_kaldi
     fs: 16000
     n_mels: 80
-    n_frame_length: 25
-    n_frame_shift: 10
-    energy_floor: 0.0
+    n_shift: 160
+    win_length: 400
     dither: 0.1
   - type: cmvn_json
     cmvn_path: data/mean_std.json
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index f779b07d..75787d92 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -312,17 +312,33 @@ class IStft():
 
 
 class LogMelSpectrogramKaldi():
-    def __init__(self,
-                 fs=16000,
-                 n_mels=80,
-                 n_frame_length=25,
-                 n_frame_shift=10,
-                 energy_floor=0.0,
-                 dither=0.1):
+    def __init__(
+            self,
+            fs=16000,
+            n_mels=80,
+            n_shift=160,  # unit:sample, 10ms
+            win_length=400,  # unit:sample, 25ms
+            energy_floor=0.0,
+            dither=0.1):
+        """
+        The Kaldi implementation of LogMelSpectrogram.
+        Args:
+            fs (int): sample rate of the audio
+            n_mels (int): number of mel filter banks
+            n_shift (int): number of points (samples) in a frame shift
+            win_length (int): number of points (samples) in a frame window
+            energy_floor (float): Floor on energy in Spectrogram computation (absolute)
+            dither (float): Dithering constant
+
+        Returns:
+            LogMelSpectrogramKaldi
+        """
+
         self.fs = fs
         self.n_mels = n_mels
-        self.n_frame_length = n_frame_length
-        self.n_frame_shift = n_frame_shift
+        num_point_ms = fs / 1000  # samples per millisecond
+        self.n_frame_length = win_length / num_point_ms  # window length converted from samples to ms
+        self.n_frame_shift = n_shift / num_point_ms  # frame shift converted from samples to ms
         self.energy_floor = energy_floor
         self.dither = dither
 

From f47146af494f510428e9d14702f4b735c88843aa Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Tue, 29 Mar 2022 12:09:54 +0000
Subject: [PATCH 3/3] add docstring, test=asr

---
 paddlespeech/s2t/transform/spectrogram.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 75787d92..4a65548f 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -355,7 +355,20 @@ class LogMelSpectrogramKaldi():
                 dither=self.dither, ))
 
     def __call__(self, x, train):
+        """
+        Args:
+            x (np.ndarray): shape (Ti,)
+            train (bool): True, train mode.
+
+        Raises:
+            ValueError: not support (Ti, C)
+
+        Returns:
+            np.ndarray: (T, D)
+        """
         dither = self.dither if train else 0.0
+        if x.ndim != 1:
+            raise ValueError("Not support x: [Time, Channel]")
         waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32)
         mat = kaldi.fbank(
             waveform,