From c437a7c5c193625ecbc6bfead3c5ee216e16c808 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 25 Feb 2022 07:12:56 +0000 Subject: [PATCH 01/17] refactor paddleaudio, test=doc --- .gitignore | 1 + paddleaudio/CHANGELOG.md | 3 + paddleaudio/features/augment.py | 170 ------------------ .../{backends => paddleaudio}/__init__.py | 1 - .../{ => paddleaudio/backends}/__init__.py | 2 - .../paddleaudio/backends/soundfile_backend.py | 0 .../paddleaudio/backends/sox_backend.py | 0 .../{ => paddleaudio}/datasets/__init__.py | 0 .../{ => paddleaudio}/datasets/dataset.py | 0 .../{ => paddleaudio}/datasets/esc50.py | 0 .../{ => paddleaudio}/datasets/gtzan.py | 0 .../{ => paddleaudio}/datasets/tess.py | 0 .../{ => paddleaudio}/datasets/urban_sound.py | 0 .../{ => paddleaudio}/features/__init__.py | 7 +- .../features/librosa.py} | 2 +- .../paddleaudio/functional/__init__.py | 0 .../functional/functional.py} | 155 +++++++++++++++- .../functional}/window.py | 44 +++-- paddleaudio/paddleaudio/io/__init__.py | 6 + .../{backends => paddleaudio/io}/audio.py | 0 paddleaudio/paddleaudio/kaldi/__init__.py | 0 .../paddleaudio/sox_effects/__init__.py | 0 .../{ => paddleaudio}/utils/__init__.py | 22 ++- .../{ => paddleaudio}/utils/download.py | 5 + paddleaudio/{ => paddleaudio}/utils/env.py | 6 + paddleaudio/{ => paddleaudio}/utils/error.py | 0 paddleaudio/{ => paddleaudio}/utils/log.py | 5 +- paddleaudio/{ => paddleaudio}/utils/time.py | 4 + setup_audio.py => paddleaudio/setup.py | 2 +- requirements.txt | 48 ----- 30 files changed, 234 insertions(+), 249 deletions(-) delete mode 100644 paddleaudio/features/augment.py rename paddleaudio/{backends => paddleaudio}/__init__.py (96%) rename paddleaudio/{ => paddleaudio/backends}/__init__.py (92%) create mode 100644 paddleaudio/paddleaudio/backends/soundfile_backend.py create mode 100644 paddleaudio/paddleaudio/backends/sox_backend.py rename paddleaudio/{ => paddleaudio}/datasets/__init__.py (100%) rename paddleaudio/{ => 
paddleaudio}/datasets/dataset.py (100%) rename paddleaudio/{ => paddleaudio}/datasets/esc50.py (100%) rename paddleaudio/{ => paddleaudio}/datasets/gtzan.py (100%) rename paddleaudio/{ => paddleaudio}/datasets/tess.py (100%) rename paddleaudio/{ => paddleaudio}/datasets/urban_sound.py (100%) rename paddleaudio/{ => paddleaudio}/features/__init__.py (84%) rename paddleaudio/{features/spectrum.py => paddleaudio/features/librosa.py} (99%) create mode 100644 paddleaudio/paddleaudio/functional/__init__.py rename paddleaudio/{features/core.py => paddleaudio/functional/functional.py} (79%) rename paddleaudio/{features => paddleaudio/functional}/window.py (98%) create mode 100644 paddleaudio/paddleaudio/io/__init__.py rename paddleaudio/{backends => paddleaudio/io}/audio.py (100%) create mode 100644 paddleaudio/paddleaudio/kaldi/__init__.py create mode 100644 paddleaudio/paddleaudio/sox_effects/__init__.py rename paddleaudio/{ => paddleaudio}/utils/__init__.py (61%) rename paddleaudio/{ => paddleaudio}/utils/download.py (94%) rename paddleaudio/{ => paddleaudio}/utils/env.py (95%) rename paddleaudio/{ => paddleaudio}/utils/error.py (100%) rename paddleaudio/{ => paddleaudio}/utils/log.py (98%) rename paddleaudio/{ => paddleaudio}/utils/time.py (97%) rename setup_audio.py => paddleaudio/setup.py (99%) delete mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index cc8fff877..374276b4e 100644 --- a/.gitignore +++ b/.gitignore @@ -30,5 +30,6 @@ tools/OpenBLAS/ tools/Miniconda3-latest-Linux-x86_64.sh tools/activate_python.sh tools/miniconda.sh +tools/CRF++-0.58/ *output/ diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md index 825c32f0d..e68895674 100644 --- a/paddleaudio/CHANGELOG.md +++ b/paddleaudio/CHANGELOG.md @@ -1 +1,4 @@ # Changelog + +Date: 2022-2-25, Author: Hui Zhang. + - Refactor architecture. 
\ No newline at end of file diff --git a/paddleaudio/features/augment.py b/paddleaudio/features/augment.py deleted file mode 100644 index 6f903bdba..000000000 --- a/paddleaudio/features/augment.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import List - -import numpy as np -from numpy import ndarray as array - -from ..backends import depth_convert -from ..utils import ParameterError - -__all__ = [ - 'depth_augment', - 'spect_augment', - 'random_crop1d', - 'random_crop2d', - 'adaptive_spect_augment', -] - - -def randint(high: int) -> int: - """Generate one random integer in range [0 high) - - This is a helper function for random data augmentaiton - """ - return int(np.random.randint(0, high=high)) - - -def rand() -> float: - """Generate one floating-point number in range [0 1) - - This is a helper function for random data augmentaiton - """ - return float(np.random.rand(1)) - - -def depth_augment(y: array, - choices: List=['int8', 'int16'], - probs: List[float]=[0.5, 0.5]) -> array: - """ Audio depth augmentation - - Do audio depth augmentation to simulate the distortion brought by quantization. 
- """ - assert len(probs) == len( - choices - ), 'number of choices {} must be equal to size of probs {}'.format( - len(choices), len(probs)) - depth = np.random.choice(choices, p=probs) - src_depth = y.dtype - y1 = depth_convert(y, depth) - y2 = depth_convert(y1, src_depth) - - return y2 - - -def adaptive_spect_augment(spect: array, tempo_axis: int=0, - level: float=0.1) -> array: - """Do adpative spectrogram augmentation - - The level of the augmentation is gowern by the paramter level, - ranging from 0 to 1, with 0 represents no augmentation。 - - """ - assert spect.ndim == 2., 'only supports 2d tensor or numpy array' - if tempo_axis == 0: - nt, nf = spect.shape - else: - nf, nt = spect.shape - - time_mask_width = int(nt * level * 0.5) - freq_mask_width = int(nf * level * 0.5) - - num_time_mask = int(10 * level) - num_freq_mask = int(10 * level) - - if tempo_axis == 0: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[start:start + time_mask_width, :] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[:, start:start + freq_mask_width] = 0 - else: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[:, start:start + time_mask_width] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[start:start + freq_mask_width, :] = 0 - - return spect - - -def spect_augment(spect: array, - tempo_axis: int=0, - max_time_mask: int=3, - max_freq_mask: int=3, - max_time_mask_width: int=30, - max_freq_mask_width: int=20) -> array: - """Do spectrogram augmentation in both time and freq axis - - Reference: - - """ - assert spect.ndim == 2., 'only supports 2d tensor or numpy array' - if tempo_axis == 0: - nt, nf = spect.shape - else: - nf, nt = spect.shape - - num_time_mask = randint(max_time_mask) - num_freq_mask = randint(max_freq_mask) - - time_mask_width = randint(max_time_mask_width) - freq_mask_width = randint(max_freq_mask_width) - - if tempo_axis == 0: - 
for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[start:start + time_mask_width, :] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[:, start:start + freq_mask_width] = 0 - else: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[:, start:start + time_mask_width] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[start:start + freq_mask_width, :] = 0 - - return spect - - -def random_crop1d(y: array, crop_len: int) -> array: - """ Do random cropping on 1d input signal - - The input is a 1d signal, typically a sound waveform - """ - if y.ndim != 1: - 'only accept 1d tensor or numpy array' - n = len(y) - idx = randint(n - crop_len) - return y[idx:idx + crop_len] - - -def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: - """ Do random cropping for 2D array, typically a spectrogram. - - The cropping is done in temporal direction on the time-freq input signal. - """ - if tempo_axis >= s.ndim: - raise ParameterError('axis out of range') - - n = s.shape[tempo_axis] - idx = randint(high=n - crop_len) - sli = [slice(None) for i in range(s.ndim)] - sli[tempo_axis] = slice(idx, idx + crop_len) - out = s[tuple(sli)] - return out diff --git a/paddleaudio/backends/__init__.py b/paddleaudio/paddleaudio/__init__.py similarity index 96% rename from paddleaudio/backends/__init__.py rename to paddleaudio/paddleaudio/__init__.py index f2f77ffea..185a92b8d 100644 --- a/paddleaudio/backends/__init__.py +++ b/paddleaudio/paddleaudio/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .audio import * diff --git a/paddleaudio/__init__.py b/paddleaudio/paddleaudio/backends/__init__.py similarity index 92% rename from paddleaudio/__init__.py rename to paddleaudio/paddleaudio/backends/__init__.py index 2685cf57c..185a92b8d 100644 --- a/paddleaudio/__init__.py +++ b/paddleaudio/paddleaudio/backends/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .backends import * -from .features import * diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddleaudio/paddleaudio/backends/sox_backend.py b/paddleaudio/paddleaudio/backends/sox_backend.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py similarity index 100% rename from paddleaudio/datasets/__init__.py rename to paddleaudio/paddleaudio/datasets/__init__.py diff --git a/paddleaudio/datasets/dataset.py b/paddleaudio/paddleaudio/datasets/dataset.py similarity index 100% rename from paddleaudio/datasets/dataset.py rename to paddleaudio/paddleaudio/datasets/dataset.py diff --git a/paddleaudio/datasets/esc50.py b/paddleaudio/paddleaudio/datasets/esc50.py similarity index 100% rename from paddleaudio/datasets/esc50.py rename to paddleaudio/paddleaudio/datasets/esc50.py diff --git a/paddleaudio/datasets/gtzan.py b/paddleaudio/paddleaudio/datasets/gtzan.py similarity index 100% rename from paddleaudio/datasets/gtzan.py rename to paddleaudio/paddleaudio/datasets/gtzan.py diff --git a/paddleaudio/datasets/tess.py b/paddleaudio/paddleaudio/datasets/tess.py similarity index 100% rename from paddleaudio/datasets/tess.py rename to paddleaudio/paddleaudio/datasets/tess.py diff --git a/paddleaudio/datasets/urban_sound.py 
b/paddleaudio/paddleaudio/datasets/urban_sound.py similarity index 100% rename from paddleaudio/datasets/urban_sound.py rename to paddleaudio/paddleaudio/datasets/urban_sound.py diff --git a/paddleaudio/features/__init__.py b/paddleaudio/paddleaudio/features/__init__.py similarity index 84% rename from paddleaudio/features/__init__.py rename to paddleaudio/paddleaudio/features/__init__.py index d8ac7c4b9..1688cc5c2 100644 --- a/paddleaudio/features/__init__.py +++ b/paddleaudio/paddleaudio/features/__init__.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .augment import * -from .core import * -from .spectrum import * + +from .librosa import Spectrogram +from .librosa import MelSpectrogram +from .librosa import LogMelSpectrogram \ No newline at end of file diff --git a/paddleaudio/features/spectrum.py b/paddleaudio/paddleaudio/features/librosa.py similarity index 99% rename from paddleaudio/features/spectrum.py rename to paddleaudio/paddleaudio/features/librosa.py index 154b6484c..1cbd2d1a2 100644 --- a/paddleaudio/features/spectrum.py +++ b/paddleaudio/paddleaudio/features/librosa.py @@ -19,7 +19,7 @@ from typing import Union import paddle import paddle.nn as nn -from .window import get_window +from ..functional.window import get_window __all__ = [ 'Spectrogram', diff --git a/paddleaudio/paddleaudio/functional/__init__.py b/paddleaudio/paddleaudio/functional/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddleaudio/features/core.py b/paddleaudio/paddleaudio/functional/functional.py similarity index 79% rename from paddleaudio/features/core.py rename to paddleaudio/paddleaudio/functional/functional.py index 01925ec62..ce49cdc43 100644 --- a/paddleaudio/features/core.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -21,11 +21,14 @@ import numpy as np import scipy from numpy 
import ndarray as array from numpy.lib.stride_tricks import as_strided -from scipy.signal import get_window +from scipy import signal from ..utils import ParameterError +from ..backends import depth_convert + __all__ = [ + # dsp 'stft', 'mfcc', 'hz_to_mel', @@ -38,6 +41,12 @@ __all__ = [ 'spectrogram', 'mu_encode', 'mu_decode', + # augmentation + 'depth_augment', + 'spect_augment', + 'random_crop1d', + 'random_crop2d', + 'adaptive_spect_augment', ] @@ -303,7 +312,7 @@ def stft(x: array, if hop_length is None: hop_length = int(win_length // 4) - fft_window = get_window(window, win_length, fftbins=True) + fft_window = signal.get_window(window, win_length, fftbins=True) # Pad the window out to n_fft size fft_window = pad_center(fft_window, n_fft) @@ -576,3 +585,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: y = y * 2 / mu - 1 x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1) return x + + +def randint(high: int) -> int: + """Generate one random integer in range [0 high) + + This is a helper function for random data augmentaiton + """ + return int(np.random.randint(0, high=high)) + + +def rand() -> float: + """Generate one floating-point number in range [0 1) + + This is a helper function for random data augmentaiton + """ + return float(np.random.rand(1)) + + +def depth_augment(y: array, + choices: List=['int8', 'int16'], + probs: List[float]=[0.5, 0.5]) -> array: + """ Audio depth augmentation + + Do audio depth augmentation to simulate the distortion brought by quantization. 
+ """ + assert len(probs) == len( + choices + ), 'number of choices {} must be equal to size of probs {}'.format( + len(choices), len(probs)) + depth = np.random.choice(choices, p=probs) + src_depth = y.dtype + y1 = depth_convert(y, depth) + y2 = depth_convert(y1, src_depth) + + return y2 + + +def adaptive_spect_augment(spect: array, tempo_axis: int=0, + level: float=0.1) -> array: + """Do adpative spectrogram augmentation + + The level of the augmentation is gowern by the paramter level, + ranging from 0 to 1, with 0 represents no augmentation。 + + """ + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + time_mask_width = int(nt * level * 0.5) + freq_mask_width = int(nf * level * 0.5) + + num_time_mask = int(10 * level) + num_freq_mask = int(10 * level) + + if tempo_axis == 0: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def spect_augment(spect: array, + tempo_axis: int=0, + max_time_mask: int=3, + max_freq_mask: int=3, + max_time_mask_width: int=30, + max_freq_mask_width: int=20) -> array: + """Do spectrogram augmentation in both time and freq axis + + Reference: + + """ + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + num_time_mask = randint(max_time_mask) + num_freq_mask = randint(max_freq_mask) + + time_mask_width = randint(max_time_mask_width) + freq_mask_width = randint(max_freq_mask_width) + + if tempo_axis == 0: + 
for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def random_crop1d(y: array, crop_len: int) -> array: + """ Do random cropping on 1d input signal + + The input is a 1d signal, typically a sound waveform + """ + if y.ndim != 1: + 'only accept 1d tensor or numpy array' + n = len(y) + idx = randint(n - crop_len) + return y[idx:idx + crop_len] + + +def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: + """ Do random cropping for 2D array, typically a spectrogram. + + The cropping is done in temporal direction on the time-freq input signal. 
+ """ + if tempo_axis >= s.ndim: + raise ParameterError('axis out of range') + + n = s.shape[tempo_axis] + idx = randint(high=n - crop_len) + sli = [slice(None) for i in range(s.ndim)] + sli[tempo_axis] = slice(idx, idx + crop_len) + out = s[tuple(sli)] + return out \ No newline at end of file diff --git a/paddleaudio/features/window.py b/paddleaudio/paddleaudio/functional/window.py similarity index 98% rename from paddleaudio/features/window.py rename to paddleaudio/paddleaudio/functional/window.py index 629989fc9..e34862b4c 100644 --- a/paddleaudio/features/window.py +++ b/paddleaudio/paddleaudio/functional/window.py @@ -20,6 +20,19 @@ from paddle import Tensor __all__ = [ 'get_window', + + # windows + 'taylor', + 'hamming', + 'hann', + 'tukey', + 'kaiser', + 'gaussian', + 'exponential', + 'triang', + 'bohman', + 'blackman', + 'cosine', ] @@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True, return _truncate(w, needs_trunc) +def general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + def general_hamming(M: int, alpha: float, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a generalized Hamming window. @@ -143,21 +171,6 @@ def taylor(M: int, return _truncate(w, needs_trunc) -def general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a generic weighted sum of cosine terms window. - This function is consistent with scipy.signal.windows.general_cosine(). 
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) - w = paddle.zeros((M, ), dtype=dtype) - for k in range(len(a)): - w += a[k] * paddle.cos(k * fac) - return _truncate(w, needs_trunc) - - def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hamming window. The Hamming window is a taper formed by using a raised cosine with @@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) +## factory function def get_window(window: Union[str, Tuple[str, float]], win_length: int, fftbins: bool=True, diff --git a/paddleaudio/paddleaudio/io/__init__.py b/paddleaudio/paddleaudio/io/__init__.py new file mode 100644 index 000000000..3a9a01e85 --- /dev/null +++ b/paddleaudio/paddleaudio/io/__init__.py @@ -0,0 +1,6 @@ +from .audio import save_wav +from .audio import load +from .audio import normalize +from .audio import to_mono +from .audio import resample +from .audio import depth_convert \ No newline at end of file diff --git a/paddleaudio/backends/audio.py b/paddleaudio/paddleaudio/io/audio.py similarity index 100% rename from paddleaudio/backends/audio.py rename to paddleaudio/paddleaudio/io/audio.py diff --git a/paddleaudio/paddleaudio/kaldi/__init__.py b/paddleaudio/paddleaudio/kaldi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddleaudio/paddleaudio/sox_effects/__init__.py b/paddleaudio/paddleaudio/sox_effects/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddleaudio/utils/__init__.py b/paddleaudio/paddleaudio/utils/__init__.py similarity index 61% rename from paddleaudio/utils/__init__.py rename to paddleaudio/paddleaudio/utils/__init__.py index 1c1b4a90e..5fe0980b5 100644 --- a/paddleaudio/utils/__init__.py +++ b/paddleaudio/paddleaudio/utils/__init__.py @@ -11,8 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY 
KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .download import * -from .env import * -from .error import * -from .log import * -from .time import * + +from .env import USER_HOME +from .env import PPAUDIO_HOME +from .env import MODEL_HOME +from .env import DATA_HOME + +from .download import decompress +from .download import download_and_decompress +from .download import load_state_dict_from_url + +from .error import ParameterError + +from .log import logger +from .log import Logger + +from .time import Timer +from .time import seconds_to_hms diff --git a/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py similarity index 94% rename from paddleaudio/utils/download.py rename to paddleaudio/paddleaudio/utils/download.py index 45a8e57ba..fd4785cd7 100644 --- a/paddleaudio/utils/download.py +++ b/paddleaudio/paddleaudio/utils/download.py @@ -22,6 +22,11 @@ from .log import logger download.logger = logger +__all__ = [ + 'decompress', + 'download_and_decompress', + 'load_state_dict_from_url', +] def decompress(file: str): """ diff --git a/paddleaudio/utils/env.py b/paddleaudio/paddleaudio/utils/env.py similarity index 95% rename from paddleaudio/utils/env.py rename to paddleaudio/paddleaudio/utils/env.py index 59c6b6219..e202c3803 100644 --- a/paddleaudio/utils/env.py +++ b/paddleaudio/paddleaudio/utils/env.py @@ -20,6 +20,12 @@ PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. 
D ''' import os +__all__ = [ + 'USER_HOME', + 'PPAUDIO_HOME', +'MODEL_HOME' , +'DATA_HOME' , +] def _get_user_home(): return os.path.expanduser('~') diff --git a/paddleaudio/utils/error.py b/paddleaudio/paddleaudio/utils/error.py similarity index 100% rename from paddleaudio/utils/error.py rename to paddleaudio/paddleaudio/utils/error.py diff --git a/paddleaudio/utils/log.py b/paddleaudio/paddleaudio/utils/log.py similarity index 98% rename from paddleaudio/utils/log.py rename to paddleaudio/paddleaudio/utils/log.py index 5e7db68a9..5656b286a 100644 --- a/paddleaudio/utils/log.py +++ b/paddleaudio/paddleaudio/utils/log.py @@ -19,7 +19,10 @@ import time import colorlog -loggers = {} +__all__ = [ + 'Logger', + 'logger', +] log_config = { 'DEBUG': { diff --git a/paddleaudio/utils/time.py b/paddleaudio/paddleaudio/utils/time.py similarity index 97% rename from paddleaudio/utils/time.py rename to paddleaudio/paddleaudio/utils/time.py index 6f0c7585b..23af62fc7 100644 --- a/paddleaudio/utils/time.py +++ b/paddleaudio/paddleaudio/utils/time.py @@ -14,6 +14,10 @@ import math import time +__all__ = [ + 'Timer', + 'seconds_to_hms', +] class Timer(object): '''Calculate runing speed and estimated time of arrival(ETA)''' diff --git a/setup_audio.py b/paddleaudio/setup.py similarity index 99% rename from setup_audio.py rename to paddleaudio/setup.py index 212049987..98bf8a6f4 100644 --- a/setup_audio.py +++ b/paddleaudio/setup.py @@ -14,7 +14,7 @@ import setuptools # set the version here -VERSION = '0.1.0' +VERSION = '0.2.0' def write_version_py(filename='paddleaudio/__init__.py'): diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 760821662..000000000 --- a/requirements.txt +++ /dev/null @@ -1,48 +0,0 @@ -ConfigArgParse -coverage -editdistance -g2p_en -g2pM -gpustat -h5py -inflect -jieba -jsonlines -kaldiio -librosa -loguru -matplotlib -nara_wpe -nltk -paddleaudio -paddlenlp -paddlespeech_ctcdecoders -paddlespeech_feat -pandas -phkit -Pillow 
-praatio==5.0.0 -pre-commit -pybind11 -pypi-kenlm -pypinyin -python-dateutil -pyworld -resampy==0.2.2 -sacrebleu -scipy -sentencepiece~=0.1.96 -snakeviz -soundfile~=0.10 -sox -soxbindings -textgrid -timer -tqdm -typeguard -unidecode -visualdl -webrtcvad -yacs~=0.1.8 -yq -zhon From 54f06041d48d32ea6bc81a461ec6ee645b993897 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 25 Feb 2022 07:29:26 +0000 Subject: [PATCH 02/17] add tests, metric dir and format, test=doc --- paddleaudio/CHANGELOG.md | 2 +- .../paddleaudio/backends/soundfile_backend.py | 13 +++++++++++++ .../paddleaudio/backends/sox_backend.py | 13 +++++++++++++ paddleaudio/paddleaudio/features/__init__.py | 5 ++--- .../paddleaudio/functional/__init__.py | 13 +++++++++++++ .../paddleaudio/functional/functional.py | 5 ++--- paddleaudio/paddleaudio/functional/window.py | 2 +- paddleaudio/paddleaudio/io/__init__.py | 19 ++++++++++++++++--- paddleaudio/paddleaudio/kaldi/__init__.py | 13 +++++++++++++ paddleaudio/paddleaudio/metric/__init__.py | 13 +++++++++++++ .../paddleaudio/sox_effects/__init__.py | 13 +++++++++++++ paddleaudio/paddleaudio/utils/__init__.py | 17 ++++++----------- paddleaudio/paddleaudio/utils/download.py | 1 + paddleaudio/paddleaudio/utils/env.py | 5 +++-- paddleaudio/paddleaudio/utils/time.py | 1 + paddleaudio/tests/.gitkeep | 0 16 files changed, 111 insertions(+), 24 deletions(-) create mode 100644 paddleaudio/paddleaudio/metric/__init__.py create mode 100644 paddleaudio/tests/.gitkeep diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md index e68895674..52d44dd39 100644 --- a/paddleaudio/CHANGELOG.md +++ b/paddleaudio/CHANGELOG.md @@ -1,4 +1,4 @@ # Changelog Date: 2022-2-25, Author: Hui Zhang. - - Refactor architecture. \ No newline at end of file + - Refactor architecture. 
diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py index e69de29bb..97043fd7b 100644 --- a/paddleaudio/paddleaudio/backends/soundfile_backend.py +++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/paddleaudio/backends/sox_backend.py b/paddleaudio/paddleaudio/backends/sox_backend.py index e69de29bb..97043fd7b 100644 --- a/paddleaudio/paddleaudio/backends/sox_backend.py +++ b/paddleaudio/paddleaudio/backends/sox_backend.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddleaudio/paddleaudio/features/__init__.py b/paddleaudio/paddleaudio/features/__init__.py index 1688cc5c2..469b4c9ba 100644 --- a/paddleaudio/paddleaudio/features/__init__.py +++ b/paddleaudio/paddleaudio/features/__init__.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .librosa import Spectrogram +from .librosa import LogMelSpectrogram from .librosa import MelSpectrogram -from .librosa import LogMelSpectrogram \ No newline at end of file +from .librosa import Spectrogram diff --git a/paddleaudio/paddleaudio/functional/__init__.py b/paddleaudio/paddleaudio/functional/__init__.py index e69de29bb..97043fd7b 100644 --- a/paddleaudio/paddleaudio/functional/__init__.py +++ b/paddleaudio/paddleaudio/functional/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py index ce49cdc43..167795c37 100644 --- a/paddleaudio/paddleaudio/functional/functional.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -23,9 +23,8 @@ from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided from scipy import signal -from ..utils import ParameterError from ..backends import depth_convert - +from ..utils import ParameterError __all__ = [ # dsp @@ -726,4 +725,4 @@ def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: sli = [slice(None) for i in range(s.ndim)] sli[tempo_axis] = slice(idx, idx + crop_len) out = s[tuple(sli)] - return out \ No newline at end of file + return out diff --git a/paddleaudio/paddleaudio/functional/window.py b/paddleaudio/paddleaudio/functional/window.py index e34862b4c..f321b38ef 100644 --- a/paddleaudio/paddleaudio/functional/window.py +++ b/paddleaudio/paddleaudio/functional/window.py @@ -20,7 +20,7 @@ from paddle import Tensor __all__ = [ 'get_window', - + # windows 'taylor', 'hamming', diff --git a/paddleaudio/paddleaudio/io/__init__.py b/paddleaudio/paddleaudio/io/__init__.py index 3a9a01e85..cc2538f7f 100644 --- a/paddleaudio/paddleaudio/io/__init__.py +++ b/paddleaudio/paddleaudio/io/__init__.py @@ -1,6 +1,19 @@ -from .audio import save_wav +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .audio import depth_convert from .audio import load from .audio import normalize -from .audio import to_mono from .audio import resample -from .audio import depth_convert \ No newline at end of file +from .audio import save_wav +from .audio import to_mono diff --git a/paddleaudio/paddleaudio/kaldi/__init__.py b/paddleaudio/paddleaudio/kaldi/__init__.py index e69de29bb..97043fd7b 100644 --- a/paddleaudio/paddleaudio/kaldi/__init__.py +++ b/paddleaudio/paddleaudio/kaldi/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddleaudio/paddleaudio/sox_effects/__init__.py b/paddleaudio/paddleaudio/sox_effects/__init__.py index e69de29bb..97043fd7b 100644 --- a/paddleaudio/paddleaudio/sox_effects/__init__.py +++ b/paddleaudio/paddleaudio/sox_effects/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/paddleaudio/utils/__init__.py b/paddleaudio/paddleaudio/utils/__init__.py index 5fe0980b5..afb9cedd8 100644 --- a/paddleaudio/paddleaudio/utils/__init__.py +++ b/paddleaudio/paddleaudio/utils/__init__.py @@ -11,20 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from .env import USER_HOME -from .env import PPAUDIO_HOME -from .env import MODEL_HOME -from .env import DATA_HOME - from .download import decompress from .download import download_and_decompress from .download import load_state_dict_from_url - +from .env import DATA_HOME +from .env import MODEL_HOME +from .env import PPAUDIO_HOME +from .env import USER_HOME from .error import ParameterError - -from .log import logger from .log import Logger - -from .time import Timer +from .log import logger from .time import seconds_to_hms +from .time import Timer diff --git a/paddleaudio/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py index fd4785cd7..4658352f9 100644 --- a/paddleaudio/paddleaudio/utils/download.py +++ b/paddleaudio/paddleaudio/utils/download.py @@ -28,6 +28,7 @@ __all__ = [ 'load_state_dict_from_url', ] + def decompress(file: str): """ Extracts all files from a compressed file. diff --git a/paddleaudio/paddleaudio/utils/env.py b/paddleaudio/paddleaudio/utils/env.py index e202c3803..a2d14b89e 100644 --- a/paddleaudio/paddleaudio/utils/env.py +++ b/paddleaudio/paddleaudio/utils/env.py @@ -23,10 +23,11 @@ import os __all__ = [ 'USER_HOME', 'PPAUDIO_HOME', -'MODEL_HOME' , -'DATA_HOME' , + 'MODEL_HOME', + 'DATA_HOME', ] + def _get_user_home(): return os.path.expanduser('~') diff --git a/paddleaudio/paddleaudio/utils/time.py b/paddleaudio/paddleaudio/utils/time.py index 23af62fc7..105208f91 100644 --- a/paddleaudio/paddleaudio/utils/time.py +++ b/paddleaudio/paddleaudio/utils/time.py @@ -19,6 +19,7 @@ __all__ = [ 'seconds_to_hms', ] + class Timer(object): '''Calculate runing speed and estimated time of arrival(ETA)''' diff --git a/paddleaudio/tests/.gitkeep b/paddleaudio/tests/.gitkeep new file mode 100644 index 000000000..e69de29bb From 852d0ab92b41b4a8b85a2d134ddffc8dfd8b608a Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 25 Feb 2022 09:48:23 +0000 Subject: [PATCH 03/17] dtw metric for tts, test=doc --- paddleaudio/CHANGELOG.md | 1 
+ paddleaudio/paddleaudio/metric/__init__.py | 2 + paddleaudio/paddleaudio/metric/dtw.py | 42 +++++++++++++++++++ paddleaudio/paddleaudio/metric/mcd.py | 47 ++++++++++++++++++++++ paddleaudio/setup.py | 2 + 5 files changed, 94 insertions(+) create mode 100644 paddleaudio/paddleaudio/metric/dtw.py create mode 100644 paddleaudio/paddleaudio/metric/mcd.py diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md index 52d44dd39..91b0fef08 100644 --- a/paddleaudio/CHANGELOG.md +++ b/paddleaudio/CHANGELOG.md @@ -2,3 +2,4 @@ Date: 2022-2-25, Author: Hui Zhang. - Refactor architecture. + - dtw distance and mcd style dtw diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py index 97043fd7b..a96530ff6 100644 --- a/paddleaudio/paddleaudio/metric/__init__.py +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .dtw import dtw_distance +from .mcd import mcd_distance diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py new file mode 100644 index 000000000..d27f56e28 --- /dev/null +++ b/paddleaudio/paddleaudio/metric/dtw.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import numpy as np +from dtaidistance import dtw_ndim + +__all__ = [ + 'dtw_distance', +] + + +def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float: + """dtw distance + + Dynamic Time Warping. + This function keeps a compact matrix, not the full warping paths matrix. + Uses dynamic programming to compute: + + wps[i, j] = (s1[i]-s2[j])**2 + min( + wps[i-1, j ] + penalty, // vertical / insertion / expansion + wps[i , j-1] + penalty, // horizontal / deletion / compression + wps[i-1, j-1]) // diagonal / match + dtw = sqrt(wps[-1, -1]) + + Args: + xs (np.ndarray): ref sequence, [T,D] + ys (np.ndarray): hyp sequence, [T,D] + + Returns: + float: dtw distance + """ + return dtw_ndim.distance(xs, ys) diff --git a/paddleaudio/paddleaudio/metric/mcd.py b/paddleaudio/paddleaudio/metric/mcd.py new file mode 100644 index 000000000..281e57653 --- /dev/null +++ b/paddleaudio/paddleaudio/metric/mcd.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import mcd.metrics_fast as mt +from mcd import dtw + +__all__ = [ + 'mcd_distance', +] + + +def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist): + """Mel cepstral distortion (MCD), dtw distance. + + Dynamic Time Warping. 
+ Uses dynamic programming to compute: + wps[i, j] = cost_fn(xs[i], ys[j]) + min( + wps[i-1, j ], // vertical / insertion / expansion + wps[i , j-1], // horizontal / deletion / compression + wps[i-1, j-1]) // diagonal / match + dtw = sqrt(wps[-1, -1]) + + Cost Function: + logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0) + def logSpecDbDist(x, y): + diff = x - y + return logSpecDbConst * math.sqrt(np.inner(diff, diff)) + + Args: + xs (np.ndarray): ref sequence, [T,D] + ys (np.ndarray): hyp sequence, [T,D] + + Returns: + float: dtw distance + """ + min_cost, path = dtw.dtw(xs, ys, cost_fn) + return min_cost diff --git a/paddleaudio/setup.py b/paddleaudio/setup.py index 98bf8a6f4..7623443a6 100644 --- a/paddleaudio/setup.py +++ b/paddleaudio/setup.py @@ -59,6 +59,8 @@ setuptools.setup( 'resampy >= 0.2.2', 'soundfile >= 0.9.0', 'colorlog', + 'dtaidistance >= 2.3.6', + 'mcd >= 0.4', ], ) remove_version_py() From c52f0f805bc92800b61b9594d873778f79304a9a Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 2 Mar 2022 12:09:56 +0800 Subject: [PATCH 04/17] refactor --- paddleaudio/paddleaudio/__init__.py | 2 + paddleaudio/paddleaudio/backends/__init__.py | 6 + .../paddleaudio/backends/soundfile_backend.py | 252 ++++++ .../{kaldi => compliance}/__init__.py | 0 paddleaudio/paddleaudio/compliance/kaldi.py | 688 ++++++++++++++++ paddleaudio/paddleaudio/compliance/librosa.py | 728 ++++++++++++++++ .../features/{librosa.py => layers.py} | 241 +----- .../paddleaudio/functional/__init__.py | 7 + .../paddleaudio/functional/functional.py | 776 ++++-------------- paddleaudio/paddleaudio/io/__init__.py | 8 +- paddleaudio/paddleaudio/io/audio.py | 303 ------- 11 files changed, 1870 insertions(+), 1141 deletions(-) rename paddleaudio/paddleaudio/{kaldi => compliance}/__init__.py (100%) create mode 100644 paddleaudio/paddleaudio/compliance/kaldi.py create mode 100644 paddleaudio/paddleaudio/compliance/librosa.py rename paddleaudio/paddleaudio/features/{librosa.py => 
layers.py} (59%) delete mode 100644 paddleaudio/paddleaudio/io/audio.py diff --git a/paddleaudio/paddleaudio/__init__.py b/paddleaudio/paddleaudio/__init__.py index 185a92b8d..2dab610cf 100644 --- a/paddleaudio/paddleaudio/__init__.py +++ b/paddleaudio/paddleaudio/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .backends import load +from .backends import save diff --git a/paddleaudio/paddleaudio/backends/__init__.py b/paddleaudio/paddleaudio/backends/__init__.py index 185a92b8d..8eae07e82 100644 --- a/paddleaudio/paddleaudio/backends/__init__.py +++ b/paddleaudio/paddleaudio/backends/__init__.py @@ -11,3 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .soundfile_backend import depth_convert +from .soundfile_backend import load +from .soundfile_backend import normalize +from .soundfile_backend import resample +from .soundfile_backend import save +from .soundfile_backend import to_mono diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py index 97043fd7b..2b920284a 100644 --- a/paddleaudio/paddleaudio/backends/soundfile_backend.py +++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py @@ -11,3 +11,255 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings +from typing import Optional +from typing import Tuple +from typing import Union + +import numpy as np +import resampy +import soundfile as sf +from numpy import ndarray as array +from scipy.io import wavfile + +from ..utils import ParameterError + +__all__ = [ + 'resample', + 'to_mono', + 'depth_convert', + 'normalize', + 'save', + 'load', +] +NORMALMIZE_TYPES = ['linear', 'gaussian'] +MERGE_TYPES = ['ch0', 'ch1', 'random', 'average'] +RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] +EPS = 1e-8 + + +def resample(y: array, src_sr: int, target_sr: int, + mode: str='kaiser_fast') -> array: + """ Audio resampling + This function is the same as using resampy.resample(). + Notes: + The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' + """ + + if mode == 'kaiser_best': + warnings.warn( + f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \ + we recommend the mode kaiser_fast in large scale audio trainning') + + if not isinstance(y, np.ndarray): + raise ParameterError( + 'Only support numpy array, but received y in {type(y)}') + + if mode not in RESAMPLE_MODES: + raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') + + return resampy.resample(y, src_sr, target_sr, filter=mode) + + +def to_mono(y: array, merge_type: str='average') -> array: + """ convert sterior audio to mono + """ + if merge_type not in MERGE_TYPES: + raise ParameterError( + f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' + ) + if y.ndim > 2: + raise ParameterError( + f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}') + if y.ndim == 1: # nothing to merge + return y + + if merge_type == 'ch0': + return y[0] + if merge_type == 'ch1': + return y[1] + if merge_type == 'random': + return y[np.random.randint(0, 2)] + + # need to do averaging according to dtype + + if y.dtype == 'float32': + y_out = (y[0] + y[1]) * 0.5 + elif y.dtype == 'int16': + y_out = y.astype('int32') + 
y_out = (y_out[0] + y_out[1]) // 2 + y_out = np.clip(y_out, np.iinfo(y.dtype).min, + np.iinfo(y.dtype).max).astype(y.dtype) + + elif y.dtype == 'int8': + y_out = y.astype('int16') + y_out = (y_out[0] + y_out[1]) // 2 + y_out = np.clip(y_out, np.iinfo(y.dtype).min, + np.iinfo(y.dtype).max).astype(y.dtype) + else: + raise ParameterError(f'Unsupported dtype: {y.dtype}') + return y_out + + +def _safe_cast(y: array, dtype: Union[type, str]) -> array: + """ data type casting in a safe way, i.e., prevent overflow or underflow + This function is used internally. + """ + return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) + + +def depth_convert(y: array, dtype: Union[type, str], + dithering: bool=True) -> array: + """Convert audio array to target dtype safely + This function convert audio waveform to a target dtype, with addition steps of + preventing overflow/underflow and preserving audio range. + """ + + SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] + if y.dtype not in SUPPORT_DTYPE: + raise ParameterError( + 'Unsupported audio dtype, ' + f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}') + + if dtype not in SUPPORT_DTYPE: + raise ParameterError( + 'Unsupported audio dtype, ' + f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}') + + if dtype == y.dtype: + return y + + if dtype == 'float64' and y.dtype == 'float32': + return _safe_cast(y, dtype) + if dtype == 'float32' and y.dtype == 'float64': + return _safe_cast(y, dtype) + + if dtype == 'int16' or dtype == 'int8': + if y.dtype in ['float64', 'float32']: + factor = np.iinfo(dtype).max + y = np.clip(y * factor, np.iinfo(dtype).min, + np.iinfo(dtype).max).astype(dtype) + y = y.astype(dtype) + else: + if dtype == 'int16' and y.dtype == 'int8': + factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS + y = y.astype('float32') * factor + y = y.astype('int16') + + else: # dtype == 'int8' and y.dtype=='int16': + y = y.astype('int32') * np.iinfo('int8').max / \ + 
np.iinfo('int16').max + y = y.astype('int8') + + if dtype in ['float32', 'float64']: + org_dtype = y.dtype + y = y.astype(dtype) / np.iinfo(org_dtype).max + return y + + +def sound_file_load(file: str, + offset: Optional[float]=None, + dtype: str='int16', + duration: Optional[int]=None) -> Tuple[array, int]: + """Load audio using soundfile library + This function load audio file using libsndfile. + Reference: + http://www.mega-nerd.com/libsndfile/#Features + """ + with sf.SoundFile(file) as sf_desc: + sr_native = sf_desc.samplerate + if offset: + sf_desc.seek(int(offset * sr_native)) + if duration is not None: + frame_duration = int(duration * sr_native) + else: + frame_duration = -1 + y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T + + return y, sf_desc.samplerate + + +def normalize(y: array, norm_type: str='linear', + mul_factor: float=1.0) -> array: + """ normalize an input audio with additional multiplier. + """ + + if norm_type == 'linear': + amax = np.max(np.abs(y)) + factor = 1.0 / (amax + EPS) + y = y * factor * mul_factor + elif norm_type == 'gaussian': + amean = np.mean(y) + astd = np.std(y) + astd = max(astd, EPS) + y = mul_factor * (y - amean) / astd + else: + raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}') + + return y + + +def save(y: array, sr: int, file: str) -> None: + """Save audio file to disk. + This function saves audio to disk using scipy.io.wavfile, with additional step + to convert input waveform to int16 unless it already is int16 + Notes: + It only support raw wav format. 
+ """ + if not file.endswith('.wav'): + raise ParameterError( + f'only .wav file supported, but dst file name is: {file}') + + if sr <= 0: + raise ParameterError( + f'Sample rate should be larger than 0, recieved sr = {sr}') + + if y.dtype not in ['int16', 'int8']: + warnings.warn( + f'input data type is {y.dtype}, will convert data to int16 format before saving' + ) + y_out = depth_convert(y, 'int16') + else: + y_out = y + + wavfile.write(file, sr, y_out) + + +def load( + file: str, + sr: Optional[int]=None, + mono: bool=True, + merge_type: str='average', # ch0,ch1,random,average + normal: bool=True, + norm_type: str='linear', + norm_mul_factor: float=1.0, + offset: float=0.0, + duration: Optional[int]=None, + dtype: str='float32', + resample_mode: str='kaiser_fast') -> Tuple[array, int]: + """Load audio file from disk. + This function loads audio from disk using using audio beackend. + Parameters: + Notes: + """ + + y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) + + if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)): + raise ParameterError(f'audio file {file} looks empty') + + if mono: + y = to_mono(y, merge_type) + + if sr is not None and sr != r: + y = resample(y, r, sr, mode=resample_mode) + r = sr + + if normal: + y = normalize(y, norm_type, norm_mul_factor) + elif dtype in ['int8', 'int16']: + # still need to do normalization, before depth convertion + y = normalize(y, 'linear', 1.0) + + y = depth_convert(y, dtype) + return y, r diff --git a/paddleaudio/paddleaudio/kaldi/__init__.py b/paddleaudio/paddleaudio/compliance/__init__.py similarity index 100% rename from paddleaudio/paddleaudio/kaldi/__init__.py rename to paddleaudio/paddleaudio/compliance/__init__.py diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py new file mode 100644 index 000000000..61ca4e3db --- /dev/null +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -0,0 +1,688 @@ +# Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Tuple + +import paddle +from paddle import Tensor + +from ..functional.window import get_window +from .spectrum import create_dct + +__all__ = [ + 'spectrogram', + 'fbank', + 'mfcc', +] + +# window types +HANNING = 'hann' +HAMMING = 'hamming' +POVEY = 'povey' +RECTANGULAR = 'rect' +BLACKMAN = 'blackman' + + +def _get_epsilon(dtype): + return paddle.to_tensor(1e-07, dtype=dtype) + + +def _next_power_of_2(x: int) -> int: + return 1 if x == 0 else 2**(x - 1).bit_length() + + +def _get_strided(waveform: Tensor, + window_size: int, + window_shift: int, + snip_edges: bool) -> Tensor: + assert waveform.dim() == 1 + num_samples = waveform.shape[0] + + if snip_edges: + if num_samples < window_size: + return paddle.empty((0, 0), dtype=waveform.dtype) + else: + m = 1 + (num_samples - window_size) // window_shift + else: + reversed_waveform = paddle.flip(waveform, [0]) + m = (num_samples + (window_shift // 2)) // window_shift + pad = window_size // 2 - window_shift // 2 + pad_right = reversed_waveform + if pad > 0: + pad_left = reversed_waveform[-pad:] + waveform = paddle.concat((pad_left, waveform, pad_right), axis=0) + else: + waveform = paddle.concat((waveform[-pad:], pad_right), axis=0) + + return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T + + +def _feature_window_function( + window_type: str, + window_size: int, + 
blackman_coeff: float, + dtype: int, ) -> Tensor: + if window_type == HANNING: + return get_window('hann', window_size, fftbins=False, dtype=dtype) + elif window_type == HAMMING: + return get_window('hamming', window_size, fftbins=False, dtype=dtype) + elif window_type == POVEY: + return get_window( + 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) + elif window_type == RECTANGULAR: + return paddle.ones([window_size], dtype=dtype) + elif window_type == BLACKMAN: + a = 2 * math.pi / (window_size - 1) + window_function = paddle.arange(window_size, dtype=dtype) + return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + + (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function) + ).astype(dtype) + else: + raise Exception('Invalid window type ' + window_type) + + +def _get_log_energy(strided_input: Tensor, epsilon: Tensor, + energy_floor: float) -> Tensor: + log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log() + if energy_floor == 0.0: + return log_energy + return paddle.maximum( + log_energy, + paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype)) + + +def _get_waveform_and_window_properties( + waveform: Tensor, + channel: int, + sample_frequency: float, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]: + channel = max(channel, 0) + assert channel < waveform.shape[0], ( + 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) + waveform = waveform[channel, :] # size (n) + window_shift = int( + sample_frequency * frame_shift * + 0.001) # pass frame_shift and frame_length in milliseconds + window_size = int(sample_frequency * frame_length * 0.001) + padded_window_size = _next_power_of_2( + window_size) if round_to_power_of_two else window_size + + assert 2 <= window_size <= len(waveform), ( + 'choose a window size {} that is [2, {}]'.format(window_size, + len(waveform))) + assert 0 < window_shift, 
'`window_shift` must be greater than 0' + assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \ + ' use `round_to_power_of_two` or change `frame_length`' + assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' + assert sample_frequency > 0, '`sample_frequency` must be greater than zero' + return waveform, window_shift, window_size, padded_window_size + + +def _get_window(waveform: Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]: + dtype = waveform.dtype + epsilon = _get_epsilon(dtype) + + # size (m, window_size) + strided_input = _get_strided(waveform, window_size, window_shift, + snip_edges) + + if dither != 0.0: + # Returns a random number strictly between 0 and 1 + x = paddle.maximum(epsilon, + paddle.rand(strided_input.shape, dtype=dtype)) + rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) + strided_input = strided_input + rand_gauss * dither + + if remove_dc_offset: + # Subtract each row/frame by its mean + row_means = paddle.mean( + strided_input, axis=1).unsqueeze(1) # size (m, 1) + strided_input = strided_input - row_means + + if raw_energy: + # Compute the log energy of each row/frame before applying preemphasis and + # window function + signal_log_energy = _get_log_energy(strided_input, epsilon, + energy_floor) # size (m) + + if preemphasis_coefficient != 0.0: + # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j + offset_strided_input = paddle.nn.functional.pad( + strided_input.unsqueeze(0), (1, 0), + data_format='NCL', + mode='replicate').squeeze(0) # size (m, window_size + 1) + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : + -1] + + # Apply 
window_function to each row/frame + window_function = _feature_window_function( + window_type, window_size, blackman_coeff, + dtype).unsqueeze(0) # size (1, window_size) + strided_input = strided_input * window_function # size (m, window_size) + + # Pad columns with zero until we reach size (m, padded_window_size) + if padded_window_size != window_size: + padding_right = padded_window_size - window_size + strided_input = paddle.nn.functional.pad( + strided_input.unsqueeze(0), (0, padding_right), + data_format='NCL', + mode='constant', + value=0).squeeze(0) + + # Compute energy after window function (not the raw one) + if not raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, + energy_floor) # size (m) + + return strided_input, signal_log_energy + + +def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: + # subtracts the column mean of the tensor size (m, n) if subtract_mean=True + # it returns size (m, n) + if subtract_mean: + col_means = paddle.mean(tensor, axis=0).unsqueeze(0) + tensor = tensor - col_means + return tensor + + +def spectrogram(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + min_duration: float=0.0, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sample_frequency: float=16000.0, + snip_edges: bool=True, + subtract_mean: bool=False, + window_type: str=POVEY) -> Tensor: + """[summary] + + Args: + waveform (Tensor): [description] + blackman_coeff (float, optional): [description]. Defaults to 0.42. + channel (int, optional): [description]. Defaults to -1. + dither (float, optional): [description]. Defaults to 0.0. + energy_floor (float, optional): [description]. Defaults to 1.0. + frame_length (float, optional): [description]. Defaults to 25.0. + frame_shift (float, optional): [description]. 
Defaults to 10.0. + min_duration (float, optional): [description]. Defaults to 0.0. + preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. + raw_energy (bool, optional): [description]. Defaults to True. + remove_dc_offset (bool, optional): [description]. Defaults to True. + round_to_power_of_two (bool, optional): [description]. Defaults to True. + sample_frequency (float, optional): [description]. Defaults to 16000.0. + snip_edges (bool, optional): [description]. Defaults to True. + subtract_mean (bool, optional): [description]. Defaults to False. + window_type (str, optional): [description]. Defaults to POVEY. + + Returns: + Tensor: [description] + """ + dtype = waveform.dtype + epsilon = _get_epsilon(dtype) + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sample_frequency, frame_shift, frame_length, + round_to_power_of_two, preemphasis_coefficient) + + if len(waveform) < min_duration * sample_frequency: + # signal is too short + return paddle.empty([0]) + + strided_input, signal_log_energy = _get_window( + waveform, padded_window_size, window_size, window_shift, window_type, + blackman_coeff, snip_edges, raw_energy, energy_floor, dither, + remove_dc_offset, preemphasis_coefficient) + + # size (m, padded_window_size // 2 + 1, 2) + fft = paddle.fft.rfft(strided_input) + + # Convert the FFT into a power spectrum + power_spectrum = paddle.maximum( + fft.abs().pow(2.), + epsilon).log() # size (m, padded_window_size // 2 + 1) + power_spectrum[:, 0] = signal_log_energy + + power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) + return power_spectrum + + +def _inverse_mel_scale_scalar(mel_freq: float) -> float: + return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) + + +def _inverse_mel_scale(mel_freq: Tensor) -> Tensor: + return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) + + +def _mel_scale_scalar(freq: float) -> float: + return 1127.0 * math.log(1.0 + freq / 
700.0) + + +def _mel_scale(freq: Tensor) -> Tensor: + return 1127.0 * (1.0 + freq / 700.0).log() + + +def _vtln_warp_freq(vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: Tensor) -> Tensor: + assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq' + assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]' + l = vtln_low_cutoff * max(1.0, vtln_warp_factor) + h = vtln_high_cutoff * min(1.0, vtln_warp_factor) + scale = 1.0 / vtln_warp_factor + Fl = scale * l # F(l) + Fh = scale * h # F(h) + assert l > low_freq and h < high_freq + # slope of left part of the 3-piece linear function + scale_left = (Fl - low_freq) / (l - low_freq) + # [slope of center part is just "scale"] + + # slope of right part of the 3-piece linear function + scale_right = (high_freq - Fh) / (high_freq - h) + + res = paddle.empty_like(freq) + + outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \ + | paddle.greater_than(freq, paddle.to_tensor(high_freq)) # freq < low_freq || freq > high_freq + before_l = paddle.less_than(freq, paddle.to_tensor(l)) # freq < l + before_h = paddle.less_than(freq, paddle.to_tensor(h)) # freq < h + after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) # freq >= h + + # order of operations matter here (since there is overlapping frequency regions) + res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) + res[before_h] = scale * freq[before_h] + res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) + res[outside_low_high_freq] = freq[outside_low_high_freq] + + return res + + +def _vtln_warp_mel_freq(vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: Tensor) -> Tensor: + return _mel_scale( + _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, + vtln_warp_factor, 
_inverse_mel_scale(mel_freq))) + + +def _get_mel_banks(num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float) -> Tuple[Tensor, Tensor]: + assert num_bins > 3, 'Must have at least 3 mel bins' + assert window_length_padded % 2 == 0 + num_fft_bins = window_length_padded / 2 + nyquist = 0.5 * sample_freq + + if high_freq <= 0.0: + high_freq += nyquist + + assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ + ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) + + # fft-bin width [think of it as Nyquist-freq / half-window-length] + fft_bin_width = sample_freq / window_length_padded + mel_low_freq = _mel_scale_scalar(low_freq) + mel_high_freq = _mel_scale_scalar(high_freq) + + # divide by num_bins+1 in next line because of end-effects where the bins + # spread out to the sides. + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + if vtln_high < 0.0: + vtln_high += nyquist + + assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and + (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \ + ('Bad values in options: vtln-low {} and vtln-high {}, versus ' + 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) + + bin = paddle.arange(num_bins).unsqueeze(1) + left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0 + ) * mel_freq_delta # size(num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1) + + if vtln_warp_factor != 1.0: + left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel) + center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, + center_mel) + right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + 
high_freq, vtln_warp_factor, right_mel) + + center_freqs = _inverse_mel_scale(center_mel) # size (num_bins) + # size(1, num_fft_bins) + mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) + + # size (num_bins, num_fft_bins) + up_slope = (mel - left_mel) / (center_mel - left_mel) + down_slope = (right_mel - mel) / (right_mel - center_mel) + + if vtln_warp_factor == 1.0: + # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values + bins = paddle.maximum( + paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) + else: + # warping can move the order of left_mel, center_mel, right_mel anywhere + bins = paddle.zeros_like(up_slope) + up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( + mel, center_mel) # left_mel < mel <= center_mel + down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( + mel, right_mel) # center_mel < mel < right_mel + bins[up_idx] = up_slope[up_idx] + bins[down_idx] = down_slope[down_idx] + + return bins, center_freqs + + +def fbank(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + min_duration: float=0.0, + num_mel_bins: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sample_frequency: float=16000.0, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + use_log_fbank: bool=True, + use_power: bool=True, + vtln_high: float=-500.0, + vtln_low: float=100.0, + vtln_warp: float=1.0, + window_type: str=POVEY) -> Tensor: + """[summary] + + Args: + waveform (Tensor): [description] + blackman_coeff (float, optional): [description]. Defaults to 0.42. + channel (int, optional): [description]. Defaults to -1. + dither (float, optional): [description]. Defaults to 0.0. 
+ energy_floor (float, optional): [description]. Defaults to 1.0. + frame_length (float, optional): [description]. Defaults to 25.0. + frame_shift (float, optional): [description]. Defaults to 10.0. + high_freq (float, optional): [description]. Defaults to 0.0. + htk_compat (bool, optional): [description]. Defaults to False. + low_freq (float, optional): [description]. Defaults to 20.0. + min_duration (float, optional): [description]. Defaults to 0.0. + num_mel_bins (int, optional): [description]. Defaults to 23. + preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. + raw_energy (bool, optional): [description]. Defaults to True. + remove_dc_offset (bool, optional): [description]. Defaults to True. + round_to_power_of_two (bool, optional): [description]. Defaults to True. + sample_frequency (float, optional): [description]. Defaults to 16000.0. + snip_edges (bool, optional): [description]. Defaults to True. + subtract_mean (bool, optional): [description]. Defaults to False. + use_energy (bool, optional): [description]. Defaults to False. + use_log_fbank (bool, optional): [description]. Defaults to True. + use_power (bool, optional): [description]. Defaults to True. + vtln_high (float, optional): [description]. Defaults to -500.0. + vtln_low (float, optional): [description]. Defaults to 100.0. + vtln_warp (float, optional): [description]. Defaults to 1.0. + window_type (str, optional): [description]. Defaults to POVEY. 
+ + Returns: + Tensor: [description] + """ + dtype = waveform.dtype + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sample_frequency, frame_shift, frame_length, + round_to_power_of_two, preemphasis_coefficient) + + if len(waveform) < min_duration * sample_frequency: + # signal is too short + return paddle.empty([0], dtype=dtype) + + # strided_input, size (m, padded_window_size) and signal_log_energy, size (m) + strided_input, signal_log_energy = _get_window( + waveform, padded_window_size, window_size, window_shift, window_type, + blackman_coeff, snip_edges, raw_energy, energy_floor, dither, + remove_dc_offset, preemphasis_coefficient) + + # size (m, padded_window_size // 2 + 1) + spectrum = paddle.fft.rfft(strided_input).abs() + if use_power: + spectrum = spectrum.pow(2.) + + # size (num_mel_bins, padded_window_size // 2) + mel_energies, _ = _get_mel_banks(num_mel_bins, padded_window_size, + sample_frequency, low_freq, high_freq, + vtln_low, vtln_high, vtln_warp) + mel_energies = mel_energies.astype(dtype) + + # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1) + mel_energies = paddle.nn.functional.pad( + mel_energies.unsqueeze(0), (0, 1), + data_format='NCL', + mode='constant', + value=0).squeeze(0) + + # sum with mel fiterbanks over the power spectrum, size (m, num_mel_bins) + mel_energies = paddle.mm(spectrum, mel_energies.T) + if use_log_fbank: + # avoid log of zero (which should be prevented anyway by dithering) + mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() + + # if use_energy then add it as the last column for htk_compat == true else first column + if use_energy: + signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1) + # returns size (m, num_mel_bins + 1) + if htk_compat: + mel_energies = paddle.concat( + (mel_energies, signal_log_energy), axis=1) + else: + mel_energies = paddle.concat( + 
(signal_log_energy, mel_energies), axis=1) + + mel_energies = _subtract_column_mean(mel_energies, subtract_mean) + return mel_energies + + +def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor: + # returns a dct matrix of size (num_mel_bins, num_ceps) + # size (num_mel_bins, num_mel_bins) + dct_matrix = create_dct(num_mel_bins, num_mel_bins, 'ortho') + # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins) + # this would be the first column in the dct_matrix for torchaudio as it expects a + # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi + # expects a left multiply e.g. dct_matrix * vector). + dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins)) + dct_matrix = dct_matrix[:, :num_ceps] + return dct_matrix + + +def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor: + # returns size (num_ceps) + # Compute liftering coefficients (scaling on cepstral coeffs) + # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected. 
+ i = paddle.arange(num_ceps) + return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / + cepstral_lifter) + + +def mfcc(waveform: Tensor, + blackman_coeff: float=0.42, + cepstral_lifter: float=22.0, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + num_ceps: int=13, + min_duration: float=0.0, + num_mel_bins: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sample_frequency: float=16000.0, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + vtln_high: float=-500.0, + vtln_low: float=100.0, + vtln_warp: float=1.0, + window_type: str=POVEY) -> Tensor: + """[summary] + + Args: + waveform (Tensor): [description] + blackman_coeff (float, optional): [description]. Defaults to 0.42. + cepstral_lifter (float, optional): [description]. Defaults to 22.0. + channel (int, optional): [description]. Defaults to -1. + dither (float, optional): [description]. Defaults to 0.0. + energy_floor (float, optional): [description]. Defaults to 1.0. + frame_length (float, optional): [description]. Defaults to 25.0. + frame_shift (float, optional): [description]. Defaults to 10.0. + high_freq (float, optional): [description]. Defaults to 0.0. + htk_compat (bool, optional): [description]. Defaults to False. + low_freq (float, optional): [description]. Defaults to 20.0. + num_ceps (int, optional): [description]. Defaults to 13. + min_duration (float, optional): [description]. Defaults to 0.0. + num_mel_bins (int, optional): [description]. Defaults to 23. + preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. + raw_energy (bool, optional): [description]. Defaults to True. + remove_dc_offset (bool, optional): [description]. Defaults to True. + round_to_power_of_two (bool, optional): [description]. 
Defaults to True. + sample_frequency (float, optional): [description]. Defaults to 16000.0. + snip_edges (bool, optional): [description]. Defaults to True. + subtract_mean (bool, optional): [description]. Defaults to False. + use_energy (bool, optional): [description]. Defaults to False. + vtln_high (float, optional): [description]. Defaults to -500.0. + vtln_low (float, optional): [description]. Defaults to 100.0. + vtln_warp (float, optional): [description]. Defaults to 1.0. + window_type (str, optional): [description]. Defaults to POVEY. + + Returns: + Tensor: [description] + """ + assert num_ceps <= num_mel_bins, 'num_ceps cannot be larger than num_mel_bins: %d vs %d' % ( + num_ceps, num_mel_bins) + + dtype = waveform.dtype + + # The mel_energies should not be squared (use_power=True), not have mean subtracted + # (subtract_mean=False), and use log (use_log_fbank=True). + # size (m, num_mel_bins + use_energy) + feature = fbank( + waveform=waveform, + blackman_coeff=blackman_coeff, + channel=channel, + dither=dither, + energy_floor=energy_floor, + frame_length=frame_length, + frame_shift=frame_shift, + high_freq=high_freq, + htk_compat=htk_compat, + low_freq=low_freq, + min_duration=min_duration, + num_mel_bins=num_mel_bins, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + round_to_power_of_two=round_to_power_of_two, + sample_frequency=sample_frequency, + snip_edges=snip_edges, + subtract_mean=False, + use_energy=use_energy, + use_log_fbank=True, + use_power=True, + vtln_high=vtln_high, + vtln_low=vtln_low, + vtln_warp=vtln_warp, + window_type=window_type) + + if use_energy: + # size (m) + signal_log_energy = feature[:, num_mel_bins if htk_compat else 0] + # offset is 0 if htk_compat==True else 1 + mel_offset = int(not htk_compat) + feature = feature[:, mel_offset:(num_mel_bins + mel_offset)] + + # size (num_mel_bins, num_ceps) + dct_matrix = _get_dct_matrix(num_ceps, 
num_mel_bins).astype(dtype=dtype) + + # size (m, num_ceps) + feature = feature.matmul(dct_matrix) + + if cepstral_lifter != 0.0: + # size (1, num_ceps) + lifter_coeffs = _get_lifter_coeffs(num_ceps, + cepstral_lifter).unsqueeze(0) + feature *= lifter_coeffs.astype(dtype=dtype) + + # if use_energy then replace the last column for htk_compat == true else first column + if use_energy: + feature[:, 0] = signal_log_energy + + if htk_compat: + energy = feature[:, 0].unsqueeze(1) # size (m, 1) + feature = feature[:, 1:] # size (m, num_ceps - 1) + if not use_energy: + # scale on C0 (actually removing a scale we previously added that's + # part of one common definition of the cosine transform.) + energy *= math.sqrt(2) + + feature = paddle.concat((feature, energy), axis=1) + + feature = _subtract_column_mean(feature, subtract_mean) + return feature diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py new file mode 100644 index 000000000..167795c37 --- /dev/null +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -0,0 +1,728 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from librosa(https://github.com/librosa/librosa) +import warnings +from typing import List +from typing import Optional +from typing import Union + +import numpy as np +import scipy +from numpy import ndarray as array +from numpy.lib.stride_tricks import as_strided +from scipy import signal + +from ..backends import depth_convert +from ..utils import ParameterError + +__all__ = [ + # dsp + 'stft', + 'mfcc', + 'hz_to_mel', + 'mel_to_hz', + 'split_frames', + 'mel_frequencies', + 'power_to_db', + 'compute_fbank_matrix', + 'melspectrogram', + 'spectrogram', + 'mu_encode', + 'mu_decode', + # augmentation + 'depth_augment', + 'spect_augment', + 'random_crop1d', + 'random_crop2d', + 'adaptive_spect_augment', +] + + +def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: + """Pad an array to a target length along a target axis. + + This differs from `np.pad` by centering the data prior to padding, + analogous to `str.center` + """ + + kwargs.setdefault("mode", "constant") + n = data.shape[axis] + lpad = int((size - n) // 2) + lengths = [(0, 0)] * data.ndim + lengths[axis] = (lpad, int(size - n - lpad)) + + if lpad < 0: + raise ParameterError(("Target size ({size:d}) must be " + "at least input size ({n:d})")) + + return np.pad(data, lengths, **kwargs) + + +def split_frames(x: array, frame_length: int, hop_length: int, + axis: int=-1) -> array: + """Slice a data array into (overlapping) frames. + + This function is aligned with librosa.frame + """ + + if not isinstance(x, np.ndarray): + raise ParameterError( + f"Input must be of type numpy.ndarray, given type(x)={type(x)}") + + if x.shape[axis] < frame_length: + raise ParameterError(f"Input is too short (n={x.shape[axis]:d})" + f" for frame_length={frame_length:d}") + + if hop_length < 1: + raise ParameterError(f"Invalid hop_length: {hop_length:d}") + + if axis == -1 and not x.flags["F_CONTIGUOUS"]: + warnings.warn(f"librosa.util.frame called with axis={axis} " + "on a non-contiguous input. 
This will result in a copy.") + x = np.asfortranarray(x) + elif axis == 0 and not x.flags["C_CONTIGUOUS"]: + warnings.warn(f"librosa.util.frame called with axis={axis} " + "on a non-contiguous input. This will result in a copy.") + x = np.ascontiguousarray(x) + + n_frames = 1 + (x.shape[axis] - frame_length) // hop_length + strides = np.asarray(x.strides) + + new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize + + if axis == -1: + shape = list(x.shape)[:-1] + [frame_length, n_frames] + strides = list(strides) + [hop_length * new_stride] + + elif axis == 0: + shape = [n_frames, frame_length] + list(x.shape)[1:] + strides = [hop_length * new_stride] + list(strides) + + else: + raise ParameterError(f"Frame axis={axis} must be either 0 or -1") + + return as_strided(x, shape=shape, strides=strides) + + +def _check_audio(y, mono=True) -> bool: + """Determine whether a variable contains valid audio data. + + The audio y must be a np.ndarray, ether 1-channel or two channel + """ + if not isinstance(y, np.ndarray): + raise ParameterError("Audio data must be of type numpy.ndarray") + if y.ndim > 2: + raise ParameterError( + f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}") + + if mono and y.ndim == 2: + raise ParameterError( + f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}") + + if (mono and len(y) == 0) or (not mono and y.shape[1] < 0): + raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}") + + if not np.issubdtype(y.dtype, np.floating): + raise ParameterError("Audio data must be floating-point") + + if not np.isfinite(y).all(): + raise ParameterError("Audio buffer is not finite everywhere") + + return True + + +def hz_to_mel(frequencies: Union[float, List[float], array], + htk: bool=False) -> array: + """Convert Hz to Mels + + This function is aligned with librosa. 
+ """ + freq = np.asanyarray(frequencies) + + if htk: + return 2595.0 * np.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region + + if freq.ndim: + # If we have array data, vectorize + log_t = freq >= min_log_hz + mels[log_t] = min_log_mel + \ + np.log(freq[log_t] / min_log_hz) / logstep + elif freq >= min_log_hz: + # If we have scalar data, heck directly + mels = min_log_mel + np.log(freq / min_log_hz) / logstep + + return mels + + +def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: + """Convert mel bin numbers to frequencies. + + This function is aligned with librosa. + """ + mel_array = np.asanyarray(mels) + + if htk: + return 700.0 * (10.0**(mel_array / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel_array + + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region + + if mel_array.ndim: + # If we have vector data, vectorize + log_t = mel_array >= min_log_mel + freqs[log_t] = min_log_hz * \ + np.exp(logstep * (mel_array[log_t] - min_log_mel)) + elif mel_array >= min_log_mel: + # If we have scalar data, check directly + freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=128, + fmin: float=0.0, + fmax: float=11025.0, + htk: bool=False) -> array: + """Compute mel frequencies + + This function is aligned with librosa. 
+ """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(fmin, htk=htk) + max_mel = hz_to_mel(fmax, htk=htk) + + mels = np.linspace(min_mel, max_mel, n_mels) + + return mel_to_hz(mels, htk=htk) + + +def fft_frequencies(sr: int, n_fft: int) -> array: + """Compute fourier frequencies. + + This function is aligned with librosa. + """ + return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=128, + fmin: float=0.0, + fmax: Optional[float]=None, + htk: bool=False, + norm: str="slaney", + dtype: type=np.float32): + """Compute fbank matrix. + + This funciton is aligned with librosa. + """ + if norm != "slaney": + raise ParameterError('norm must set to slaney') + + if fmax is None: + fmax = float(sr) / 2 + + # Initialize the weights + n_mels = int(n_mels) + weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) + + fdiff = np.diff(mel_f) + ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = np.maximum(0, np.minimum(lower, upper)) + + if norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm[:, np.newaxis] + + # Only check weights if f_mel[0] is positive + if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): + # This means we have an empty channel somewhere + warnings.warn("Empty filters detected in mel frequency basis. " + "Some channels will produce empty responses. 
" + "Try increasing your sampling rate (and fmax) or " + "reducing n_mels.") + + return weights + + +def stft(x: array, + n_fft: int=2048, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str="hann", + center: bool=True, + dtype: type=np.complex64, + pad_mode: str="reflect") -> array: + """Short-time Fourier transform (STFT). + + This function is aligned with librosa. + """ + _check_audio(x) + + # By default, use the entire frame + if win_length is None: + win_length = n_fft + + # Set the default hop, if it's not already specified + if hop_length is None: + hop_length = int(win_length // 4) + + fft_window = signal.get_window(window, win_length, fftbins=True) + + # Pad the window out to n_fft size + fft_window = pad_center(fft_window, n_fft) + + # Reshape so that the window can be broadcast + fft_window = fft_window.reshape((-1, 1)) + + # Pad the time series so that frames are centered + if center: + if n_fft > x.shape[-1]: + warnings.warn( + f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" + ) + x = np.pad(x, int(n_fft // 2), mode=pad_mode) + + elif n_fft > x.shape[-1]: + raise ParameterError( + f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" + ) + + # Window the time series. + x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length) + # Pre-allocate the STFT matrix + stft_matrix = np.empty( + (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") + fft = np.fft # use numpy fft as default + # Constrain STFT block sizes to 256 KB + MAX_MEM_BLOCK = 2**8 * 2**10 + # how many columns can we fit within MAX_MEM_BLOCK? 
+ n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) + n_columns = max(n_columns, 1) + + for bl_s in range(0, stft_matrix.shape[1], n_columns): + bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) + stft_matrix[:, bl_s:bl_t] = fft.rfft( + fft_window * x_frames[:, bl_s:bl_t], axis=0) + + return stft_matrix + + +def power_to_db(spect: array, + ref: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=80.0) -> array: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units + + This computes the scaling ``10 * log10(spect / ref)`` in a numerically + stable way. + + This function is aligned with librosa. + """ + spect = np.asarray(spect) + + if amin <= 0: + raise ParameterError("amin must be strictly positive") + + if np.issubdtype(spect.dtype, np.complexfloating): + warnings.warn( + "power_to_db was called on complex input so phase " + "information will be discarded. To suppress this warning, " + "call power_to_db(np.abs(D)**2) instead.") + magnitude = np.abs(spect) + else: + magnitude = spect + + if callable(ref): + # User supplied a function to calculate reference power + ref_value = ref(magnitude) + else: + ref_value = np.abs(ref) + + log_spec = 10.0 * np.log10(np.maximum(amin, magnitude)) + log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) + + if top_db is not None: + if top_db < 0: + raise ParameterError("top_db must be non-negative") + log_spec = np.maximum(log_spec, log_spec.max() - top_db) + + return log_spec + + +def mfcc(x, + sr: int=16000, + spect: Optional[array]=None, + n_mfcc: int=20, + dct_type: int=2, + norm: str="ortho", + lifter: int=0, + **kwargs) -> array: + """Mel-frequency cepstral coefficients (MFCCs) + + This function is NOT strictly aligned with librosa. 
The following example shows how to get the + same result with librosa: + + # mfcc: + kwargs = { + 'window_size':512, + 'hop_length':320, + 'mel_bins':64, + 'fmin':50, + 'to_db':False} + a = mfcc(x, + spect=None, + n_mfcc=20, + dct_type=2, + norm='ortho', + lifter=0, + **kwargs) + + # librosa mfcc: + spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, + win_length=512, + hop_length=320, + n_mels=64, fmin=50) + b = librosa.feature.mfcc(y=x, + sr=16000, + S=spect, + n_mfcc=20, + dct_type=2, + norm='ortho', + lifter=0) + + assert np.mean( (a-b)**2) < 1e-8 + + """ + if spect is None: + spect = melspectrogram(x, sr=sr, **kwargs) + + M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc] + + if lifter > 0: + factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / + lifter) + return M * factor[:, np.newaxis] + elif lifter == 0: + return M + else: + raise ParameterError( + f"MFCC lifter={lifter} must be a non-negative number") + + +def melspectrogram(x: array, + sr: int=16000, + window_size: int=512, + hop_length: int=320, + n_mels: int=64, + fmin: int=50, + fmax: Optional[float]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + power: float=2.0, + to_db: bool=True, + ref: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None) -> array: + """Compute mel-spectrogram. + + Parameters: + x: numpy.ndarray + The input wavform is a numpy array [shape=(n,)] + + window_size: int, typically 512, 1024, 2048, etc. + The window size for framing, also used as n_fft for stft + + + Returns: + The mel-spectrogram in power scale or db scale(default) + + + Notes: + 1. sr is default to 16000, which is commonly used in speech/speaker processing. + 2. when fmax is None, it is set to sr//2. + 3. this function will convert mel spectgrum to db scale by default. This is different + that of librosa. 
+ + """ + _check_audio(x, mono=True) + if len(x) <= 0: + raise ParameterError('The input waveform is empty') + + if fmax is None: + fmax = sr // 2 + if fmin < 0 or fmin >= fmax: + raise ParameterError('fmin and fmax must statisfy 0 array: + """Compute spectrogram from an input waveform. + + This function is a wrapper for librosa.feature.stft, with addition step to + compute the magnitude of the complex spectrogram. + """ + + s = stft( + x, + n_fft=window_size, + hop_length=hop_length, + win_length=window_size, + window=window, + center=center, + pad_mode=pad_mode) + + return np.abs(s)**power + + +def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: + """Mu-law encoding. + + Compute the mu-law decoding given an input code. + When quantized is True, the result will be converted to + integer in range [0,mu-1]. Otherwise, the resulting signal + is in range [-1,1] + + + Reference: + https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + + """ + mu = 255 + y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu) + if quantized: + y = np.floor((y + 1) / 2 * mu + 0.5) # convert to [0 , mu-1] + return y + + +def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: + """Mu-law decoding. + + Compute the mu-law decoding given an input code. 
+ + it assumes that the input y is in + range [0,mu-1] when quantize is True and [-1,1] otherwise + + Reference: + https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + + """ + if mu < 1: + raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...') + + mu = mu - 1 + if quantized: # undo the quantization + y = y * 2 / mu - 1 + x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1) + return x + + +def randint(high: int) -> int: + """Generate one random integer in range [0 high) + + This is a helper function for random data augmentaiton + """ + return int(np.random.randint(0, high=high)) + + +def rand() -> float: + """Generate one floating-point number in range [0 1) + + This is a helper function for random data augmentaiton + """ + return float(np.random.rand(1)) + + +def depth_augment(y: array, + choices: List=['int8', 'int16'], + probs: List[float]=[0.5, 0.5]) -> array: + """ Audio depth augmentation + + Do audio depth augmentation to simulate the distortion brought by quantization. + """ + assert len(probs) == len( + choices + ), 'number of choices {} must be equal to size of probs {}'.format( + len(choices), len(probs)) + depth = np.random.choice(choices, p=probs) + src_depth = y.dtype + y1 = depth_convert(y, depth) + y2 = depth_convert(y1, src_depth) + + return y2 + + +def adaptive_spect_augment(spect: array, tempo_axis: int=0, + level: float=0.1) -> array: + """Do adpative spectrogram augmentation + + The level of the augmentation is gowern by the paramter level, + ranging from 0 to 1, with 0 represents no augmentation。 + + """ + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + time_mask_width = int(nt * level * 0.5) + freq_mask_width = int(nf * level * 0.5) + + num_time_mask = int(10 * level) + num_freq_mask = int(10 * level) + + if tempo_axis == 0: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] 
= 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def spect_augment(spect: array, + tempo_axis: int=0, + max_time_mask: int=3, + max_freq_mask: int=3, + max_time_mask_width: int=30, + max_freq_mask_width: int=20) -> array: + """Do spectrogram augmentation in both time and freq axis + + Reference: + + """ + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + num_time_mask = randint(max_time_mask) + num_freq_mask = randint(max_freq_mask) + + time_mask_width = randint(max_time_mask_width) + freq_mask_width = randint(max_freq_mask_width) + + if tempo_axis == 0: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def random_crop1d(y: array, crop_len: int) -> array: + """ Do random cropping on 1d input signal + + The input is a 1d signal, typically a sound waveform + """ + if y.ndim != 1: + 'only accept 1d tensor or numpy array' + n = len(y) + idx = randint(n - crop_len) + return y[idx:idx + crop_len] + + +def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: + """ Do random cropping for 2D array, typically a spectrogram. 
+ + The cropping is done in temporal direction on the time-freq input signal. + """ + if tempo_axis >= s.ndim: + raise ParameterError('axis out of range') + + n = s.shape[tempo_axis] + idx = randint(high=n - crop_len) + sli = [slice(None) for i in range(s.ndim)] + sli[tempo_axis] = slice(idx, idx + crop_len) + out = s[tuple(sli)] + return out diff --git a/paddleaudio/paddleaudio/features/librosa.py b/paddleaudio/paddleaudio/features/layers.py similarity index 59% rename from paddleaudio/paddleaudio/features/librosa.py rename to paddleaudio/paddleaudio/features/layers.py index 1cbd2d1a2..69f814d66 100644 --- a/paddleaudio/paddleaudio/features/librosa.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import math from functools import partial from typing import Optional from typing import Union @@ -19,225 +18,19 @@ from typing import Union import paddle import paddle.nn as nn +from ..functional import compute_fbank_matrix +from ..functional import create_dct +from ..functional import power_to_db from ..functional.window import get_window __all__ = [ 'Spectrogram', 'MelSpectrogram', 'LogMelSpectrogram', + 'MFCC', ] -def hz_to_mel(freq: Union[paddle.Tensor, float], - htk: bool=False) -> Union[paddle.Tensor, float]: - """Convert Hz to Mels. - Parameters: - freq: the input tensor of arbitrary shape, or a single floating point number. - htk: use HTK formula to do the conversion. - The default value is False. - Returns: - The frequencies represented in Mel-scale. 
- """ - - if htk: - if isinstance(freq, paddle.Tensor): - return 2595.0 * paddle.log10(1.0 + freq / 700.0) - else: - return 2595.0 * math.log10(1.0 + freq / 700.0) - - # Fill in the linear part - f_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - f_min) / f_sp - - # Fill in the log-scale part - - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - - if isinstance(freq, paddle.Tensor): - target = min_log_mel + paddle.log( - freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 - mask = (freq > min_log_hz).astype(freq.dtype) - mels = target * mask + mels * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if freq >= min_log_hz: - mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep - - return mels - - -def mel_to_hz(mel: Union[float, paddle.Tensor], - htk: bool=False) -> Union[float, paddle.Tensor]: - """Convert mel bin numbers to frequencies. - Parameters: - mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. - htk: use HTK formula to do the conversion. - Returns: - The frequencies represented in hz. 
- """ - if htk: - return 700.0 * (10.0**(mel / 2595.0) - 1.0) - - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, paddle.Tensor): - target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) - mask = (mel > min_log_mel).astype(mel.dtype) - freqs = target * mask + freqs * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if mel >= min_log_mel: - freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) - - return freqs - - -def mel_frequencies(n_mels: int=64, - f_min: float=0.0, - f_max: float=11025.0, - htk: bool=False, - dtype: str=paddle.float32): - """Compute mel frequencies. - Parameters: - n_mels(int): number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk(bool): whether to use htk formula. - dtype(str): the datatype of the return frequencies. - Returns: - The frequencies represented in Mel-scale - """ - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(f_min, htk=htk) - max_mel = hz_to_mel(f_max, htk=htk) - mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) - freqs = mel_to_hz(mels, htk=htk) - return freqs - - -def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): - """Compute fourier frequencies. - Parameters: - sr(int): the audio sample rate. - n_fft(float): the number of fft bins. - dtype(str): the datatype of the return frequencies. - Returns: - The frequencies represented in hz. 
- """ - return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int=64, - f_min: float=0.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - dtype: str=paddle.float32): - """Compute fbank matrix. - Parameters: - sr(int): the audio sample rate. - n_fft(int): the number of fft bins. - n_mels(int): the number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk: whether to use htk formula. - return_complex(bool): whether to return complex matrix. If True, the matrix will - be complex type. Otherwise, the real and image part will be stored in the last - axis of returned tensor. - dtype(str): the datatype of the returned fbank matrix. - Returns: - The fbank matrix of shape (n_mels, int(1+n_fft//2)). - Shape: - output: (n_mels, int(1+n_fft//2)) - """ - - if f_max is None: - f_max = float(sr) / 2 - - # Initialize the weights - weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies( - n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) - - fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) - ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) - #ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. 
then intersect them with each other and zero - weights[i] = paddle.maximum( - paddle.zeros_like(lower), paddle.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - if norm == 'slaney': - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm.unsqueeze(1) - elif isinstance(norm, int) or isinstance(norm, float): - weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) - - return weights - - -def power_to_db(magnitude: paddle.Tensor, - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> paddle.Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. - The function computes the scaling ``10 * log10(x / ref)`` in a numerically - stable way. - Parameters: - magnitude(Tensor): the input magnitude tensor of any shape. - ref_value(float): the reference value. If smaller than 1.0, the db level - of the signal will be pulled up accordingly. Otherwise, the db level - is pushed down. - amin(float): the minimum value of input magnitude, below which the input - magnitude is clipped(to amin). - top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - Returns: - The spectrogram in log-scale. 
- shape: - input: any shape - output: same as input - """ - if amin <= 0: - raise Exception("amin must be strictly positive") - - if ref_value <= 0: - raise Exception("ref_value must be strictly positive") - - ones = paddle.ones_like(magnitude) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) - log_spec -= 10.0 * math.log10(max(ref_value, amin)) - - if top_db is not None: - if top_db < 0: - raise Exception("top_db must be non-negative") - log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) - - return log_spec - - class Spectrogram(nn.Layer): def __init__(self, n_fft: int=512, @@ -459,3 +252,29 @@ class LogMelSpectrogram(nn.Layer): amin=self.amin, top_db=self.top_db) return log_mel_feature + + +class MFCC(nn.Layer): + def __init__(self, + sr: int=22050, + n_mfcc: int=40, + norm: str='ortho', + **kwargs): + """[summary] + Parameters: + sr (int, optional): [description]. Defaults to 22050. + n_mfcc (int, optional): [description]. Defaults to 40. + norm (str, optional): [description]. Defaults to 'ortho'. + """ + super(MFCC, self).__init__() + self._log_melspectrogram = LogMelSpectrogram(sr=sr, **kwargs) + self.dct_matrix = create_dct( + n_mfcc=n_mfcc, n_mels=self._log_melspectrogram.n_mels, norm=norm) + self.register_buffer('dct_matrix', self.dct_matrix) + + def forward(self, x): + log_mel_feature = self._log_melspectrogram(x) + mfcc = paddle.matmul( + log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( + (0, 2, 1)) # (B, n_mels, L) + return mfcc diff --git a/paddleaudio/paddleaudio/functional/__init__.py b/paddleaudio/paddleaudio/functional/__init__.py index 97043fd7b..c85232df1 100644 --- a/paddleaudio/paddleaudio/functional/__init__.py +++ b/paddleaudio/paddleaudio/functional/__init__.py @@ -11,3 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from .functional import compute_fbank_matrix +from .functional import create_dct +from .functional import fft_frequencies +from .functional import hz_to_mel +from .functional import mel_frequencies +from .functional import mel_to_hz +from .functional import power_to_db diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py index 167795c37..c07f14fd8 100644 --- a/paddleaudio/paddleaudio/functional/functional.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -12,146 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from librosa(https://github.com/librosa/librosa) -import warnings -from typing import List +import math from typing import Optional from typing import Union -import numpy as np -import scipy -from numpy import ndarray as array -from numpy.lib.stride_tricks import as_strided -from scipy import signal - -from ..backends import depth_convert -from ..utils import ParameterError +import paddle __all__ = [ - # dsp - 'stft', - 'mfcc', 'hz_to_mel', 'mel_to_hz', - 'split_frames', 'mel_frequencies', - 'power_to_db', + 'fft_frequencies', 'compute_fbank_matrix', - 'melspectrogram', - 'spectrogram', - 'mu_encode', - 'mu_decode', - # augmentation - 'depth_augment', - 'spect_augment', - 'random_crop1d', - 'random_crop2d', - 'adaptive_spect_augment', + 'power_to_db', + 'create_dct', ] -def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: - """Pad an array to a target length along a target axis. 
- - This differs from `np.pad` by centering the data prior to padding, - analogous to `str.center` - """ - - kwargs.setdefault("mode", "constant") - n = data.shape[axis] - lpad = int((size - n) // 2) - lengths = [(0, 0)] * data.ndim - lengths[axis] = (lpad, int(size - n - lpad)) - - if lpad < 0: - raise ParameterError(("Target size ({size:d}) must be " - "at least input size ({n:d})")) - - return np.pad(data, lengths, **kwargs) - - -def split_frames(x: array, frame_length: int, hop_length: int, - axis: int=-1) -> array: - """Slice a data array into (overlapping) frames. - - This function is aligned with librosa.frame - """ - - if not isinstance(x, np.ndarray): - raise ParameterError( - f"Input must be of type numpy.ndarray, given type(x)={type(x)}") - - if x.shape[axis] < frame_length: - raise ParameterError(f"Input is too short (n={x.shape[axis]:d})" - f" for frame_length={frame_length:d}") - - if hop_length < 1: - raise ParameterError(f"Invalid hop_length: {hop_length:d}") - - if axis == -1 and not x.flags["F_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.asfortranarray(x) - elif axis == 0 and not x.flags["C_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. 
This will result in a copy.") - x = np.ascontiguousarray(x) - - n_frames = 1 + (x.shape[axis] - frame_length) // hop_length - strides = np.asarray(x.strides) - - new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize - - if axis == -1: - shape = list(x.shape)[:-1] + [frame_length, n_frames] - strides = list(strides) + [hop_length * new_stride] - - elif axis == 0: - shape = [n_frames, frame_length] + list(x.shape)[1:] - strides = [hop_length * new_stride] + list(strides) - - else: - raise ParameterError(f"Frame axis={axis} must be either 0 or -1") - - return as_strided(x, shape=shape, strides=strides) - - -def _check_audio(y, mono=True) -> bool: - """Determine whether a variable contains valid audio data. - - The audio y must be a np.ndarray, ether 1-channel or two channel - """ - if not isinstance(y, np.ndarray): - raise ParameterError("Audio data must be of type numpy.ndarray") - if y.ndim > 2: - raise ParameterError( - f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}") - - if mono and y.ndim == 2: - raise ParameterError( - f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}") - - if (mono and len(y) == 0) or (not mono and y.shape[1] < 0): - raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}") - - if not np.issubdtype(y.dtype, np.floating): - raise ParameterError("Audio data must be floating-point") - - if not np.isfinite(y).all(): - raise ParameterError("Audio buffer is not finite everywhere") - - return True - - -def hz_to_mel(frequencies: Union[float, List[float], array], - htk: bool=False) -> array: - """Convert Hz to Mels - - This function is aligned with librosa. +def hz_to_mel(freq: Union[paddle.Tensor, float], + htk: bool=False) -> Union[paddle.Tensor, float]: + """Convert Hz to Mels. + Parameters: + freq: the input tensor of arbitrary shape, or a single floating point number. + htk: use HTK formula to do the conversion. + The default value is False. 
+ Returns: + The frequencies represented in Mel-scale. """ - freq = np.asanyarray(frequencies) if htk: - return 2595.0 * np.log10(1.0 + freq / 700.0) + if isinstance(freq, paddle.Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) # Fill in the linear part f_min = 0.0 @@ -163,107 +56,129 @@ def hz_to_mel(frequencies: Union[float, List[float], array], min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - if freq.ndim: - # If we have array data, vectorize - log_t = freq >= min_log_hz - mels[log_t] = min_log_mel + \ - np.log(freq[log_t] / min_log_hz) / logstep - elif freq >= min_log_hz: - # If we have scalar data, heck directly - mels = min_log_mel + np.log(freq / min_log_hz) / logstep + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, paddle.Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep return mels -def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: +def mel_to_hz(mel: Union[float, paddle.Tensor], + htk: bool=False) -> Union[float, paddle.Tensor]: """Convert mel bin numbers to frequencies. - - This function is aligned with librosa. + Parameters: + mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. + htk: use HTK formula to do the conversion. + Returns: + The frequencies represented in hz. 
""" - mel_array = np.asanyarray(mels) - if htk: - return 700.0 * (10.0**(mel_array / 2595.0) - 1.0) + return 700.0 * (10.0**(mel / 2595.0) - 1.0) - # Fill in the linear scale f_min = 0.0 f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel_array - + freqs = f_min + f_sp * mel # And now the nonlinear scale min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - if mel_array.ndim: - # If we have vector data, vectorize - log_t = mel_array >= min_log_mel - freqs[log_t] = min_log_hz * \ - np.exp(logstep * (mel_array[log_t] - min_log_mel)) - elif mel_array >= min_log_mel: - # If we have scalar data, check directly - freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel)) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, paddle.Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) return freqs -def mel_frequencies(n_mels: int=128, - fmin: float=0.0, - fmax: float=11025.0, - htk: bool=False) -> array: - """Compute mel frequencies - - This function is aligned with librosa. +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str=paddle.float32): + """Compute mel frequencies. + Parameters: + n_mels(int): number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk(bool): whether to use htk formula. + dtype(str): the datatype of the return frequencies. 
+ Returns: + The frequencies represented in Mel-scale """ # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(fmin, htk=htk) - max_mel = hz_to_mel(fmax, htk=htk) - - mels = np.linspace(min_mel, max_mel, n_mels) - - return mel_to_hz(mels, htk=htk) + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs -def fft_frequencies(sr: int, n_fft: int) -> array: +def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): """Compute fourier frequencies. - - This function is aligned with librosa. + Parameters: + sr(int): the audio sample rate. + n_fft(float): the number of fft bins. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in hz. """ - return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) def compute_fbank_matrix(sr: int, n_fft: int, - n_mels: int=128, - fmin: float=0.0, - fmax: Optional[float]=None, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, htk: bool=False, - norm: str="slaney", - dtype: type=np.float32): + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): """Compute fbank matrix. - - This funciton is aligned with librosa. + Parameters: + sr(int): the audio sample rate. + n_fft(int): the number of fft bins. + n_mels(int): the number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk: whether to use htk formula. + return_complex(bool): whether to return complex matrix. If True, the matrix will + be complex type. Otherwise, the real and image part will be stored in the last + axis of returned tensor. + dtype(str): the datatype of the returned fbank matrix. 
+ Returns: + The fbank matrix of shape (n_mels, int(1+n_fft//2)). + Shape: + output: (n_mels, int(1+n_fft//2)) """ - if norm != "slaney": - raise ParameterError('norm must set to slaney') - if fmax is None: - fmax = float(sr) / 2 + if f_max is None: + f_max = float(sr) / 2 # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) + mel_f = mel_frequencies( + n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) for i in range(n_mels): # lower and upper slopes for all bins @@ -271,458 +186,79 @@ def compute_fbank_matrix(sr: int, upper = ramps[i + 2] / fdiff[i + 1] # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) + weights[i] = paddle.maximum( + paddle.zeros_like(lower), paddle.minimum(lower, upper)) - if norm == "slaney": - # Slaney-style mel is scaled to be approx constant energy per channel + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - # Only check weights if f_mel[0] is positive - if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): - # This means we have an empty channel somewhere - warnings.warn("Empty filters detected in mel frequency basis. " - "Some channels will produce empty responses. 
" - "Try increasing your sampling rate (and fmax) or " - "reducing n_mels.") + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) return weights -def stft(x: array, - n_fft: int=2048, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str="hann", - center: bool=True, - dtype: type=np.complex64, - pad_mode: str="reflect") -> array: - """Short-time Fourier transform (STFT). - - This function is aligned with librosa. - """ - _check_audio(x) - - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - fft_window = signal.get_window(window, win_length, fftbins=True) - - # Pad the window out to n_fft size - fft_window = pad_center(fft_window, n_fft) - - # Reshape so that the window can be broadcast - fft_window = fft_window.reshape((-1, 1)) - - # Pad the time series so that frames are centered - if center: - if n_fft > x.shape[-1]: - warnings.warn( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - x = np.pad(x, int(n_fft // 2), mode=pad_mode) - - elif n_fft > x.shape[-1]: - raise ParameterError( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - - # Window the time series. - x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length) - # Pre-allocate the STFT matrix - stft_matrix = np.empty( - (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") - fft = np.fft # use numpy fft as default - # Constrain STFT block sizes to 256 KB - MAX_MEM_BLOCK = 2**8 * 2**10 - # how many columns can we fit within MAX_MEM_BLOCK? 
- n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) - n_columns = max(n_columns, 1) - - for bl_s in range(0, stft_matrix.shape[1], n_columns): - bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) - stft_matrix[:, bl_s:bl_t] = fft.rfft( - fft_window * x_frames[:, bl_s:bl_t], axis=0) - - return stft_matrix - - -def power_to_db(spect: array, - ref: float=1.0, +def power_to_db(magnitude: paddle.Tensor, + ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> array: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units - - This computes the scaling ``10 * log10(spect / ref)`` in a numerically + top_db: Optional[float]=None) -> paddle.Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. + The function computes the scaling ``10 * log10(x / ref)`` in a numerically stable way. - - This function is aligned with librosa. + Parameters: + magnitude(Tensor): the input magnitude tensor of any shape. + ref_value(float): the reference value. If smaller than 1.0, the db level + of the signal will be pulled up accordingly. Otherwise, the db level + is pushed down. + amin(float): the minimum value of input magnitude, below which the input + magnitude is clipped(to amin). + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + Returns: + The spectrogram in log-scale. + shape: + input: any shape + output: same as input """ - spect = np.asarray(spect) - if amin <= 0: - raise ParameterError("amin must be strictly positive") - - if np.issubdtype(spect.dtype, np.complexfloating): - warnings.warn( - "power_to_db was called on complex input so phase " - "information will be discarded. 
To suppress this warning, " - "call power_to_db(np.abs(D)**2) instead.") - magnitude = np.abs(spect) - else: - magnitude = spect + raise Exception("amin must be strictly positive") - if callable(ref): - # User supplied a function to calculate reference power - ref_value = ref(magnitude) - else: - ref_value = np.abs(ref) + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") - log_spec = 10.0 * np.log10(np.maximum(amin, magnitude)) - log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) + ones = paddle.ones_like(magnitude) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) if top_db is not None: if top_db < 0: - raise ParameterError("top_db must be non-negative") - log_spec = np.maximum(log_spec, log_spec.max() - top_db) + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) return log_spec -def mfcc(x, - sr: int=16000, - spect: Optional[array]=None, - n_mfcc: int=20, - dct_type: int=2, - norm: str="ortho", - lifter: int=0, - **kwargs) -> array: - """Mel-frequency cepstral coefficients (MFCCs) - - This function is NOT strictly aligned with librosa. 
The following example shows how to get the - same result with librosa: - - # mfcc: - kwargs = { - 'window_size':512, - 'hop_length':320, - 'mel_bins':64, - 'fmin':50, - 'to_db':False} - a = mfcc(x, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - - # librosa mfcc: - spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, fmin=50) - b = librosa.feature.mfcc(y=x, - sr=16000, - S=spect, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0) - - assert np.mean( (a-b)**2) < 1e-8 - - """ - if spect is None: - spect = melspectrogram(x, sr=sr, **kwargs) - - M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc] - - if lifter > 0: - factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / - lifter) - return M * factor[:, np.newaxis] - elif lifter == 0: - return M - else: - raise ParameterError( - f"MFCC lifter={lifter} must be a non-negative number") - - -def melspectrogram(x: array, - sr: int=16000, - window_size: int=512, - hop_length: int=320, - n_mels: int=64, - fmin: int=50, - fmax: Optional[float]=None, - window: str='hann', - center: bool=True, - pad_mode: str='reflect', - power: float=2.0, - to_db: bool=True, - ref: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> array: - """Compute mel-spectrogram. - +def create_dct(n_mfcc: int, + n_mels: int, + norm: Optional[str]='ortho', + dtype: Optional[str]=paddle.float32): + """[summary] Parameters: - x: numpy.ndarray - The input wavform is a numpy array [shape=(n,)] - - window_size: int, typically 512, 1024, 2048, etc. - The window size for framing, also used as n_fft for stft - - + n_mfcc (int): [description] + n_mels (int): [description] + norm (str, optional): [description]. Defaults to 'ortho'. Returns: - The mel-spectrogram in power scale or db scale(default) - - - Notes: - 1. sr is default to 16000, which is commonly used in speech/speaker processing. - 2. 
when fmax is None, it is set to sr//2. - 3. this function will convert mel spectgrum to db scale by default. This is different - that of librosa. - - """ - _check_audio(x, mono=True) - if len(x) <= 0: - raise ParameterError('The input waveform is empty') - - if fmax is None: - fmax = sr // 2 - if fmin < 0 or fmin >= fmax: - raise ParameterError('fmin and fmax must statisfy 0 array: - """Compute spectrogram from an input waveform. - - This function is a wrapper for librosa.feature.stft, with addition step to - compute the magnitude of the complex spectrogram. - """ - - s = stft( - x, - n_fft=window_size, - hop_length=hop_length, - win_length=window_size, - window=window, - center=center, - pad_mode=pad_mode) - - return np.abs(s)**power - - -def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law encoding. - - Compute the mu-law decoding given an input code. - When quantized is True, the result will be converted to - integer in range [0,mu-1]. Otherwise, the resulting signal - is in range [-1,1] - - - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm - - """ - mu = 255 - y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu) - if quantized: - y = np.floor((y + 1) / 2 * mu + 0.5) # convert to [0 , mu-1] - return y - - -def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law decoding. - - Compute the mu-law decoding given an input code. 
- - it assumes that the input y is in - range [0,mu-1] when quantize is True and [-1,1] otherwise - - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm - - """ - if mu < 1: - raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...') - - mu = mu - 1 - if quantized: # undo the quantization - y = y * 2 / mu - 1 - x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1) - return x - - -def randint(high: int) -> int: - """Generate one random integer in range [0 high) - - This is a helper function for random data augmentaiton - """ - return int(np.random.randint(0, high=high)) - - -def rand() -> float: - """Generate one floating-point number in range [0 1) - - This is a helper function for random data augmentaiton - """ - return float(np.random.rand(1)) - - -def depth_augment(y: array, - choices: List=['int8', 'int16'], - probs: List[float]=[0.5, 0.5]) -> array: - """ Audio depth augmentation - - Do audio depth augmentation to simulate the distortion brought by quantization. - """ - assert len(probs) == len( - choices - ), 'number of choices {} must be equal to size of probs {}'.format( - len(choices), len(probs)) - depth = np.random.choice(choices, p=probs) - src_depth = y.dtype - y1 = depth_convert(y, depth) - y2 = depth_convert(y1, src_depth) - - return y2 - - -def adaptive_spect_augment(spect: array, tempo_axis: int=0, - level: float=0.1) -> array: - """Do adpative spectrogram augmentation - - The level of the augmentation is gowern by the paramter level, - ranging from 0 to 1, with 0 represents no augmentation。 - - """ - assert spect.ndim == 2., 'only supports 2d tensor or numpy array' - if tempo_axis == 0: - nt, nf = spect.shape - else: - nf, nt = spect.shape - - time_mask_width = int(nt * level * 0.5) - freq_mask_width = int(nf * level * 0.5) - - num_time_mask = int(10 * level) - num_freq_mask = int(10 * level) - - if tempo_axis == 0: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[start:start + time_mask_width, :] 
= 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[:, start:start + freq_mask_width] = 0 - else: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[:, start:start + time_mask_width] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[start:start + freq_mask_width, :] = 0 - - return spect - - -def spect_augment(spect: array, - tempo_axis: int=0, - max_time_mask: int=3, - max_freq_mask: int=3, - max_time_mask_width: int=30, - max_freq_mask_width: int=20) -> array: - """Do spectrogram augmentation in both time and freq axis - - Reference: - + [type]: [description] """ - assert spect.ndim == 2., 'only supports 2d tensor or numpy array' - if tempo_axis == 0: - nt, nf = spect.shape - else: - nf, nt = spect.shape - - num_time_mask = randint(max_time_mask) - num_freq_mask = randint(max_freq_mask) - - time_mask_width = randint(max_time_mask_width) - freq_mask_width = randint(max_freq_mask_width) - - if tempo_axis == 0: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[start:start + time_mask_width, :] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[:, start:start + freq_mask_width] = 0 + n = paddle.arange(n_mels, dtype=dtype) + k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) + dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * + k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 else: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[:, start:start + time_mask_width] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[start:start + freq_mask_width, :] = 0 - - return spect - - -def random_crop1d(y: array, crop_len: int) -> array: - """ Do random cropping on 1d input signal - - The input is a 1d signal, typically a sound waveform - """ - if y.ndim != 1: - 'only accept 1d tensor or numpy array' - n = len(y) - idx = randint(n - crop_len) 
- return y[idx:idx + crop_len] - - -def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: - """ Do random cropping for 2D array, typically a spectrogram. - - The cropping is done in temporal direction on the time-freq input signal. - """ - if tempo_axis >= s.ndim: - raise ParameterError('axis out of range') - - n = s.shape[tempo_axis] - idx = randint(high=n - crop_len) - sli = [slice(None) for i in range(s.ndim)] - sli[tempo_axis] = slice(idx, idx + crop_len) - out = s[tuple(sli)] - return out + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.T diff --git a/paddleaudio/paddleaudio/io/__init__.py b/paddleaudio/paddleaudio/io/__init__.py index cc2538f7f..185a92b8d 100644 --- a/paddleaudio/paddleaudio/io/__init__.py +++ b/paddleaudio/paddleaudio/io/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,9 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .audio import depth_convert -from .audio import load -from .audio import normalize -from .audio import resample -from .audio import save_wav -from .audio import to_mono diff --git a/paddleaudio/paddleaudio/io/audio.py b/paddleaudio/paddleaudio/io/audio.py deleted file mode 100644 index 4127570ec..000000000 --- a/paddleaudio/paddleaudio/io/audio.py +++ /dev/null @@ -1,303 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import warnings -from typing import Optional -from typing import Tuple -from typing import Union - -import numpy as np -import resampy -import soundfile as sf -from numpy import ndarray as array -from scipy.io import wavfile - -from ..utils import ParameterError - -__all__ = [ - 'resample', - 'to_mono', - 'depth_convert', - 'normalize', - 'save_wav', - 'load', -] -NORMALMIZE_TYPES = ['linear', 'gaussian'] -MERGE_TYPES = ['ch0', 'ch1', 'random', 'average'] -RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] -EPS = 1e-8 - - -def resample(y: array, src_sr: int, target_sr: int, - mode: str='kaiser_fast') -> array: - """ Audio resampling - - This function is the same as using resampy.resample(). - - Notes: - The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' - - """ - - if mode == 'kaiser_best': - warnings.warn( - f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. 
This function is pretty slow, \ - we recommend the mode kaiser_fast in large scale audio trainning') - - if not isinstance(y, np.ndarray): - raise ParameterError( - 'Only support numpy array, but received y in {type(y)}') - - if mode not in RESAMPLE_MODES: - raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') - - return resampy.resample(y, src_sr, target_sr, filter=mode) - - -def to_mono(y: array, merge_type: str='average') -> array: - """ convert sterior audio to mono - """ - if merge_type not in MERGE_TYPES: - raise ParameterError( - f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' - ) - if y.ndim > 2: - raise ParameterError( - f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}') - if y.ndim == 1: # nothing to merge - return y - - if merge_type == 'ch0': - return y[0] - if merge_type == 'ch1': - return y[1] - if merge_type == 'random': - return y[np.random.randint(0, 2)] - - # need to do averaging according to dtype - - if y.dtype == 'float32': - y_out = (y[0] + y[1]) * 0.5 - elif y.dtype == 'int16': - y_out = y.astype('int32') - y_out = (y_out[0] + y_out[1]) // 2 - y_out = np.clip(y_out, np.iinfo(y.dtype).min, - np.iinfo(y.dtype).max).astype(y.dtype) - - elif y.dtype == 'int8': - y_out = y.astype('int16') - y_out = (y_out[0] + y_out[1]) // 2 - y_out = np.clip(y_out, np.iinfo(y.dtype).min, - np.iinfo(y.dtype).max).astype(y.dtype) - else: - raise ParameterError(f'Unsupported dtype: {y.dtype}') - return y_out - - -def _safe_cast(y: array, dtype: Union[type, str]) -> array: - """ data type casting in a safe way, i.e., prevent overflow or underflow - - This function is used internally. 
- """ - return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) - - -def depth_convert(y: array, dtype: Union[type, str], - dithering: bool=True) -> array: - """Convert audio array to target dtype safely - - This function convert audio waveform to a target dtype, with addition steps of - preventing overflow/underflow and preserving audio range. - - """ - - SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] - if y.dtype not in SUPPORT_DTYPE: - raise ParameterError( - 'Unsupported audio dtype, ' - f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}') - - if dtype not in SUPPORT_DTYPE: - raise ParameterError( - 'Unsupported audio dtype, ' - f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}') - - if dtype == y.dtype: - return y - - if dtype == 'float64' and y.dtype == 'float32': - return _safe_cast(y, dtype) - if dtype == 'float32' and y.dtype == 'float64': - return _safe_cast(y, dtype) - - if dtype == 'int16' or dtype == 'int8': - if y.dtype in ['float64', 'float32']: - factor = np.iinfo(dtype).max - y = np.clip(y * factor, np.iinfo(dtype).min, - np.iinfo(dtype).max).astype(dtype) - y = y.astype(dtype) - else: - if dtype == 'int16' and y.dtype == 'int8': - factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS - y = y.astype('float32') * factor - y = y.astype('int16') - - else: # dtype == 'int8' and y.dtype=='int16': - y = y.astype('int32') * np.iinfo('int8').max / \ - np.iinfo('int16').max - y = y.astype('int8') - - if dtype in ['float32', 'float64']: - org_dtype = y.dtype - y = y.astype(dtype) / np.iinfo(org_dtype).max - return y - - -def sound_file_load(file: str, - offset: Optional[float]=None, - dtype: str='int16', - duration: Optional[int]=None) -> Tuple[array, int]: - """Load audio using soundfile library - - This function load audio file using libsndfile. 
- - Reference: - http://www.mega-nerd.com/libsndfile/#Features - - """ - with sf.SoundFile(file) as sf_desc: - sr_native = sf_desc.samplerate - if offset: - sf_desc.seek(int(offset * sr_native)) - if duration is not None: - frame_duration = int(duration * sr_native) - else: - frame_duration = -1 - y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T - - return y, sf_desc.samplerate - - -def audio_file_load(): - """Load audio using audiofile library - - This function load audio file using audiofile. - - Reference: - https://audiofile.68k.org/ - - """ - raise NotImplementedError() - - -def sox_file_load(): - """Load audio using sox library - - This function load audio file using sox. - - Reference: - http://sox.sourceforge.net/ - """ - raise NotImplementedError() - - -def normalize(y: array, norm_type: str='linear', - mul_factor: float=1.0) -> array: - """ normalize an input audio with additional multiplier. - - """ - - if norm_type == 'linear': - amax = np.max(np.abs(y)) - factor = 1.0 / (amax + EPS) - y = y * factor * mul_factor - elif norm_type == 'gaussian': - amean = np.mean(y) - astd = np.std(y) - astd = max(astd, EPS) - y = mul_factor * (y - amean) / astd - else: - raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}') - - return y - - -def save_wav(y: array, sr: int, file: str) -> None: - """Save audio file to disk. - This function saves audio to disk using scipy.io.wavfile, with additional step - to convert input waveform to int16 unless it already is int16 - - Notes: - It only support raw wav format. 
- - """ - if not file.endswith('.wav'): - raise ParameterError( - f'only .wav file supported, but dst file name is: {file}') - - if sr <= 0: - raise ParameterError( - f'Sample rate should be larger than 0, recieved sr = {sr}') - - if y.dtype not in ['int16', 'int8']: - warnings.warn( - f'input data type is {y.dtype}, will convert data to int16 format before saving' - ) - y_out = depth_convert(y, 'int16') - else: - y_out = y - - wavfile.write(file, sr, y_out) - - -def load( - file: str, - sr: Optional[int]=None, - mono: bool=True, - merge_type: str='average', # ch0,ch1,random,average - normal: bool=True, - norm_type: str='linear', - norm_mul_factor: float=1.0, - offset: float=0.0, - duration: Optional[int]=None, - dtype: str='float32', - resample_mode: str='kaiser_fast') -> Tuple[array, int]: - """Load audio file from disk. - This function loads audio from disk using using audio beackend. - - Parameters: - - Notes: - - """ - - y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) - - if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)): - raise ParameterError(f'audio file {file} looks empty') - - if mono: - y = to_mono(y, merge_type) - - if sr is not None and sr != r: - y = resample(y, r, sr, mode=resample_mode) - r = sr - - if normal: - y = normalize(y, norm_type, norm_mul_factor) - elif dtype in ['int8', 'int16']: - # still need to do normalization, before depth convertion - y = normalize(y, 'linear', 1.0) - - y = depth_convert(y, dtype) - return y, r From f4c720544013d0eb28b7f8cfb858b355a1a5e6ef Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 2 Mar 2022 12:42:20 +0800 Subject: [PATCH 05/17] refactor --- paddleaudio/paddleaudio/compliance/kaldi.py | 2 +- paddleaudio/paddleaudio/datasets/__init__.py | 7 ------- paddleaudio/paddleaudio/datasets/dataset.py | 4 ++-- paddleaudio/paddleaudio/features/__init__.py | 7 ++++--- paddleaudio/paddleaudio/metric/mcd.py | 1 + 5 files changed, 8 insertions(+), 13 
deletions(-) diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index 61ca4e3db..e4192e817 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -17,8 +17,8 @@ from typing import Tuple import paddle from paddle import Tensor +from ..functional import create_dct from ..functional.window import get_window -from .spectrum import create_dct __all__ = [ 'spectrogram', diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py index 8d2fdab46..5c5f03694 100644 --- a/paddleaudio/paddleaudio/datasets/__init__.py +++ b/paddleaudio/paddleaudio/datasets/__init__.py @@ -15,10 +15,3 @@ from .esc50 import ESC50 from .gtzan import GTZAN from .tess import TESS from .urban_sound import UrbanSound8K - -__all__ = [ - 'ESC50', - 'UrbanSound8K', - 'GTZAN', - 'TESS', -] diff --git a/paddleaudio/paddleaudio/datasets/dataset.py b/paddleaudio/paddleaudio/datasets/dataset.py index 7a57fd6cc..06e2df6d0 100644 --- a/paddleaudio/paddleaudio/datasets/dataset.py +++ b/paddleaudio/paddleaudio/datasets/dataset.py @@ -17,8 +17,8 @@ import numpy as np import paddle from ..backends import load as load_audio -from ..features import melspectrogram -from ..features import mfcc +from ..compliance.librosa import melspectrogram +from ..compliance.librosa import mfcc feat_funcs = { 'raw': None, diff --git a/paddleaudio/paddleaudio/features/__init__.py b/paddleaudio/paddleaudio/features/__init__.py index 469b4c9ba..00781397f 100644 --- a/paddleaudio/paddleaudio/features/__init__.py +++ b/paddleaudio/paddleaudio/features/__init__.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .librosa import LogMelSpectrogram -from .librosa import MelSpectrogram -from .librosa import Spectrogram +from .layers import LogMelSpectrogram +from .layers import MelSpectrogram +from .layers import MFCC +from .layers import Spectrogram diff --git a/paddleaudio/paddleaudio/metric/mcd.py b/paddleaudio/paddleaudio/metric/mcd.py index 281e57653..465cd5a45 100644 --- a/paddleaudio/paddleaudio/metric/mcd.py +++ b/paddleaudio/paddleaudio/metric/mcd.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import mcd.metrics_fast as mt +import numpy as np from mcd import dtw __all__ = [ From 4d2f2191a817d3d3db2d4562d5844387c659c819 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 2 Mar 2022 08:44:07 +0000 Subject: [PATCH 06/17] fix gbk encode bug --- paddlespeech/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 185a92b8d..b781c4a8e 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import _locale + +_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) From 504c2c9d50ca360aab23c78162a5b0e2ce5b53fe Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 2 Mar 2022 16:35:11 +0800 Subject: [PATCH 07/17] refactor --- paddleaudio/paddleaudio/__init__.py | 7 +++++++ paddleaudio/paddleaudio/features/layers.py | 6 ++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/paddleaudio/paddleaudio/__init__.py b/paddleaudio/paddleaudio/__init__.py index 2dab610cf..6184c1dd4 100644 --- a/paddleaudio/paddleaudio/__init__.py +++ b/paddleaudio/paddleaudio/__init__.py @@ -11,5 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from . import compliance +from . import datasets +from . import features +from . import functional +from . import io +from . import metric +from . import sox_effects from .backends import load from .backends import save diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 69f814d66..69f462542 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -71,15 +71,17 @@ class Spectrogram(nn.Layer): if win_length is None: win_length = n_fft - fft_window = get_window(window, win_length, fftbins=True, dtype=dtype) + self.fft_window = get_window( + window, win_length, fftbins=True, dtype=dtype) self._stft = partial( paddle.signal.stft, n_fft=n_fft, hop_length=hop_length, win_length=win_length, - window=fft_window, + window=self.fft_window, center=center, pad_mode=pad_mode) + self.register_buffer('fft_window', self.fft_window) def forward(self, x): stft = self._stft(x) From 959408bafe70fab8f096a5393daabf81405a27e6 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 3 Mar 2022 17:22:21 +0800 Subject: [PATCH 08/17] Refactor and add doc string. 
--- paddleaudio/paddleaudio/compliance/kaldi.py | 341 +++++++++----------- paddleaudio/paddleaudio/features/layers.py | 14 +- 2 files changed, 155 insertions(+), 200 deletions(-) diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index e4192e817..35d7072ca 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -105,7 +105,7 @@ def _get_log_energy(strided_input: Tensor, epsilon: Tensor, def _get_waveform_and_window_properties( waveform: Tensor, channel: int, - sample_frequency: float, + sr: int, frame_shift: float, frame_length: float, round_to_power_of_two: bool, @@ -115,9 +115,9 @@ def _get_waveform_and_window_properties( 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) waveform = waveform[channel, :] # size (n) window_shift = int( - sample_frequency * frame_shift * + sr * frame_shift * 0.001) # pass frame_shift and frame_length in milliseconds - window_size = int(sample_frequency * frame_length * 0.001) + window_size = int(sr * frame_length * 0.001) padded_window_size = _next_power_of_2( window_size) if round_to_power_of_two else window_size @@ -128,7 +128,7 @@ def _get_waveform_and_window_properties( assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \ ' use `round_to_power_of_two` or change `frame_length`' assert 0. 
<= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' - assert sample_frequency > 0, '`sample_frequency` must be greater than zero' + assert sr > 0, '`sr` must be greater than zero' return waveform, window_shift, window_size, padded_window_size @@ -147,45 +147,38 @@ def _get_window(waveform: Tensor, dtype = waveform.dtype epsilon = _get_epsilon(dtype) - # size (m, window_size) + # (m, window_size) strided_input = _get_strided(waveform, window_size, window_shift, snip_edges) if dither != 0.0: - # Returns a random number strictly between 0 and 1 x = paddle.maximum(epsilon, paddle.rand(strided_input.shape, dtype=dtype)) rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) strided_input = strided_input + rand_gauss * dither if remove_dc_offset: - # Subtract each row/frame by its mean - row_means = paddle.mean( - strided_input, axis=1).unsqueeze(1) # size (m, 1) + row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1) strided_input = strided_input - row_means if raw_energy: - # Compute the log energy of each row/frame before applying preemphasis and - # window function signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) # size (m) + energy_floor) # (m) if preemphasis_coefficient != 0.0: - # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j offset_strided_input = paddle.nn.functional.pad( strided_input.unsqueeze(0), (1, 0), data_format='NCL', - mode='replicate').squeeze(0) # size (m, window_size + 1) + mode='replicate').squeeze(0) # (m, window_size + 1) strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : -1] - # Apply window_function to each row/frame window_function = _feature_window_function( window_type, window_size, blackman_coeff, - dtype).unsqueeze(0) # size (1, window_size) - strided_input = strided_input * window_function # size (m, window_size) + dtype).unsqueeze(0) # (1, window_size) + strided_input = 
strided_input * window_function # (m, window_size) - # Pad columns with zero until we reach size (m, padded_window_size) + # (m, padded_window_size) if padded_window_size != window_size: padding_right = padded_window_size - window_size strided_input = paddle.nn.functional.pad( @@ -194,7 +187,6 @@ def _get_window(waveform: Tensor, mode='constant', value=0).squeeze(0) - # Compute energy after window function (not the raw one) if not raw_energy: signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m) @@ -203,8 +195,6 @@ def _get_window(waveform: Tensor, def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: - # subtracts the column mean of the tensor size (m, n) if subtract_mean=True - # it returns size (m, n) if subtract_mean: col_means = paddle.mean(tensor, axis=0).unsqueeze(0) tensor = tensor - col_means @@ -218,61 +208,56 @@ def spectrogram(waveform: Tensor, energy_floor: float=1.0, frame_length: float=25.0, frame_shift: float=10.0, - min_duration: float=0.0, preemphasis_coefficient: float=0.97, raw_energy: bool=True, remove_dc_offset: bool=True, round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, + sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, window_type: str=POVEY) -> Tensor: - """[summary] + """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): [description] - blackman_coeff (float, optional): [description]. Defaults to 0.42. - channel (int, optional): [description]. Defaults to -1. - dither (float, optional): [description]. Defaults to 0.0. - energy_floor (float, optional): [description]. Defaults to 1.0. - frame_length (float, optional): [description]. Defaults to 25.0. - frame_shift (float, optional): [description]. Defaults to 10.0. - min_duration (float, optional): [description]. Defaults to 0.0. - preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. 
- raw_energy (bool, optional): [description]. Defaults to True. - remove_dc_offset (bool, optional): [description]. Defaults to True. - round_to_power_of_two (bool, optional): [description]. Defaults to True. - sample_frequency (float, optional): [description]. Defaults to 16000.0. - snip_edges (bool, optional): [description]. Defaults to True. - subtract_mean (bool, optional): [description]. Defaults to False. - window_type (str, optional): [description]. Defaults to POVEY. + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. 
Returns: - Tensor: [description] + Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames + depends on frame_length and frame_shift. """ dtype = waveform.dtype epsilon = _get_epsilon(dtype) waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sample_frequency, frame_shift, frame_length, - round_to_power_of_two, preemphasis_coefficient) - - if len(waveform) < min_duration * sample_frequency: - # signal is too short - return paddle.empty([0]) + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) strided_input, signal_log_energy = _get_window( waveform, padded_window_size, window_size, window_shift, window_type, blackman_coeff, snip_edges, raw_energy, energy_floor, dither, remove_dc_offset, preemphasis_coefficient) - # size (m, padded_window_size // 2 + 1, 2) + # (m, padded_window_size // 2 + 1, 2) fft = paddle.fft.rfft(strided_input) - # Convert the FFT into a power spectrum power_spectrum = paddle.maximum( - fft.abs().pow(2.), - epsilon).log() # size (m, padded_window_size // 2 + 1) + fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1) power_spectrum[:, 0] = signal_log_energy power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) @@ -306,25 +291,19 @@ def _vtln_warp_freq(vtln_low_cutoff: float, l = vtln_low_cutoff * max(1.0, vtln_warp_factor) h = vtln_high_cutoff * min(1.0, vtln_warp_factor) scale = 1.0 / vtln_warp_factor - Fl = scale * l # F(l) - Fh = scale * h # F(h) + Fl = scale * l + Fh = scale * h assert l > low_freq and h < high_freq - # slope of left part of the 3-piece linear function scale_left = (Fl - low_freq) / (l - low_freq) - # [slope of center part is just "scale"] - - # slope of right part of the 3-piece linear function scale_right = (high_freq - Fh) / (high_freq - h) - res = paddle.empty_like(freq) outside_low_high_freq = paddle.less_than(freq, 
paddle.to_tensor(low_freq)) \ - | paddle.greater_than(freq, paddle.to_tensor(high_freq)) # freq < low_freq || freq > high_freq - before_l = paddle.less_than(freq, paddle.to_tensor(l)) # freq < l - before_h = paddle.less_than(freq, paddle.to_tensor(h)) # freq < h - after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) # freq >= h + | paddle.greater_than(freq, paddle.to_tensor(high_freq)) + before_l = paddle.less_than(freq, paddle.to_tensor(l)) + before_h = paddle.less_than(freq, paddle.to_tensor(h)) + after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) - # order of operations matter here (since there is overlapping frequency regions) res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) res[before_h] = scale * freq[before_h] res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) @@ -363,13 +342,10 @@ def _get_mel_banks(num_bins: int, assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) - # fft-bin width [think of it as Nyquist-freq / half-window-length] fft_bin_width = sample_freq / window_length_padded mel_low_freq = _mel_scale_scalar(low_freq) mel_high_freq = _mel_scale_scalar(high_freq) - # divide by num_bins+1 in next line because of end-effects where the bins - # spread out to the sides. 
mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) if vtln_high < 0.0: @@ -381,10 +357,9 @@ def _get_mel_banks(num_bins: int, 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) bin = paddle.arange(num_bins).unsqueeze(1) - left_mel = mel_low_freq + bin * mel_freq_delta # size(num_bins, 1) - center_mel = mel_low_freq + (bin + 1.0 - ) * mel_freq_delta # size(num_bins, 1) - right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # size(num_bins, 1) + left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) if vtln_warp_factor != 1.0: left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, @@ -395,25 +370,23 @@ def _get_mel_banks(num_bins: int, right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel) - center_freqs = _inverse_mel_scale(center_mel) # size (num_bins) - # size(1, num_fft_bins) + center_freqs = _inverse_mel_scale(center_mel) # (num_bins) + # (1, num_fft_bins) mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) - # size (num_bins, num_fft_bins) + # (num_bins, num_fft_bins) up_slope = (mel - left_mel) / (center_mel - left_mel) down_slope = (right_mel - mel) / (right_mel - center_mel) if vtln_warp_factor == 1.0: - # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values bins = paddle.maximum( paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) else: - # warping can move the order of left_mel, center_mel, right_mel anywhere bins = paddle.zeros_like(up_slope) up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( - mel, center_mel) # left_mel < mel <= center_mel + mel, center_mel) down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( - mel, right_mel) # center_mel < mel < right_mel + mel, right_mel) bins[up_idx] = 
up_slope[up_idx] bins[down_idx] = down_slope[down_idx] @@ -430,13 +403,12 @@ def fbank(waveform: Tensor, high_freq: float=0.0, htk_compat: bool=False, low_freq: float=20.0, - min_duration: float=0.0, - num_mel_bins: int=23, + n_mels: int=23, preemphasis_coefficient: float=0.97, raw_energy: bool=True, remove_dc_offset: bool=True, round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, + sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, use_energy: bool=False, @@ -446,83 +418,75 @@ def fbank(waveform: Tensor, vtln_low: float=100.0, vtln_warp: float=1.0, window_type: str=POVEY) -> Tensor: - """[summary] + """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): [description] - blackman_coeff (float, optional): [description]. Defaults to 0.42. - channel (int, optional): [description]. Defaults to -1. - dither (float, optional): [description]. Defaults to 0.0. - energy_floor (float, optional): [description]. Defaults to 1.0. - frame_length (float, optional): [description]. Defaults to 25.0. - frame_shift (float, optional): [description]. Defaults to 10.0. - high_freq (float, optional): [description]. Defaults to 0.0. - htk_compat (bool, optional): [description]. Defaults to False. - low_freq (float, optional): [description]. Defaults to 20.0. - min_duration (float, optional): [description]. Defaults to 0.0. - num_mel_bins (int, optional): [description]. Defaults to 23. - preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. - raw_energy (bool, optional): [description]. Defaults to True. - remove_dc_offset (bool, optional): [description]. Defaults to True. - round_to_power_of_two (bool, optional): [description]. Defaults to True. - sample_frequency (float, optional): [description]. Defaults to 16000.0. - snip_edges (bool, optional): [description]. Defaults to True. - subtract_mean (bool, optional): [description]. Defaults to False. 
- use_energy (bool, optional): [description]. Defaults to False. - use_log_fbank (bool, optional): [description]. Defaults to True. - use_power (bool, optional): [description]. Defaults to True. - vtln_high (float, optional): [description]. Defaults to -500.0. - vtln_low (float, optional): [description]. Defaults to 100.0. - vtln_warp (float, optional): [description]. Defaults to 1.0. - window_type (str, optional): [description]. Defaults to POVEY. + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. 
Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True. + use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: [description] + Tensor: A filter banks tensor with shape (m, n_mels). """ dtype = waveform.dtype waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sample_frequency, frame_shift, frame_length, - round_to_power_of_two, preemphasis_coefficient) - - if len(waveform) < min_duration * sample_frequency: - # signal is too short - return paddle.empty([0], dtype=dtype) + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) - # strided_input, size (m, padded_window_size) and signal_log_energy, size (m) strided_input, signal_log_energy = _get_window( waveform, padded_window_size, window_size, window_shift, window_type, blackman_coeff, snip_edges, raw_energy, energy_floor, dither, remove_dc_offset, preemphasis_coefficient) - # size (m, padded_window_size // 2 + 1) + # (m, padded_window_size // 2 + 1) spectrum = paddle.fft.rfft(strided_input).abs() if use_power: spectrum = spectrum.pow(2.) 
- # size (num_mel_bins, padded_window_size // 2) - mel_energies, _ = _get_mel_banks(num_mel_bins, padded_window_size, - sample_frequency, low_freq, high_freq, - vtln_low, vtln_high, vtln_warp) + # (n_mels, padded_window_size // 2) + mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, + high_freq, vtln_low, vtln_high, vtln_warp) mel_energies = mel_energies.astype(dtype) - # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1) + # (n_mels, padded_window_size // 2 + 1) mel_energies = paddle.nn.functional.pad( mel_energies.unsqueeze(0), (0, 1), data_format='NCL', mode='constant', value=0).squeeze(0) - # sum with mel fiterbanks over the power spectrum, size (m, num_mel_bins) + # (m, n_mels) mel_energies = paddle.mm(spectrum, mel_energies.T) if use_log_fbank: - # avoid log of zero (which should be prevented anyway by dithering) mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() - # if use_energy then add it as the last column for htk_compat == true else first column if use_energy: - signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1) - # returns size (m, num_mel_bins + 1) + signal_log_energy = signal_log_energy.unsqueeze(1) if htk_compat: mel_energies = paddle.concat( (mel_energies, signal_log_energy), axis=1) @@ -530,28 +494,20 @@ def fbank(waveform: Tensor, mel_energies = paddle.concat( (signal_log_energy, mel_energies), axis=1) + # (m, n_mels + 1) mel_energies = _subtract_column_mean(mel_energies, subtract_mean) return mel_energies -def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor: - # returns a dct matrix of size (num_mel_bins, num_ceps) - # size (num_mel_bins, num_mel_bins) - dct_matrix = create_dct(num_mel_bins, num_mel_bins, 'ortho') - # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins) - # this would be the first column in the dct_matrix for torchaudio as it expects a - # right multiply (which would be the first column 
of the kaldi's dct_matrix as kaldi - # expects a left multiply e.g. dct_matrix * vector). - dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins)) - dct_matrix = dct_matrix[:, :num_ceps] +def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor: + dct_matrix = create_dct(n_mels, n_mels, 'ortho') + dct_matrix[:, 0] = math.sqrt(1 / float(n_mels)) + dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc) return dct_matrix -def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor: - # returns size (num_ceps) - # Compute liftering coefficients (scaling on cepstral coeffs) - # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected. - i = paddle.arange(num_ceps) +def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor: + i = paddle.arange(n_mfcc) return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / cepstral_lifter) @@ -567,14 +523,13 @@ def mfcc(waveform: Tensor, high_freq: float=0.0, htk_compat: bool=False, low_freq: float=20.0, - num_ceps: int=13, - min_duration: float=0.0, - num_mel_bins: int=23, + n_mfcc: int=13, + n_mels: int=23, preemphasis_coefficient: float=0.97, raw_energy: bool=True, remove_dc_offset: bool=True, round_to_power_of_two: bool=True, - sample_frequency: float=16000.0, + sr: int=16000, snip_edges: bool=True, subtract_mean: bool=False, use_energy: bool=False, @@ -582,47 +537,47 @@ def mfcc(waveform: Tensor, vtln_low: float=100.0, vtln_warp: float=1.0, window_type: str=POVEY) -> Tensor: - """[summary] + """Compute and return mel frequency cepstral coefficients from a waveform. The output is + identical to Kaldi's. Args: - waveform (Tensor): [description] - blackman_coeff (float, optional): [description]. Defaults to 0.42. - cepstral_lifter (float, optional): [description]. Defaults to 22.0. - channel (int, optional): [description]. Defaults to -1. - dither (float, optional): [description]. Defaults to 0.0. - energy_floor (float, optional): [description]. Defaults to 1.0. 
- frame_length (float, optional): [description]. Defaults to 25.0. - frame_shift (float, optional): [description]. Defaults to 10.0. - high_freq (float, optional): [description]. Defaults to 0.0. - htk_compat (bool, optional): [description]. Defaults to False. - low_freq (float, optional): [description]. Defaults to 20.0. - num_ceps (int, optional): [description]. Defaults to 13. - min_duration (float, optional): [description]. Defaults to 0.0. - num_mel_bins (int, optional): [description]. Defaults to 23. - preemphasis_coefficient (float, optional): [description]. Defaults to 0.97. - raw_energy (bool, optional): [description]. Defaults to True. - remove_dc_offset (bool, optional): [description]. Defaults to True. - round_to_power_of_two (bool, optional): [description]. Defaults to True. - sample_frequency (float, optional): [description]. Defaults to 16000.0. - snip_edges (bool, optional): [description]. Defaults to True. - subtract_mean (bool, optional): [description]. Defaults to False. - use_energy (bool, optional): [description]. Defaults to False. - vtln_high (float, optional): [description]. Defaults to -500.0. - vtln_low (float, optional): [description]. Defaults to 100.0. - vtln_warp (float, optional): [description]. Defaults to 1.0. - window_type (str, optional): [description]. Defaults to POVEY. + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. 
+ high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: [description] + Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). 
""" - assert num_ceps <= num_mel_bins, 'num_ceps cannot be larger than num_mel_bins: %d vs %d' % ( - num_ceps, num_mel_bins) + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) dtype = waveform.dtype - # The mel_energies should not be squared (use_power=True), not have mean subtracted - # (subtract_mean=False), and use log (use_log_fbank=True). - # size (m, num_mel_bins + use_energy) + # (m, n_mels + use_energy) feature = fbank( waveform=waveform, blackman_coeff=blackman_coeff, @@ -634,13 +589,12 @@ def mfcc(waveform: Tensor, high_freq=high_freq, htk_compat=htk_compat, low_freq=low_freq, - min_duration=min_duration, - num_mel_bins=num_mel_bins, + n_mels=n_mels, preemphasis_coefficient=preemphasis_coefficient, raw_energy=raw_energy, remove_dc_offset=remove_dc_offset, round_to_power_of_two=round_to_power_of_two, - sample_frequency=sample_frequency, + sr=sr, snip_edges=snip_edges, subtract_mean=False, use_energy=use_energy, @@ -652,34 +606,29 @@ def mfcc(waveform: Tensor, window_type=window_type) if use_energy: - # size (m) - signal_log_energy = feature[:, num_mel_bins if htk_compat else 0] - # offset is 0 if htk_compat==True else 1 + # (m) + signal_log_energy = feature[:, n_mels if htk_compat else 0] mel_offset = int(not htk_compat) - feature = feature[:, mel_offset:(num_mel_bins + mel_offset)] + feature = feature[:, mel_offset:(n_mels + mel_offset)] - # size (num_mel_bins, num_ceps) - dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).astype(dtype=dtype) + # (n_mels, n_mfcc) + dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype) - # size (m, num_ceps) + # (m, n_mfcc) feature = feature.matmul(dct_matrix) if cepstral_lifter != 0.0: - # size (1, num_ceps) - lifter_coeffs = _get_lifter_coeffs(num_ceps, - cepstral_lifter).unsqueeze(0) + # (1, n_mfcc) + lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0) feature *= lifter_coeffs.astype(dtype=dtype) - # if use_energy then replace the last 
column for htk_compat == true else first column if use_energy: feature[:, 0] = signal_log_energy if htk_compat: - energy = feature[:, 0].unsqueeze(1) # size (m, 1) - feature = feature[:, 1:] # size (m, num_ceps - 1) + energy = feature[:, 0].unsqueeze(1) # (m, 1) + feature = feature[:, 1:] # (m, n_mfcc - 1) if not use_energy: - # scale on C0 (actually removing a scale we previously added that's - # part of one common definition of the cosine transform.) energy *= math.sqrt(2) feature = paddle.concat((feature, energy), axis=1) diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 69f462542..16fa00817 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -261,12 +261,18 @@ class MFCC(nn.Layer): sr: int=22050, n_mfcc: int=40, norm: str='ortho', + dtype: str=paddle.float32, **kwargs): - """[summary] + """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. + Parameters: - sr (int, optional): [description]. Defaults to 22050. - n_mfcc (int, optional): [description]. Defaults to 40. - norm (str, optional): [description]. Defaults to 'ortho'. + sr(int): the audio sample rate. + The default value is 22050. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
""" super(MFCC, self).__init__() self._log_melspectrogram = LogMelSpectrogram(sr=sr, **kwargs) From e50c1b3b1d61695369478e81ab9f5280416d7ba2 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Thu, 3 Mar 2022 20:39:26 +0800 Subject: [PATCH 09/17] add server test, test=doc --- demos/speech_server/README.md | 5 +- demos/speech_server/README_cn.md | 2 + demos/speech_server/conf/application.yaml | 2 +- paddlespeech/server/conf/application.yaml | 2 +- tests/unit/server/change_yaml.py | 114 ++++++++++++++ tests/unit/server/conf/application.yaml | 27 ++++ tests/unit/server/conf/asr/asr.yaml | 8 + tests/unit/server/conf/asr/asr_pd.yaml | 26 +++ tests/unit/server/conf/tts/tts.yaml | 32 ++++ tests/unit/server/conf/tts/tts_pd.yaml | 42 +++++ tests/unit/server/test_server_client.sh | 184 ++++++++++++++++++++++ 11 files changed, 440 insertions(+), 4 deletions(-) create mode 100644 tests/unit/server/change_yaml.py create mode 100644 tests/unit/server/conf/application.yaml create mode 100644 tests/unit/server/conf/asr/asr.yaml create mode 100644 tests/unit/server/conf/asr/asr_pd.yaml create mode 100644 tests/unit/server/conf/tts/tts.yaml create mode 100644 tests/unit/server/conf/tts/tts_pd.yaml create mode 100644 tests/unit/server/test_server_client.sh diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index ac5cc4b00..515abaf66 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -10,12 +10,13 @@ This demo is an implementation of starting the voice service and accessing the s ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). +It is recommended to use **paddlepaddle 2.2.1** or above. You can choose one way from easy, meduim and hard to install paddlespeech. ### 2. Prepare config File The configuration file contains the service-related configuration files and the model configuration related to the voice tasks contained in the service. 
They are all under the `conf` folder. -**Note: The configuration of `engine_backend` in `application.yaml` represents all speech tasks included in the started service. ** +**Note: The configuration of `engine_backend` in `application.yaml` represents all speech tasks included in the started service.** If the service you want to start contains only a certain speech task, then you need to comment out the speech tasks that do not need to be included. For example, if you only want to use the speech recognition (ASR) service, then you can comment out the speech synthesis (TTS) service, as in the following example: ```bash engine_backend: @@ -23,7 +24,7 @@ engine_backend: #tts: 'conf/tts/tts.yaml' ``` -**Note: The configuration file of `engine_backend` in `application.yaml` needs to match the configuration type of `engine_type`. ** +**Note: The configuration file of `engine_backend` in `application.yaml` needs to match the configuration type of `engine_type`.** When the configuration file of `engine_backend` is `XXX.yaml`, the configuration type of `engine_type` needs to be set to `python`; when the configuration file of `engine_backend` is `XXX_pd.yaml`, the configuration of `engine_type` needs to be set type is `inference`; The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index f202a30cd..05884fbd2 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -10,8 +10,10 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). +推荐使用 **paddlepaddle 2.2.1** 或以上版本。 你可以从 easy,medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 + ### 2. 
准备配置文件 配置文件包含服务相关的配置文件和服务中包含的语音任务相关的模型配置。 它们都在 `conf` 文件夹下。 **注意:`application.yaml` 中 `engine_backend` 的配置表示启动的服务中包含的所有语音任务。** diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 6dcae74a9..aba33a514 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -3,7 +3,7 @@ ################################################################## # SERVER SETTING # ################################################################## -host: '127.0.0.1' +host: 127.0.0.1 port: 8090 ################################################################## diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 6dcae74a9..aba33a514 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -3,7 +3,7 @@ ################################################################## # SERVER SETTING # ################################################################## -host: '127.0.0.1' +host: 127.0.0.1 port: 8090 ################################################################## diff --git a/tests/unit/server/change_yaml.py b/tests/unit/server/change_yaml.py new file mode 100644 index 000000000..5a5d9ae01 --- /dev/null +++ b/tests/unit/server/change_yaml.py @@ -0,0 +1,114 @@ +#!/usr/bin/python +import argparse +import os + +import yaml + + +def change_speech_yaml(yaml_name: str, device: str): + """Change the settings of the device under the voice task configuration file + + Args: + yaml_name (str): asr or asr_pd or tts or tts_pd + cpu (bool): True means set device to "cpu" + model_type (dict): change model type + """ + if "asr" in yaml_name: + dirpath = "./conf/asr/" + elif 'tts' in yaml_name: + dirpath = "./conf/tts/" + yamlfile = dirpath + yaml_name + ".yaml" + tmp_yamlfile = dirpath + yaml_name + "_tmp.yaml" + os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + + with open(tmp_yamlfile) as f, open(yamlfile, 
"w+", encoding="utf-8") as fw: + y = yaml.safe_load(f) + if device == 'cpu': + print("Set device: cpu") + if yaml_name == 'asr': + y['device'] = 'cpu' + elif yaml_name == 'asr_pd': + y['am_predictor_conf']['device'] = 'cpu' + elif yaml_name == 'tts': + y['device'] = 'cpu' + elif yaml_name == 'tts_pd': + y['am_predictor_conf']['device'] = 'cpu' + y['voc_predictor_conf']['device'] = 'cpu' + elif device == 'gpu': + print("Set device: gpu") + if yaml_name == 'asr': + y['device'] = 'gpu:0' + elif yaml_name == 'asr_pd': + y['am_predictor_conf']['device'] = 'gpu:0' + elif yaml_name == 'tts': + y['device'] = 'gpu:0' + elif yaml_name == 'tts_pd': + y['am_predictor_conf']['device'] = 'gpu:0' + y['voc_predictor_conf']['device'] = 'gpu:0' + else: + print("Please set correct device: cpu or gpu.") + + print("The content of '%s': " % (yamlfile)) + print(yaml.dump(y, default_flow_style=False, sort_keys=False)) + yaml.dump(y, fw, allow_unicode=True) + os.system("rm %s" % (tmp_yamlfile)) + print("Change %s successfully." % (yamlfile)) + + +def change_app_yaml(task: str, engine_type: str): + """Change the engine type and corresponding configuration file of the speech task in application.yaml + + Args: + task (str): asr or tts + """ + yamlfile = "./conf/application.yaml" + tmp_yamlfile = "./conf/application_tmp.yaml" + os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: + y = yaml.safe_load(f) + y['engine_type'][task] = engine_type + path_list = ["./conf/", task, "/", task] + if engine_type == 'python': + path_list.append(".yaml") + + elif engine_type == 'inference': + path_list.append("_pd.yaml") + y['engine_backend'][task] = ''.join(path_list) + print("The content of './conf/application.yaml': ") + print(yaml.dump(y, default_flow_style=False, sort_keys=False)) + yaml.dump(y, fw, allow_unicode=True) + os.system("rm %s" % (tmp_yamlfile)) + print("Change %s successfully." 
% (yamlfile)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--change_task', + type=str, + default=None, + help='Change task', + choices=[ + 'app-asr-python', + 'app-asr-inference', + 'app-tts-python', + 'app-tts-inference', + 'speech-asr-cpu', + 'speech-asr-gpu', + 'speech-asr_pd-cpu', + 'speech-asr_pd-gpu', + 'speech-tts-cpu', + 'speech-tts-gpu', + 'speech-tts_pd-cpu', + 'speech-tts_pd-gpu', + ], + required=True) + args = parser.parse_args() + + types = args.change_task.split("-") + if types[0] == "app": + change_app_yaml(types[1], types[2]) + elif types[0] == "speech": + change_speech_yaml(types[1], types[2]) + else: + print("Error change task, please check change_task.") diff --git a/tests/unit/server/conf/application.yaml b/tests/unit/server/conf/application.yaml new file mode 100644 index 000000000..aba33a514 --- /dev/null +++ b/tests/unit/server/conf/application.yaml @@ -0,0 +1,27 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################## +# SERVER SETTING # +################################################################## +host: 127.0.0.1 +port: 8090 + +################################################################## +# CONFIG FILE # +################################################################## +# add engine backend type (Options: asr, tts) and config file here. +# Adding a speech task to engine_backend means starting the service. +engine_backend: + asr: 'conf/asr/asr.yaml' + tts: 'conf/tts/tts.yaml' + +# The engine_type of speech task needs to keep the same type as the config file of speech task. 
+# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' +# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' +# +# add engine type (Options: python, inference) +engine_type: + asr: 'python' + tts: 'python' + + diff --git a/tests/unit/server/conf/asr/asr.yaml b/tests/unit/server/conf/asr/asr.yaml new file mode 100644 index 000000000..a6743b775 --- /dev/null +++ b/tests/unit/server/conf/asr/asr.yaml @@ -0,0 +1,8 @@ +model: 'conformer_wenetspeech' +lang: 'zh' +sample_rate: 16000 +cfg_path: # [optional] +ckpt_path: # [optional] +decode_method: 'attention_rescoring' +force_yes: True +device: # set 'gpu:id' or 'cpu' diff --git a/tests/unit/server/conf/asr/asr_pd.yaml b/tests/unit/server/conf/asr/asr_pd.yaml new file mode 100644 index 000000000..4c415ac79 --- /dev/null +++ b/tests/unit/server/conf/asr/asr_pd.yaml @@ -0,0 +1,26 @@ +# This is the parameter configuration file for ASR server. +# These are the static models that support paddle inference. 
+ +################################################################## +# ACOUSTIC MODEL SETTING # +# am choices=['deepspeech2offline_aishell'] TODO +################################################################## +model_type: 'deepspeech2offline_aishell' +am_model: # the pdmodel file of am static model [optional] +am_params: # the pdiparams file of am static model [optional] +lang: 'zh' +sample_rate: 16000 +cfg_path: +decode_method: +force_yes: True + +am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################################################## +# OTHERS # +################################################################## diff --git a/tests/unit/server/conf/tts/tts.yaml b/tests/unit/server/conf/tts/tts.yaml new file mode 100644 index 000000000..19207f0b0 --- /dev/null +++ b/tests/unit/server/conf/tts/tts.yaml @@ -0,0 +1,32 @@ +# This is the parameter configuration file for TTS server. 
+ +################################################################## +# ACOUSTIC MODEL SETTING # +# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', +# 'fastspeech2_ljspeech', 'fastspeech2_aishell3', +# 'fastspeech2_vctk'] +################################################################## +am: 'fastspeech2_csmsc' +am_config: +am_ckpt: +am_stat: +phones_dict: +tones_dict: +speaker_dict: +spk_id: 0 + +################################################################## +# VOCODER SETTING # +# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', +# 'pwgan_vctk', 'mb_melgan_csmsc'] +################################################################## +voc: 'pwgan_csmsc' +voc_config: +voc_ckpt: +voc_stat: + +################################################################## +# OTHERS # +################################################################## +lang: 'zh' +device: # set 'gpu:id' or 'cpu' diff --git a/tests/unit/server/conf/tts/tts_pd.yaml b/tests/unit/server/conf/tts/tts_pd.yaml new file mode 100644 index 000000000..e27b9665b --- /dev/null +++ b/tests/unit/server/conf/tts/tts_pd.yaml @@ -0,0 +1,42 @@ +# This is the parameter configuration file for TTS server. +# These are the static models that support paddle inference. 
+ +################################################################## +# ACOUSTIC MODEL SETTING # +# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] +################################################################## +am: 'fastspeech2_csmsc' +am_model: # the pdmodel file of your am static model (XX.pdmodel) +am_params: # the pdiparams file of your am static model (XX.pdipparams) +am_sample_rate: 24000 +phones_dict: +tones_dict: +speaker_dict: +spk_id: 0 + +am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################################################## +# VOCODER SETTING # +# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] +################################################################## +voc: 'pwgan_csmsc' +voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) +voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) +voc_sample_rate: 24000 + +voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + +################################################################## +# OTHERS # +################################################################## +lang: 'zh' diff --git a/tests/unit/server/test_server_client.sh b/tests/unit/server/test_server_client.sh new file mode 100644 index 000000000..8f6a13687 --- /dev/null +++ b/tests/unit/server/test_server_client.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +StartService(){ + # Start service + paddlespeech_server start --config_file $config_file 1>>log/server.log 2>>log/server.log.wf & + echo $! 
> pid + + start_num=$(cat log/server.log.wf | grep "INFO: Uvicorn running on http://" -c) + flag="normal" + while [[ $start_num -lt $target_start_num && $flag == "normal" ]] + do + start_num=$(cat log/server.log.wf | grep "INFO: Uvicorn running on http://" -c) + # start service failed + if [ $(cat log/server.log.wf | grep -i "error" -c) -gt $error_time ];then + echo "Service started failed." | tee -a ./log/test_result.log + error_time=$(cat log/server.log.wf | grep -i "error" -c) + flag="unnormal" + fi + done +} + +ClientTest(){ + # Client test + # test asr client + paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav + ((test_times+=1)) + paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav + ((test_times+=1)) + + # test tts client + paddlespeech_client tts --server_ip $server_ip --port $port --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ((test_times+=1)) + paddlespeech_client tts --server_ip $server_ip --port $port --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ((test_times+=1)) +} + +GetTestResult() { + # Determine if the test was successful + response_success_time=$(cat log/server.log | grep "200 OK" -c) + if (( $response_success_time == $test_times )) ; then + echo "Testing successfully. The service configuration is: asr engine type: $1; tts engine type: $1; device: $2." | tee -a ./log/test_result.log + else + echo "Testing failed. The service configuration is: asr engine type: $1; tts engine type: $1; device: $2." 
| tee -a ./log/test_result.log + fi + test_times=$response_success_time +} + + +mkdir -p log +rm -rf log/server.log.wf +rm -rf log/server.log +rm -rf log/test_result.log + +config_file=./conf/application.yaml +server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}') +port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}') + +echo "Sevice ip: $server_ip" | tee ./log/test_result.log +echo "Sevice port: $port" | tee -a ./log/test_result.log + +# whether a process is listening on $port +pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'` +if [ "$pid" != "" ]; then + echo "The port: $port is occupied, please change another port" + exit +fi + +# download test audios for ASR client +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + + +target_start_num=0 # the number of start service +test_times=0 # The number of client test +error_time=0 # The number of error occurrences in the startup failure server.log.wf file + +# start server: asr engine type: python; tts engine type: python; device: gpu +echo "Start the service: asr engine type: python; tts engine type: python; device: gpu" | tee -a ./log/test_result.log +((target_start_num+=1)) +StartService + +if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then + echo "Service started successfully." | tee -a ./log/test_result.log + ClientTest + echo "This round of testing is over." | tee -a ./log/test_result.log + + GetTestResult python gpu +else + echo "Service failed to start, no client test." 
+ target_start_num=$start_num + +fi + +kill -9 `cat pid` +rm -rf pid +sleep 2s +echo "**************************************************************************************" | tee -a ./log/test_result.log + + + +# start server: asr engine type: python; tts engine type: python; device: cpu +python change_yaml.py --change_task speech-asr-cpu # change asr.yaml device: cpu +python change_yaml.py --change_task speech-tts-cpu # change tts.yaml device: cpu + +echo "Start the service: asr engine type: python; tts engine type: python; device: cpu" | tee -a ./log/test_result.log +((target_start_num+=1)) +StartService + +if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then + echo "Service started successfully." | tee -a ./log/test_result.log + ClientTest + echo "This round of testing is over." | tee -a ./log/test_result.log + + GetTestResult python cpu +else + echo "Service failed to start, no client test." + target_start_num=$start_num + +fi + +kill -9 `cat pid` +rm -rf pid +sleep 2s +echo "**************************************************************************************" | tee -a ./log/test_result.log + + +# start server: asr engine type: inference; tts engine type: inference; device: gpu +python change_yaml.py --change_task app-asr-inference # change application.yaml, asr engine_type: inference; asr engine_backend: asr_pd.yaml +python change_yaml.py --change_task app-tts-inference # change application.yaml, tts engine_type: inference; tts engine_backend: tts_pd.yaml + +echo "Start the service: asr engine type: inference; tts engine type: inference; device: gpu" | tee -a ./log/test_result.log +((target_start_num+=1)) +StartService + +if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then + echo "Service started successfully." | tee -a ./log/test_result.log + ClientTest + echo "This round of testing is over." | tee -a ./log/test_result.log + + GetTestResult inference gpu +else + echo "Service failed to start, no client test." 
+ target_start_num=$start_num + +fi + +kill -9 `cat pid` +rm -rf pid +sleep 2s +echo "**************************************************************************************" | tee -a ./log/test_result.log + + +# start server: asr engine type: inference; tts engine type: inference; device: cpu +python change_yaml.py --change_task speech-asr_pd-cpu # change asr_pd.yaml device: cpu +python change_yaml.py --change_task speech-tts_pd-cpu # change tts_pd.yaml device: cpu + +echo "start the service: asr engine type: inference; tts engine type: inference; device: cpu" | tee -a ./log/test_result.log +((target_start_num+=1)) +StartService + +if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then + echo "Service started successfully." | tee -a ./log/test_result.log + ClientTest + echo "This round of testing is over." | tee -a ./log/test_result.log + + GetTestResult inference cpu +else + echo "Service failed to start, no client test." + target_start_num=$start_num + +fi + +kill -9 `cat pid` +rm -rf pid +sleep 2s +echo "**************************************************************************************" | tee -a ./log/test_result.log + +echo "All tests completed." 
 | tee -a ./log/test_result.log + +# show all the test results +echo "***************** Here are all the test results ********************" +cat ./log/test_result.log + +# Restoring conf is the same as demos/speech_server +cp ../../../demos/speech_server/conf/ ./ -rf \ No newline at end of file From 933f879a2835446ebbd47f9c3b78c86b790931a8 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Thu, 3 Mar 2022 20:43:13 +0800 Subject: [PATCH 10/17] add usage, test=doc --- tests/unit/server/test_server_client.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/server/test_server_client.sh b/tests/unit/server/test_server_client.sh index 8f6a13687..795a23e01 100644 --- a/tests/unit/server/test_server_client.sh +++ b/tests/unit/server/test_server_client.sh @@ -1,4 +1,5 @@ #!/bin/bash +# bash test_server_client.sh StartService(){ # Start service From 4eb780ad2b1ed91c4e170699b742cc10b894ec24 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 4 Mar 2022 11:28:14 +0800 Subject: [PATCH 11/17] Add reference and doc string. --- paddleaudio/paddleaudio/compliance/kaldi.py | 1 + paddleaudio/paddleaudio/features/layers.py | 116 +++++++++++++----- .../paddleaudio/functional/functional.py | 13 +- 3 files changed, 94 insertions(+), 36 deletions(-) diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index 35d7072ca..8cb9b6660 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
+# Modified from torchaudio(https://github.com/pytorch/audio) import math from typing import Tuple diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 16fa00817..4a2c1673a 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -44,22 +44,22 @@ class Spectrogram(nn.Layer): The spectorgram is defined as the complex norm of the short-time Fourier transformation. Parameters: - n_fft(int): the number of frequency components of the discrete Fourier transform. + n_fft (int): the number of frequency components of the discrete Fourier transform. The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. The default value is None. win_length: the window length of the short time FFt. If None, it is set to same as n_fft. The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform. + window (str): the name of the window function applied to the single before the Fourier transform. The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. The default value is 'hann' - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. If False, frame t begins at x[t * hop_length] The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' and 'constant'. The default value is 'reflect'. - dtype(str): the data type of input and window. + dtype (str): the data type of input and window. 
Notes: The Spectrogram transform relies on STFT transform to compute the spectrogram. By default, the weights are not learnable. To fine-tune the Fourier coefficients, @@ -190,39 +190,39 @@ class LogMelSpectrogram(nn.Layer): """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, typically an audio waveform. Parameters: - sr(int): the audio sample rate. + sr (int): the audio sample rate. The default value is 22050. - n_fft(int): the number of frequency components of the discrete Fourier transform. + n_fft (int): the number of frequency components of the discrete Fourier transform. The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. The default value is None. win_length: the window length of the short time FFt. If None, it is set to same as n_fft. The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform. + window (str): the name of the window function applied to the single before the Fourier transform. The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. The default value is 'hann' - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. If False, frame t begins at x[t * hop_length] The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' and 'constant'. The default value is 'reflect'. - n_mels(int): the mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. 
- f_max(float): the upper cut-off frequency, above which the filter response is zeros. - ref_value(float): the reference value. If smaller than 1.0, the db level - htk(bool): whether to use HTK formula in computing fbank matrix. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + n_mels (int): the mel bins. + f_min (float): the lower cut-off frequency, below which the filter response is zero. + f_max (float): the upper cut-off frequency, above which the filter response is zeros. + htk (bool): whether to use HTK formula in computing fbank matrix. + norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - amin(float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. + ref_value (float): the reference value. If smaller than 1.0, the db level + amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. magnitude is clipped(to amin). For numerical stability, set amin to a larger value, e.g., 1e-3. - top_db(float): the maximum db value of resulting spectrum, above which the + top_db (float): the maximum db value of resulting spectrum, above which the spectrum is clipped(to top_db). + dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
""" super(LogMelSpectrogram, self).__init__() @@ -260,24 +260,80 @@ class MFCC(nn.Layer): def __init__(self, sr: int=22050, n_mfcc: int=40, - norm: str='ortho', - dtype: str=paddle.float32, - **kwargs): + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None, + dtype: str=paddle.float32): """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. Parameters: sr(int): the audio sample rate. The default value is 22050. n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + n_fft (int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window (str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels (int): the mel bins. + f_min (float): the lower cut-off frequency, below which the filter response is zero. 
+ f_max (float): the upper cut-off frequency, above which the filter response is zeros. + htk (bool): whether to use HTK formula in computing fbank matrix. + norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + ref_value (float): the reference value. If smaller than 1.0, the db level + amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. + Otherwise, the db level is pushed down. + magnitude is clipped(to amin). For numerical stability, set amin to a larger value, + e.g., 1e-3. + top_db (float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
""" super(MFCC, self).__init__() - self._log_melspectrogram = LogMelSpectrogram(sr=sr, **kwargs) - self.dct_matrix = create_dct( - n_mfcc=n_mfcc, n_mels=self._log_melspectrogram.n_mels, norm=norm) + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) + self._log_melspectrogram = LogMelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + ref_value=ref_value, + amin=amin, + top_db=top_db, + dtype=dtype) + self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) self.register_buffer('dct_matrix', self.dct_matrix) def forward(self, x): diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py index c07f14fd8..c5ab30453 100644 --- a/paddleaudio/paddleaudio/functional/functional.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -242,14 +242,15 @@ def power_to_db(magnitude: paddle.Tensor, def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]='ortho', - dtype: Optional[str]=paddle.float32): - """[summary] + dtype: Optional[str]=paddle.float32) -> paddle.Tensor: + """Create a discrete cosine transform(DCT) matrix. + Parameters: - n_mfcc (int): [description] - n_mels (int): [description] - norm (str, optional): [description]. Defaults to 'ortho'. + n_mfcc (int): Number of mel frequency cepstral coefficients. + n_mels (int): Number of mel filterbanks. + norm (str, optional): Normalizaiton type. Defaults to 'ortho'. Returns: - [type]: [description] + Tensor: The DCT matrix with shape (n_mels, n_mfcc). 
""" n = paddle.arange(n_mels, dtype=dtype) k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) From 0bb9c3eaf2500c60d13c56c8069877ef1c504b3e Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 4 Mar 2022 14:37:58 +0800 Subject: [PATCH 12/17] Update README_cn.md --- demos/speech_server/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 05884fbd2..14f25e81f 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -86,7 +86,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 4. ASR客户端使用方法 -**注意:**初次使用客户端时响应时间会略长 +**注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav From a3b789512a73568829a3171a1f31bb5aa42a2b65 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 4 Mar 2022 14:38:39 +0800 Subject: [PATCH 13/17] Update README_cn.md --- demos/speech_server/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 14f25e81f..e4e50c0be 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -135,7 +135,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 5. 
TTS客户端使用方法 -**注意:**初次使用客户端时响应时间会略长 +**注意:** 初次使用客户端时响应时间会略长 ```bash paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav ``` From 10ab7aabfed63ef431f66f301630cd61e828194c Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 4 Mar 2022 14:41:39 +0800 Subject: [PATCH 14/17] Update README_cn.md --- demos/speech_server/README_cn.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index e4e50c0be..da05b686e 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -136,9 +136,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 5. TTS客户端使用方法 **注意:** 初次使用客户端时响应时间会略长 - ```bash - paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav - ``` +- 命令行 (推荐使用) + + ```bash + paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` 使用帮助: ```bash From 3b304544f6187b91368c66e5a5b16840f69d175c Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 7 Mar 2022 18:19:17 +0800 Subject: [PATCH 15/17] modify yaml, test=doc --- demos/speech_server/README.md | 17 +-- demos/speech_server/README_cn.md | 17 +-- demos/speech_server/conf/application.yaml | 120 +++++++++++++++--- demos/speech_server/conf/asr/asr.yaml | 8 -- demos/speech_server/conf/asr/asr_pd.yaml | 26 ---- demos/speech_server/conf/tts/tts.yaml | 32 ----- demos/speech_server/conf/tts/tts_pd.yaml | 42 ------ demos/speech_server/server.sh | 2 +- paddlespeech/server/bin/main.py | 2 +- .../server/bin/paddlespeech_server.py | 2 +- paddlespeech/server/conf/application.yaml | 120 +++++++++++++++--- paddlespeech/server/conf/asr/asr.yaml | 8 -- paddlespeech/server/conf/asr/asr_pd.yaml | 26 ---- paddlespeech/server/conf/tts/tts.yaml | 32 ----- paddlespeech/server/conf/tts/tts_pd.yaml | 42 ------ 
.../engine/asr/paddleinference/asr_engine.py | 5 +- .../server/engine/asr/python/asr_engine.py | 6 +- paddlespeech/server/engine/engine_pool.py | 10 +- .../engine/tts/paddleinference/tts_engine.py | 5 +- .../server/engine/tts/python/tts_engine.py | 5 +- tests/unit/server/change_yaml.py | 109 ++++++++-------- tests/unit/server/conf/application.yaml | 120 +++++++++++++++--- tests/unit/server/conf/asr/asr.yaml | 8 -- tests/unit/server/conf/asr/asr_pd.yaml | 26 ---- tests/unit/server/conf/tts/tts.yaml | 32 ----- tests/unit/server/conf/tts/tts_pd.yaml | 42 ------ tests/unit/server/test_server_client.sh | 13 +- 27 files changed, 385 insertions(+), 492 deletions(-) delete mode 100644 demos/speech_server/conf/asr/asr.yaml delete mode 100644 demos/speech_server/conf/asr/asr_pd.yaml delete mode 100644 demos/speech_server/conf/tts/tts.yaml delete mode 100644 demos/speech_server/conf/tts/tts_pd.yaml delete mode 100644 paddlespeech/server/conf/asr/asr.yaml delete mode 100644 paddlespeech/server/conf/asr/asr_pd.yaml delete mode 100644 paddlespeech/server/conf/tts/tts.yaml delete mode 100644 paddlespeech/server/conf/tts/tts_pd.yaml delete mode 100644 tests/unit/server/conf/asr/asr.yaml delete mode 100644 tests/unit/server/conf/asr/asr_pd.yaml delete mode 100644 tests/unit/server/conf/tts/tts.yaml delete mode 100644 tests/unit/server/conf/tts/tts_pd.yaml diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 515abaf66..a2f6f2213 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -11,21 +11,14 @@ This demo is an implementation of starting the voice service and accessing the s see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). It is recommended to use **paddlepaddle 2.2.1** or above. -You can choose one way from easy, meduim and hard to install paddlespeech. +You can choose one way from meduim and hard to install paddlespeech. ### 2. 
Prepare config File -The configuration file contains the service-related configuration files and the model configuration related to the voice tasks contained in the service. They are all under the `conf` folder. +The configuration file can be found in `conf/application.yaml` . +Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of _. +At present, the speech tasks integrated by the service include: asr (speech recognition) and tts (speech synthesis). +Currently the engine type supports two forms: python and inference (Paddle Inference) -**Note: The configuration of `engine_backend` in `application.yaml` represents all speech tasks included in the started service.** -If the service you want to start contains only a certain speech task, then you need to comment out the speech tasks that do not need to be included. For example, if you only want to use the speech recognition (ASR) service, then you can comment out the speech synthesis (TTS) service, as in the following example: -```bash -engine_backend: - asr: 'conf/asr/asr.yaml' - #tts: 'conf/tts/tts.yaml' -``` - -**Note: The configuration file of `engine_backend` in `application.yaml` needs to match the configuration type of `engine_type`.** -When the configuration file of `engine_backend` is `XXX.yaml`, the configuration type of `engine_type` needs to be set to `python`; when the configuration file of `engine_backend` is `XXX_pd.yaml`, the configuration of `engine_type` needs to be set type is `inference`; The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index da05b686e..762248a11 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -11,20 +11,15 @@ 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). 
推荐使用 **paddlepaddle 2.2.1** 或以上版本。 -你可以从 easy,medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +你可以从 medium,hard 两种方式中选择一种方式安装 PaddleSpeech。 ### 2. 准备配置文件 -配置文件包含服务相关的配置文件和服务中包含的语音任务相关的模型配置。 它们都在 `conf` 文件夹下。 -**注意:`application.yaml` 中 `engine_backend` 的配置表示启动的服务中包含的所有语音任务。** -如果你想启动的服务中只包含某项语音任务,那么你需要注释掉不需要包含的语音任务。例如你只想使用语音识别(ASR)服务,那么你可以将语音合成(TTS)服务注释掉,如下示例: -```bash -engine_backend: - asr: 'conf/asr/asr.yaml' - #tts: 'conf/tts/tts.yaml' -``` -**注意:`application.yaml` 中 `engine_backend` 的配置文件需要和 `engine_type` 的配置类型匹配。** -当`engine_backend` 的配置文件为`XXX.yaml`时,需要设置`engine_type`的配置类型为`python`;当`engine_backend` 的配置文件为`XXX_pd.yaml`时,需要设置`engine_type`的配置类型为`inference`; +配置文件可参见 `conf/application.yaml` 。 +其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 +目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)。 +目前引擎类型支持两种形式:python 及 inference (Paddle Inference) + 这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index aba33a514..6048450b7 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -1,27 +1,107 @@ # This is the parameter configuration file for PaddleSpeech Serving. -################################################################## -# SERVER SETTING # -################################################################## +################################################################################# +# SERVER SETTING # +################################################################################# host: 127.0.0.1 port: 8090 -################################################################## -# CONFIG FILE # -################################################################## -# add engine backend type (Options: asr, tts) and config file here. -# Adding a speech task to engine_backend means starting the service.
-engine_backend: - asr: 'conf/asr/asr.yaml' - tts: 'conf/tts/tts.yaml' - -# The engine_type of speech task needs to keep the same type as the config file of speech task. -# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' -# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' -# -# add engine type (Options: python, inference) -engine_type: - asr: 'python' - tts: 'python' +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] +engine_list: ['asr_python', 'tts_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_wenetspeech' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + device: # set 'gpu:id' or 'cpu' + + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 'fastspeech2_aishell3', + # 'fastspeech2_vctk'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + 
am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 'mb_melgan_csmsc'] + voc: 'pwgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'pwgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' diff --git a/demos/speech_server/conf/asr/asr.yaml b/demos/speech_server/conf/asr/asr.yaml deleted file mode 100644 index a6743b775..000000000 --- a/demos/speech_server/conf/asr/asr.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: 'conformer_wenetspeech' -lang: 'zh' -sample_rate: 16000 -cfg_path: # [optional] -ckpt_path: # [optional] -decode_method: 'attention_rescoring' -force_yes: True -device: # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/asr/asr_pd.yaml b/demos/speech_server/conf/asr/asr_pd.yaml deleted file mode 100644 index 4c415ac79..000000000 --- 
a/demos/speech_server/conf/asr/asr_pd.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# This is the parameter configuration file for ASR server. -# These are the static models that support paddle inference. - -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['deepspeech2offline_aishell'] TODO -################################################################## -model_type: 'deepspeech2offline_aishell' -am_model: # the pdmodel file of am static model [optional] -am_params: # the pdiparams file of am static model [optional] -lang: 'zh' -sample_rate: 16000 -cfg_path: -decode_method: -force_yes: True - -am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - -################################################################## -# OTHERS # -################################################################## diff --git a/demos/speech_server/conf/tts/tts.yaml b/demos/speech_server/conf/tts/tts.yaml deleted file mode 100644 index 19207f0b0..000000000 --- a/demos/speech_server/conf/tts/tts.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# This is the parameter configuration file for TTS server. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', -# 'fastspeech2_ljspeech', 'fastspeech2_aishell3', -# 'fastspeech2_vctk'] -################################################################## -am: 'fastspeech2_csmsc' -am_config: -am_ckpt: -am_stat: -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', -# 'pwgan_vctk', 'mb_melgan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_config: -voc_ckpt: -voc_stat: - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' -device: # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/tts/tts_pd.yaml b/demos/speech_server/conf/tts/tts_pd.yaml deleted file mode 100644 index e27b9665b..000000000 --- a/demos/speech_server/conf/tts/tts_pd.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# This is the parameter configuration file for TTS server. -# These are the static models that support paddle inference. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] -################################################################## -am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of your am static model (XX.pdmodel) -am_params: # the pdiparams file of your am static model (XX.pdipparams) -am_sample_rate: 24000 -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) -voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) -voc_sample_rate: 24000 - -voc_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' diff --git a/demos/speech_server/server.sh b/demos/speech_server/server.sh index d9367ec06..e5961286b 100644 --- a/demos/speech_server/server.sh +++ b/demos/speech_server/server.sh @@ -1,3 +1,3 @@ #!/bin/bash -paddlespeech_server start --config_file ./conf/application.yaml \ No newline at end of file +paddlespeech_server start --config_file ./conf/application.yaml diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index 360d295ef..de5282993 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -34,7 +34,7 @@ def init(config): 
bool: """ # init api - api_list = list(config.engine_backend) + api_list = list(engine.split("_")[0] for engine in config.engine_list) api_router = setup_router(api_list) app.include_router(api_router) diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 21fc5c65e..3d71f091b 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -62,7 +62,7 @@ class ServerExecutor(BaseExecutor): bool: """ # init api - api_list = list(config.engine_backend) + api_list = list(engine.split("_")[0] for engine in config.engine_list) api_router = setup_router(api_list) app.include_router(api_router) diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index aba33a514..6048450b7 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -1,27 +1,107 @@ # This is the parameter configuration file for PaddleSpeech Serving. -################################################################## -# SERVER SETTING # -################################################################## +################################################################################# +# SERVER SETTING # +################################################################################# host: 127.0.0.1 port: 8090 -################################################################## -# CONFIG FILE # -################################################################## -# add engine backend type (Options: asr, tts) and config file here. -# Adding a speech task to engine_backend means starting the service. -engine_backend: - asr: 'conf/asr/asr.yaml' - tts: 'conf/tts/tts.yaml' - -# The engine_type of speech task needs to keep the same type as the config file of speech task. 
-# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' -# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' -# -# add engine type (Options: python, inference) -engine_type: - asr: 'python' - tts: 'python' +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] +engine_list: ['asr_python', 'tts_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_wenetspeech' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + device: # set 'gpu:id' or 'cpu' + + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 'fastspeech2_aishell3', + # 'fastspeech2_vctk'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 
'mb_melgan_csmsc'] + voc: 'pwgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'pwgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' diff --git a/paddlespeech/server/conf/asr/asr.yaml b/paddlespeech/server/conf/asr/asr.yaml deleted file mode 100644 index a6743b775..000000000 --- a/paddlespeech/server/conf/asr/asr.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: 'conformer_wenetspeech' -lang: 'zh' -sample_rate: 16000 -cfg_path: # [optional] -ckpt_path: # [optional] -decode_method: 'attention_rescoring' -force_yes: True -device: # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/asr/asr_pd.yaml b/paddlespeech/server/conf/asr/asr_pd.yaml deleted file mode 100644 index 4c415ac79..000000000 --- a/paddlespeech/server/conf/asr/asr_pd.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# This is the parameter configuration file for ASR server. -# These are the static models that support paddle inference. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['deepspeech2offline_aishell'] TODO -################################################################## -model_type: 'deepspeech2offline_aishell' -am_model: # the pdmodel file of am static model [optional] -am_params: # the pdiparams file of am static model [optional] -lang: 'zh' -sample_rate: 16000 -cfg_path: -decode_method: -force_yes: True - -am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - -################################################################## -# OTHERS # -################################################################## diff --git a/paddlespeech/server/conf/tts/tts.yaml b/paddlespeech/server/conf/tts/tts.yaml deleted file mode 100644 index 19207f0b0..000000000 --- a/paddlespeech/server/conf/tts/tts.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# This is the parameter configuration file for TTS server. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', -# 'fastspeech2_ljspeech', 'fastspeech2_aishell3', -# 'fastspeech2_vctk'] -################################################################## -am: 'fastspeech2_csmsc' -am_config: -am_ckpt: -am_stat: -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', -# 'pwgan_vctk', 'mb_melgan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_config: -voc_ckpt: -voc_stat: - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' -device: # set 'gpu:id' or 'cpu' diff --git a/paddlespeech/server/conf/tts/tts_pd.yaml b/paddlespeech/server/conf/tts/tts_pd.yaml deleted file mode 100644 index e27b9665b..000000000 --- a/paddlespeech/server/conf/tts/tts_pd.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# This is the parameter configuration file for TTS server. -# These are the static models that support paddle inference. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] -################################################################## -am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of your am static model (XX.pdmodel) -am_params: # the pdiparams file of your am static model (XX.pdipparams) -am_sample_rate: 24000 -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) -voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) -voc_sample_rate: 24000 - -voc_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index cb973e924..1925bf1d6 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -26,7 +26,6 @@ from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine -from 
paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model @@ -184,7 +183,7 @@ class ASREngine(BaseEngine): def __init__(self): super(ASREngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: """init engine resource Args: @@ -196,7 +195,7 @@ class ASREngine(BaseEngine): self.input = None self.output = None self.executor = ASRServerExecutor() - self.config = get_config(config_file) + self.config = config self.executor._init_from_path( model_type=self.config.model_type, diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index 1e2c5cc27..e76c49a79 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -19,7 +19,6 @@ import paddle from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.server.engine.base_engine import BaseEngine -from paddlespeech.server.utils.config import get_config __all__ = ['ASREngine'] @@ -40,7 +39,7 @@ class ASREngine(BaseEngine): def __init__(self): super(ASREngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: """init engine resource Args: @@ -52,8 +51,7 @@ class ASREngine(BaseEngine): self.input = None self.output = None self.executor = ASRServerExecutor() - - self.config = get_config(config_file) + self.config = config try: if self.config.device: self.device = self.config.device diff --git a/paddlespeech/server/engine/engine_pool.py b/paddlespeech/server/engine/engine_pool.py index f6a4d2aab..9de73567e 100644 --- a/paddlespeech/server/engine/engine_pool.py +++ b/paddlespeech/server/engine/engine_pool.py @@ -28,11 +28,13 @@ def init_engine_pool(config) -> bool: """ Init engine pool """ global ENGINE_POOL - for engine in 
config.engine_backend: + + for engine_and_type in config.engine_list: + engine = engine_and_type.split("_")[0] + engine_type = engine_and_type.split("_")[1] ENGINE_POOL[engine] = EngineFactory.get_engine( - engine_name=engine, engine_type=config.engine_type[engine]) - if not ENGINE_POOL[engine].init( - config_file=config.engine_backend[engine]): + engine_name=engine, engine_type=engine_type) + if not ENGINE_POOL[engine].init(config=config[engine_and_type]): return False return True diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 5955c1a21..1bbbe0ea3 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -29,7 +29,6 @@ from paddlespeech.cli.utils import download_and_decompress from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed -from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.errors import ErrorCode from paddlespeech.server.utils.exception import ServerBaseException from paddlespeech.server.utils.paddle_predictor import init_predictor @@ -357,11 +356,11 @@ class TTSEngine(BaseEngine): """ super(TTSEngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() try: - self.config = get_config(config_file) + self.config = config self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index 7dd576699..8d6c7fd17 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -25,7 +25,6 @@ from paddlespeech.cli.log import logger from 
paddlespeech.cli.tts.infer import TTSExecutor from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed -from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.errors import ErrorCode from paddlespeech.server.utils.exception import ServerBaseException @@ -50,11 +49,11 @@ class TTSEngine(BaseEngine): """ super(TTSEngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() try: - self.config = get_config(config_file) + self.config = config if self.config.device: self.device = self.config.device else: diff --git a/tests/unit/server/change_yaml.py b/tests/unit/server/change_yaml.py index 5a5d9ae01..1f063d8f5 100644 --- a/tests/unit/server/change_yaml.py +++ b/tests/unit/server/change_yaml.py @@ -5,7 +5,7 @@ import os import yaml -def change_speech_yaml(yaml_name: str, device: str): +def change_device(yamlfile: str, engine: str, device: str): """Change the settings of the device under the voice task configuration file Args: @@ -13,68 +13,54 @@ def change_speech_yaml(yaml_name: str, device: str): cpu (bool): True means set device to "cpu" model_type (dict): change model type """ - if "asr" in yaml_name: - dirpath = "./conf/asr/" - elif 'tts' in yaml_name: - dirpath = "./conf/tts/" - yamlfile = dirpath + yaml_name + ".yaml" - tmp_yamlfile = dirpath + yaml_name + "_tmp.yaml" + tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + if device == 'cpu': + set_device = 'cpu' + elif device == 'gpu': + set_device = 'gpu:0' + else: + print("Please set correct device: cpu or gpu.") + with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: y = yaml.safe_load(f) - if device == 'cpu': - print("Set device: cpu") - if yaml_name == 'asr': - y['device'] = 'cpu' - elif yaml_name == 'asr_pd': - y['am_predictor_conf']['device'] = 'cpu' - 
elif yaml_name == 'tts': - y['device'] = 'cpu' - elif yaml_name == 'tts_pd': - y['am_predictor_conf']['device'] = 'cpu' - y['voc_predictor_conf']['device'] = 'cpu' - elif device == 'gpu': - print("Set device: gpu") - if yaml_name == 'asr': - y['device'] = 'gpu:0' - elif yaml_name == 'asr_pd': - y['am_predictor_conf']['device'] = 'gpu:0' - elif yaml_name == 'tts': - y['device'] = 'gpu:0' - elif yaml_name == 'tts_pd': - y['am_predictor_conf']['device'] = 'gpu:0' - y['voc_predictor_conf']['device'] = 'gpu:0' + if engine == 'asr_python' or engine == 'tts_python': + y[engine]['device'] = set_device + elif engine == 'asr_inference': + y[engine]['am_predictor_conf']['device'] = set_device + elif engine == 'tts_inference': + y[engine]['am_predictor_conf']['device'] = set_device + y[engine]['voc_predictor_conf']['device'] = set_device else: - print("Please set correct device: cpu or gpu.") + print( + "Please set correct engine: asr_python, tts_python, asr_inference, tts_inference." + ) - print("The content of '%s': " % (yamlfile)) print(yaml.dump(y, default_flow_style=False, sort_keys=False)) yaml.dump(y, fw, allow_unicode=True) os.system("rm %s" % (tmp_yamlfile)) print("Change %s successfully." 
% (yamlfile)) -def change_app_yaml(task: str, engine_type: str): +def change_engine_type(yamlfile: str, engine_type): """Change the engine type and corresponding configuration file of the speech task in application.yaml Args: task (str): asr or tts """ - yamlfile = "./conf/application.yaml" - tmp_yamlfile = "./conf/application_tmp.yaml" + tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + speech_task = engine_type.split("_")[0] + with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: y = yaml.safe_load(f) - y['engine_type'][task] = engine_type - path_list = ["./conf/", task, "/", task] - if engine_type == 'python': - path_list.append(".yaml") - - elif engine_type == 'inference': - path_list.append("_pd.yaml") - y['engine_backend'][task] = ''.join(path_list) - print("The content of './conf/application.yaml': ") + engine_list = y['engine_list'] + for engine in engine_list: + if speech_task in engine: + engine_list.remove(engine) + engine_list.append(engine_type) + y['engine_list'] = engine_list print(yaml.dump(y, default_flow_style=False, sort_keys=False)) yaml.dump(y, fw, allow_unicode=True) os.system("rm %s" % (tmp_yamlfile)) @@ -83,32 +69,37 @@ def change_app_yaml(task: str, engine_type: str): if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument( + '--config_file', + type=str, + default='./conf/application.yaml', + help='server yaml file.') parser.add_argument( '--change_task', type=str, default=None, help='Change task', choices=[ - 'app-asr-python', - 'app-asr-inference', - 'app-tts-python', - 'app-tts-inference', - 'speech-asr-cpu', - 'speech-asr-gpu', - 'speech-asr_pd-cpu', - 'speech-asr_pd-gpu', - 'speech-tts-cpu', - 'speech-tts-gpu', - 'speech-tts_pd-cpu', - 'speech-tts_pd-gpu', + 'enginetype-asr_python', + 'enginetype-asr_inference', + 'enginetype-tts_python', + 'enginetype-tts_inference', + 'device-asr_python-cpu', + 'device-asr_python-gpu', + 
'device-asr_inference-cpu', + 'device-asr_inference-gpu', + 'device-tts_python-cpu', + 'device-tts_python-gpu', + 'device-tts_inference-cpu', + 'device-tts_inference-gpu', ], required=True) args = parser.parse_args() types = args.change_task.split("-") - if types[0] == "app": - change_app_yaml(types[1], types[2]) - elif types[0] == "speech": - change_speech_yaml(types[1], types[2]) + if types[0] == "enginetype": + change_engine_type(args.config_file, types[1]) + elif types[0] == "device": + change_device(args.config_file, types[1], types[2]) else: print("Error change task, please check change_task.") diff --git a/tests/unit/server/conf/application.yaml b/tests/unit/server/conf/application.yaml index aba33a514..6048450b7 100644 --- a/tests/unit/server/conf/application.yaml +++ b/tests/unit/server/conf/application.yaml @@ -1,27 +1,107 @@ # This is the parameter configuration file for PaddleSpeech Serving. -################################################################## -# SERVER SETTING # -################################################################## +################################################################################# +# SERVER SETTING # +################################################################################# host: 127.0.0.1 port: 8090 -################################################################## -# CONFIG FILE # -################################################################## -# add engine backend type (Options: asr, tts) and config file here. -# Adding a speech task to engine_backend means starting the service. -engine_backend: - asr: 'conf/asr/asr.yaml' - tts: 'conf/tts/tts.yaml' - -# The engine_type of speech task needs to keep the same type as the config file of speech task. 
-# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' -# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' -# -# add engine type (Options: python, inference) -engine_type: - asr: 'python' - tts: 'python' +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] +engine_list: ['asr_python', 'tts_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_wenetspeech' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + device: # set 'gpu:id' or 'cpu' + + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 'fastspeech2_aishell3', + # 'fastspeech2_vctk'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 
'mb_melgan_csmsc'] + voc: 'pwgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'pwgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' diff --git a/tests/unit/server/conf/asr/asr.yaml b/tests/unit/server/conf/asr/asr.yaml deleted file mode 100644 index a6743b775..000000000 --- a/tests/unit/server/conf/asr/asr.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: 'conformer_wenetspeech' -lang: 'zh' -sample_rate: 16000 -cfg_path: # [optional] -ckpt_path: # [optional] -decode_method: 'attention_rescoring' -force_yes: True -device: # set 'gpu:id' or 'cpu' diff --git a/tests/unit/server/conf/asr/asr_pd.yaml b/tests/unit/server/conf/asr/asr_pd.yaml deleted file mode 100644 index 4c415ac79..000000000 --- a/tests/unit/server/conf/asr/asr_pd.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# This is the parameter configuration file for ASR server. -# These are the static models that support paddle inference. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['deepspeech2offline_aishell'] TODO -################################################################## -model_type: 'deepspeech2offline_aishell' -am_model: # the pdmodel file of am static model [optional] -am_params: # the pdiparams file of am static model [optional] -lang: 'zh' -sample_rate: 16000 -cfg_path: -decode_method: -force_yes: True - -am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - -################################################################## -# OTHERS # -################################################################## diff --git a/tests/unit/server/conf/tts/tts.yaml b/tests/unit/server/conf/tts/tts.yaml deleted file mode 100644 index 19207f0b0..000000000 --- a/tests/unit/server/conf/tts/tts.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# This is the parameter configuration file for TTS server. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', -# 'fastspeech2_ljspeech', 'fastspeech2_aishell3', -# 'fastspeech2_vctk'] -################################################################## -am: 'fastspeech2_csmsc' -am_config: -am_ckpt: -am_stat: -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', -# 'pwgan_vctk', 'mb_melgan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_config: -voc_ckpt: -voc_stat: - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' -device: # set 'gpu:id' or 'cpu' diff --git a/tests/unit/server/conf/tts/tts_pd.yaml b/tests/unit/server/conf/tts/tts_pd.yaml deleted file mode 100644 index e27b9665b..000000000 --- a/tests/unit/server/conf/tts/tts_pd.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# This is the parameter configuration file for TTS server. -# These are the static models that support paddle inference. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] -################################################################## -am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of your am static model (XX.pdmodel) -am_params: # the pdiparams file of your am static model (XX.pdipparams) -am_sample_rate: 24000 -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -am_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) -voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) -voc_sample_rate: 24000 - -voc_predictor_conf: - device: # set 'gpu:id' or 'cpu' - switch_ir_optim: True - glog_info: False # True -> print glog - summary: True # False -> do not show predictor config - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' diff --git a/tests/unit/server/test_server_client.sh b/tests/unit/server/test_server_client.sh index 795a23e01..b48e7111b 100644 --- a/tests/unit/server/test_server_client.sh +++ b/tests/unit/server/test_server_client.sh @@ -99,8 +99,8 @@ echo "************************************************************************** # start server: asr engine type: python; tts engine type: python; device: cpu -python change_yaml.py --change_task speech-asr-cpu # change asr.yaml device: cpu -python change_yaml.py --change_task speech-tts-cpu # change tts.yaml device: cpu +python change_yaml.py 
--change_task device-asr_python-cpu # change asr.yaml device: cpu +python change_yaml.py --change_task device-tts_python-cpu # change tts.yaml device: cpu echo "Start the service: asr engine type: python; tts engine type: python; device: cpu" | tee -a ./log/test_result.log ((target_start_num+=1)) @@ -125,8 +125,8 @@ echo "************************************************************************** # start server: asr engine type: inference; tts engine type: inference; device: gpu -python change_yaml.py --change_task app-asr-inference # change application.yaml, asr engine_type: inference; asr engine_backend: asr_pd.yaml -python change_yaml.py --change_task app-tts-inference # change application.yaml, tts engine_type: inference; tts engine_backend: tts_pd.yaml +python change_yaml.py --change_task enginetype-asr_inference # change application.yaml, asr engine_type: inference; asr engine_backend: asr_pd.yaml +python change_yaml.py --change_task enginetype-tts_inference # change application.yaml, tts engine_type: inference; tts engine_backend: tts_pd.yaml echo "Start the service: asr engine type: inference; tts engine type: inference; device: gpu" | tee -a ./log/test_result.log ((target_start_num+=1)) @@ -151,8 +151,8 @@ echo "************************************************************************** # start server: asr engine type: inference; tts engine type: inference; device: cpu -python change_yaml.py --change_task speech-asr_pd-cpu # change asr_pd.yaml device: cpu -python change_yaml.py --change_task speech-tts_pd-cpu # change tts_pd.yaml device: cpu +python change_yaml.py --change_task device-asr_inference-cpu # change asr_pd.yaml device: cpu +python change_yaml.py --change_task device-tts_inference-cpu # change tts_pd.yaml device: cpu echo "start the service: asr engine type: inference; tts engine type: inference; device: cpu" | tee -a ./log/test_result.log ((target_start_num+=1)) @@ -182,4 +182,5 @@ echo "***************** Here are all the test results 
********************" cat ./log/test_result.log # Restoring conf is the same as demos/speech_server +rm -rf ./conf cp ../../../demos/speech_server/conf/ ./ -rf \ No newline at end of file From 1410a840546a9294b1196213a15f638d23c9a9c2 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 8 Mar 2022 03:12:26 +0000 Subject: [PATCH 16/17] add aishell3 hifigan, test=tts --- examples/aishell3/voc5/README.md | 142 +++++++++++++++++ examples/aishell3/voc5/conf/default.yaml | 168 +++++++++++++++++++++ examples/aishell3/voc5/local/preprocess.sh | 55 +++++++ examples/aishell3/voc5/local/synthesize.sh | 14 ++ examples/aishell3/voc5/local/train.sh | 13 ++ examples/aishell3/voc5/path.sh | 13 ++ examples/aishell3/voc5/run.sh | 32 ++++ 7 files changed, 437 insertions(+) create mode 100644 examples/aishell3/voc5/README.md create mode 100644 examples/aishell3/voc5/conf/default.yaml create mode 100755 examples/aishell3/voc5/local/preprocess.sh create mode 100755 examples/aishell3/voc5/local/synthesize.sh create mode 100755 examples/aishell3/voc5/local/train.sh create mode 100755 examples/aishell3/voc5/path.sh create mode 100755 examples/aishell3/voc5/run.sh diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md new file mode 100644 index 000000000..7cd0b3963 --- /dev/null +++ b/examples/aishell3/voc5/README.md @@ -0,0 +1,142 @@ +# HiFiGAN with AISHELL-3 +This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). + +AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus that could be used to train multi-speaker Text-to-Speech (TTS) systems. +## Dataset +### Download and Extract +Download AISHELL-3. +```bash +wget https://www.openslr.org/resources/93/data_aishell3.tgz +``` +Extract AISHELL-3. 
+```bash +mkdir data_aishell3 +tar zxvf data_aishell3.tgz -C data_aishell3 +``` +### Get MFA Result and Extract +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. +Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, run the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` + +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. 
It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. 
+```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/aishell3/voc5/conf/default.yaml b/examples/aishell3/voc5/conf/default.yaml new file mode 100644 index 000000000..728a90369 --- /dev/null +++ b/examples/aishell3/voc5/conf/default.yaml @@ -0,0 +1,168 @@ +# This is the configuration file for AISHELL-3 dataset. +# This configuration is based on HiFiGAN V1, which is +# an official configuration. But I found that the optimizer +# setting does not work well with my implementation. 
+# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales +# is also modified from the original 256 shift setting. +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [5, 5, 4, 3] # Upsampling scales. + upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. 
+ + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. 
+ + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 2048 + hop_size: 300 + win_length: 1200 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure divisible by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient.
+generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train generator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 100755 index 000000000..44cc3dbe4 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..."
+ python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/data_aishell3/ \ + --dataset=aishell3 \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 100755 index 000000000..647896175 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --generator-type=hifigan diff --git 
a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 100755 index 000000000..9695631ef --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 100755 index 000000000..7451b3218 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=hifigan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/run.sh b/examples/aishell3/voc5/run.sh new file mode 100755 index 000000000..4f426ea02 --- /dev/null +++ b/examples/aishell3/voc5/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi From 81d964f0a0732a4d3128233c32b6f7d2864e134b Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 8 Mar 2022 03:02:36 +0000 Subject: [PATCH 17/17] add vctk hifigan, test=tts --- examples/vctk/voc5/README.md | 139 +++++++++++++++ examples/vctk/voc5/conf/default.yaml | 168 ++++++++++++++++++ examples/vctk/voc5/local/preprocess.sh | 55 ++++++ examples/vctk/voc5/local/synthesize.sh | 14 ++ examples/vctk/voc5/local/train.sh | 13 ++ examples/vctk/voc5/path.sh | 13 ++ examples/vctk/voc5/run.sh | 32 ++++ .../t2s/exps/gan_vocoder/synthesize.py | 2 +- 8 files changed, 435 insertions(+), 1 deletion(-) create mode 100644 examples/vctk/voc5/README.md create mode 100644 examples/vctk/voc5/conf/default.yaml create mode 100755 examples/vctk/voc5/local/preprocess.sh create mode 100755 examples/vctk/voc5/local/synthesize.sh create mode 100755 examples/vctk/voc5/local/train.sh create mode 100755 examples/vctk/voc5/path.sh create mode 100755 examples/vctk/voc5/run.sh diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md new file mode 100644 index 000000000..06623ec5f --- /dev/null +++ b/examples/vctk/voc5/README.md @@ -0,0 +1,139 @@ +# HiFiGAN with VCTK +This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443). 
+ +## Dataset +### Download and Extract +Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. + +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): +1. `p315`, because of no text for it. +2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. + +## Get Started +Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. +Assume the path to the MFA result of VCTK is `./vctk_alignment`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. 
+ +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` + +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. 
`--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + + +1. `--config` config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model + + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. 
diff --git a/examples/vctk/voc5/conf/default.yaml b/examples/vctk/voc5/conf/default.yaml new file mode 100644 index 000000000..6361e01b2 --- /dev/null +++ b/examples/vctk/voc5/conf/default.yaml @@ -0,0 +1,168 @@ +# This is the configuration file for VCTK dataset. +# This configuration is based on HiFiGAN V1, which is +# an official configuration. But I found that the optimizer +# setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales +# is also modified from the original 256 shift setting. +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [5, 5, 4, 3] # Upsampling scales. + upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. 
+ - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. 
+ bias: True # Whether to use bias parameter in conv layer. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation parameters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 2048 + hop_size: 300 + win_length: 1200 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure divisible by hop_size. +num_workers: 2 # Number of workers in DataLoader.
+ +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train generator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network.
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 100755 index 000000000..88a478cd5 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./vctk_alignment \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/VCTK-Corpus-0.92/ \ + --dataset=vctk \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." 
+ + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 100755 index 000000000..647896175 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --generator-type=hifigan diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 100755 index 000000000..9695631ef --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 100755 index 000000000..7451b3218 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid 
UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=hifigan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/run.sh b/examples/vctk/voc5/run.sh new file mode 100755 index 000000000..4f426ea02 --- /dev/null +++ b/examples/vctk/voc5/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py index c60b9add2..9d9a8c49b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py @@ -34,7 +34,7 @@ def main(): "--generator-type", type=str, default="pwgan", - help="type of GANVocoder, should in {pwgan, mb_melgan, style_melgan, } now" + help="type of GANVocoder, should in {pwgan, mb_melgan, style_melgan, hifigan, } now" ) parser.add_argument("--config", type=str, help="GANVocoder config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.")