From e66d1b7d9655bb44a588f0e1c1269f07bc4c6dd9 Mon Sep 17 00:00:00 2001
From: Yang Zhou
Date: Sun, 18 Sep 2022 12:43:55 +0800
Subject: [PATCH] update audio api in app

---
 audio/docs/Makefile | 19 +
 audio/docs/README.md | 24 +
 audio/docs/images/paddle.png | Bin 0 -> 5043 bytes
 audio/docs/make.bat | 35 +
 audio/docs/source/_static/custom.css | 5 +
 audio/docs/source/_templates/module.rst_t | 9 +
 audio/docs/source/_templates/package.rst_t | 57 ++
 audio/docs/source/_templates/toc.rst_t | 8 +
 audio/docs/source/conf.py | 181 ++++
 audio/docs/source/index.rst | 22 +
 .../source/paddleaudio.backends.common.rst | 7 +
 .../paddleaudio.backends.no_backend.rst | 4 +-
 .../source/source/paddleaudio.backends.rst | 19 +
 ...paddleaudio.backends.soundfile_backend.rst | 7 +
 .../paddleaudio.backends.sox_io_backend.rst | 4 +-
 .../source/paddleaudio.backends.utils.rst | 7 +
 .../source/paddleaudio.compliance.kaldi.rst | 7 +
 .../source/paddleaudio.compliance.librosa.rst | 7 +
 .../source/source/paddleaudio.compliance.rst | 16 +
 .../source/paddleaudio.datasets.dataset.rst | 7 +
 .../source/paddleaudio.datasets.esc50.rst | 7 +
 .../source/paddleaudio.datasets.gtzan.rst | 7 +
 .../source/paddleaudio.datasets.hey_snips.rst | 7 +
 .../paddleaudio.datasets.rirs_noises.rst | 4 +-
 .../source/source/paddleaudio.datasets.rst | 22 +
 .../source/paddleaudio.datasets.tess.rst | 7 +
 .../paddleaudio.datasets.urban_sound.rst | 4 +-
 .../source/paddleaudio.datasets.voxceleb.rst | 4 +-
 .../source/paddleaudio.features.layers.rst | 7 +
 .../source/source/paddleaudio.features.rst | 15 +
 .../paddleaudio.functional.functional.rst | 7 +
 .../source/source/paddleaudio.functional.rst | 16 +
 .../source/paddleaudio.functional.window.rst | 7 +
 audio/docs/source/source/paddleaudio.io.rst | 7 +
 .../source/source/paddleaudio.metric.eer.rst | 7 +
 .../docs/source/source/paddleaudio.metric.rst | 15 +
 audio/docs/source/source/paddleaudio.rst | 22 +
 .../source/source/paddleaudio.sox_effects.rst | 7 +
 .../api/paddlespeech.audio.backends.rst | 16 -
 ...peech.audio.backends.soundfile_backend.rst | 7 -
 ...addlespeech.audio.backends.sox_backend.rst | 7 -
 .../api/paddlespeech.audio.compliance.rst | 16 -
 ...addlespeech.audio.datasets.rirs_noises.rst | 7 -
 .../api/paddlespeech.audio.datasets.rst | 22 -
 ...addlespeech.audio.datasets.urban_sound.rst | 7 -
 ...ddlespeech.audio.functional.functional.rst | 7 -
 .../api/paddlespeech.audio.functional.rst | 16 -
 .../api/paddlespeech.audio.kaldi.kaldi.rst | 7 +
 docs/source/api/paddlespeech.audio.kaldi.rst | 15 +
 docs/source/api/paddlespeech.audio.metric.rst | 15 -
 docs/source/api/paddlespeech.audio.rst | 7 +-
 .../api/paddlespeech.audio.sox_effects.rst | 8 +
 ...lespeech.audio.sox_effects.sox_effects.rst | 7 +
 ...ddlespeech.audio.streamdata.autodecode.rst | 7 +
 ...> paddlespeech.audio.streamdata.cache.rst} | 4 +-
 ... paddlespeech.audio.streamdata.compat.rst} | 4 +-
 ...espeech.audio.streamdata.extradatasets.rst | 7 +
 .../paddlespeech.audio.streamdata.filters.rst | 7 +
 ...> paddlespeech.audio.streamdata.gopen.rst} | 4 +-
 ...paddlespeech.audio.streamdata.handlers.rst | 7 +
 .../api/paddlespeech.audio.streamdata.mix.rst | 7 +
 ...lespeech.audio.streamdata.paddle_utils.rst | 7 +
 ...paddlespeech.audio.streamdata.pipeline.rst | 7 +
 .../api/paddlespeech.audio.streamdata.rst | 28 +
 ...ddlespeech.audio.streamdata.shardlists.rst | 7 +
 ...lespeech.audio.streamdata.tariterators.rst | 7 +
 .../paddlespeech.audio.streamdata.utils.rst | 7 +
 ... paddlespeech.audio.streamdata.writer.rst} | 4 +-
 docs/source/api/paddlespeech.audio.text.rst | 16 +
 ...addlespeech.audio.text.text_featurizer.rst | 7 +
 .../api/paddlespeech.audio.text.utility.rst | 7 +
 ...addlespeech.audio.transform.add_deltas.rst | 7 +
 ...peech.audio.transform.channel_selector.rst | 7 +
 .../api/paddlespeech.audio.transform.cmvn.rst | 7 +
 ...addlespeech.audio.transform.functional.rst | 7 +
 .../paddlespeech.audio.transform.perturb.rst | 7 +
 .../api/paddlespeech.audio.transform.rst | 24 +
 ...dlespeech.audio.transform.spec_augment.rst | 7 +
 ...ddlespeech.audio.transform.spectrogram.rst | 7 +
 ...ch.audio.transform.transform_interface.rst | 7 +
 ...espeech.audio.transform.transformation.rst | 7 +
 .../api/paddlespeech.audio.transform.wpe.rst | 7 +
 ...paddlespeech.audio.utils.check_kwargs.rst} | 4 +-
 ...addlespeech.audio.utils.dynamic_import.rst | 7 +
 docs/source/api/paddlespeech.audio.utils.rst | 4 +
 .../paddlespeech.audio.utils.sox_utils.rst | 7 +
 .../paddlespeech.audio.utils.tensor_utils.rst | 7 +
 .../api/paddlespeech.cls.exps.panns.rst | 4 +
 .../paddlespeech.kws.exps.mdtc.collate.rst | 7 +
 ...paddlespeech.kws.exps.mdtc.compute_det.rst | 7 +
 ...dlespeech.kws.exps.mdtc.plot_det_curve.rst | 7 +
 .../source/api/paddlespeech.kws.exps.mdtc.rst | 19 +
 .../api/paddlespeech.kws.exps.mdtc.score.rst | 7 +
 .../api/paddlespeech.kws.exps.mdtc.train.rst | 7 +
 docs/source/api/paddlespeech.kws.exps.rst | 15 +
 docs/source/api/paddlespeech.kws.rst | 1 +
 .../api/paddlespeech.resource.model_alias.rst | 7 +
 ...addlespeech.resource.pretrained_models.rst | 7 +
 .../api/paddlespeech.resource.resource.rst | 7 +
 docs/source/api/paddlespeech.resource.rst | 17 +
 docs/source/api/paddlespeech.rst | 10 +
 docs/source/api/paddlespeech.s2t.rst | 1 -
 docs/source/api/paddlespeech.server.utils.rst | 1 -
 docs/source/api/paddlespeech.t2s.datasets.rst | 1 +
 .../api/paddlespeech.t2s.datasets.sampler.rst | 7 +
 .../paddlespeech.t2s.exps.ernie_sat.align.rst | 7 +
 ...dlespeech.t2s.exps.ernie_sat.normalize.rst | 7 +
 ...lespeech.t2s.exps.ernie_sat.preprocess.rst | 7 +
 .../api/paddlespeech.t2s.exps.ernie_sat.rst | 21 +
 ...lespeech.t2s.exps.ernie_sat.synthesize.rst | 7 +
 ...eech.t2s.exps.ernie_sat.synthesize_e2e.rst | 7 +
 .../paddlespeech.t2s.exps.ernie_sat.train.rst | 7 +
 .../paddlespeech.t2s.exps.ernie_sat.utils.rst | 7 +
 .../api/paddlespeech.t2s.exps.fastspeech2.rst | 1 +
 ...espeech.t2s.exps.fastspeech2.vc2_infer.rst | 7 +
 docs/source/api/paddlespeech.t2s.exps.rst | 3 +
 .../paddlespeech.t2s.exps.stream_play_tts.rst | 7 +
 .../paddlespeech.t2s.exps.vits.normalize.rst | 7 +
 .../paddlespeech.t2s.exps.vits.preprocess.rst | 7 +
 .../source/api/paddlespeech.t2s.exps.vits.rst | 20 +
 .../paddlespeech.t2s.exps.vits.synthesize.rst | 7 +
 ...dlespeech.t2s.exps.vits.synthesize_e2e.rst | 7 +
 .../api/paddlespeech.t2s.exps.vits.train.rst | 7 +
 ...ddlespeech.t2s.exps.vits.voice_cloning.rst | 7 +
 ...paddlespeech.t2s.frontend.g2pw.dataset.rst | 7 +
 ...addlespeech.t2s.frontend.g2pw.onnx_api.rst | 7 +
 .../api/paddlespeech.t2s.frontend.g2pw.rst | 17 +
 .../paddlespeech.t2s.frontend.g2pw.utils.rst | 7 +
 ...paddlespeech.t2s.frontend.mix_frontend.rst | 7 +
 docs/source/api/paddlespeech.t2s.frontend.rst | 2 +
 ...espeech.t2s.models.ernie_sat.ernie_sat.rst | 7 +
 ...t2s.models.ernie_sat.ernie_sat_updater.rst | 7 +
 .../api/paddlespeech.t2s.models.ernie_sat.rst | 3 +-
 ...h.t2s.models.vits.monotonic_align.core.rst | 7 +
 ...speech.t2s.models.vits.monotonic_align.rst | 16 +
 ....t2s.models.vits.monotonic_align.setup.rst | 7 +
 .../api/paddlespeech.utils.dynamic_import.rst | 7 +
 docs/source/api/paddlespeech.utils.env.rst | 7 +
 docs/source/api/paddlespeech.utils.rst | 16 +
 docs/source/api/paddlespeech.version.rst | 7 +
 docs/source/cls/custom_dataset.md | 4 +-
 examples/esc50/cls0/conf/panns.yaml | 2 +-
 examples/hey_snips/kws0/conf/mdtc.yaml | 2 +-
 examples/voxceleb/sv0/local/data_prepare.py | 2 +-
 .../make_rirs_noise_csv_dataset_from_json.py | 2 +-
 .../local/make_vox_csv_dataset_from_json.py | 2 +-
 paddlespeech/audio/backends/no_backend.py | 32 -
 .../audio/backends/soundfile_backend.py | 662 ---------------
 paddlespeech/audio/compliance/__init__.py | 15 -
 paddlespeech/audio/compliance/kaldi.py | 638 --------------
 paddlespeech/audio/compliance/librosa.py | 788 ------------------
 paddlespeech/audio/datasets/__init__.py | 20 -
 paddlespeech/audio/datasets/dataset.py | 100 ---
 paddlespeech/audio/datasets/esc50.py | 152 ----
 paddlespeech/audio/datasets/gtzan.py | 115 ---
 paddlespeech/audio/datasets/hey_snips.py | 74 --
 paddlespeech/audio/datasets/rirs_noises.py | 200 -----
 paddlespeech/audio/datasets/tess.py | 126 ---
 paddlespeech/audio/datasets/urban_sound.py | 104 ---
 paddlespeech/audio/datasets/voxceleb.py | 355 --------
 paddlespeech/audio/features/__init__.py | 17 -
 paddlespeech/audio/features/layers.py | 328 --------
 paddlespeech/audio/functional/__init__.py | 20 -
 paddlespeech/audio/functional/functional.py | 266 ------
 paddlespeech/audio/functional/window.py | 337 --------
 paddlespeech/audio/io/__init__.py | 13 -
 paddlespeech/audio/metric/__init__.py | 15 -
 paddlespeech/audio/metric/eer.py | 100 ---
 paddlespeech/audio/streamdata/autodecode.py | 2 +-
 paddlespeech/audio/streamdata/tariterators.py | 4 +-
 paddlespeech/cli/cls/infer.py | 4 +-
 paddlespeech/cli/kws/infer.py | 4 +-
 paddlespeech/cli/vector/infer.py | 4 +-
 paddlespeech/cls/exps/panns/deploy/predict.py | 6 +-
 paddlespeech/cls/exps/panns/export_model.py | 2 +-
 paddlespeech/cls/exps/panns/predict.py | 6 +-
 paddlespeech/cls/exps/panns/train.py | 141 ++--
 paddlespeech/cls/models/panns/panns.py | 2 +-
 paddlespeech/kws/exps/mdtc/train.py | 4 +-
 .../frontend/featurizer/audio_featurizer.py | 2 +-
 paddlespeech/s2t/models/u2/u2.py | 6 +-
 paddlespeech/s2t/models/u2_st/u2_st.py | 4 +-
 .../engine/vector/python/vector_engine.py | 4 +-
 paddlespeech/server/util.py | 4 +-
 .../vector/exps/ecapa_tdnn/extract_emb.py | 4 +-
 paddlespeech/vector/exps/ecapa_tdnn/test.py | 2 +-
 paddlespeech/vector/exps/ecapa_tdnn/train.py | 2 +-
 paddlespeech/vector/io/dataset.py | 4 +-
 paddlespeech/vector/io/dataset_from_json.py | 6 +-
 tests/benchmark/audio/README.md | 38 -
 tests/benchmark/audio/log_melspectrogram.py | 125 ---
 tests/benchmark/audio/melspectrogram.py | 109 ---
 tests/benchmark/audio/mfcc.py | 123 ---
 .../unit/audio/backends/soundfile/__init__.py | 13 -
 tests/unit/audio/backends/soundfile/common.py | 57 --
 .../audio/backends/soundfile/info_test.py | 199 -----
 .../audio/backends/soundfile/load_test.py | 369 --------
 .../audio/backends/soundfile/save_test.py | 322 -------
 .../unit/audio/backends/soundfile/test_io.py | 73 --
 tests/unit/audio/features/base.py | 2 +-
 tests/unit/audio/features/test_istft.py | 2 +-
 tests/unit/audio/features/test_kaldi.py | 81 --
 tests/unit/audio/features/test_librosa.py | 281 ------
 .../audio/features/test_log_melspectrogram.py | 4 +-
 tests/unit/audio/features/test_spectrogram.py | 4 +-
 tests/unit/audio/features/test_stft.py | 2 +-
 206 files changed, 1440 insertions(+), 6533 deletions(-)
 create mode 100644 audio/docs/Makefile
 create mode 100644 audio/docs/README.md
 create mode 100644 audio/docs/images/paddle.png
 create mode 100644 audio/docs/make.bat
 create mode 100644 audio/docs/source/_static/custom.css
 create mode 100644 audio/docs/source/_templates/module.rst_t
 create mode 100644 audio/docs/source/_templates/package.rst_t
 create mode 100644 audio/docs/source/_templates/toc.rst_t
 create mode 100644 audio/docs/source/conf.py
 create mode 100644 audio/docs/source/index.rst
 create mode 100644 audio/docs/source/source/paddleaudio.backends.common.rst
 rename docs/source/api/paddlespeech.audio.datasets.tess.rst => audio/docs/source/source/paddleaudio.backends.no_backend.rst (51%)
 create mode 100644 audio/docs/source/source/paddleaudio.backends.rst
 create mode 100644 audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst
 rename docs/source/api/paddlespeech.audio.compliance.librosa.rst => audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst (50%)
 create mode 100644 audio/docs/source/source/paddleaudio.backends.utils.rst
 create mode 100644 audio/docs/source/source/paddleaudio.compliance.kaldi.rst
 create mode 100644 audio/docs/source/source/paddleaudio.compliance.librosa.rst
 create mode 100644 audio/docs/source/source/paddleaudio.compliance.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.dataset.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.esc50.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.gtzan.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.hey_snips.rst
 rename docs/source/api/paddlespeech.audio.datasets.gtzan.rst => audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst (51%)
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.rst
 create mode 100644 audio/docs/source/source/paddleaudio.datasets.tess.rst
 rename docs/source/api/paddlespeech.audio.datasets.esc50.rst => audio/docs/source/source/paddleaudio.datasets.urban_sound.rst (51%)
 rename docs/source/api/paddlespeech.audio.metric.eer.rst => audio/docs/source/source/paddleaudio.datasets.voxceleb.rst (52%)
 create mode 100644 audio/docs/source/source/paddleaudio.features.layers.rst
 create mode 100644 audio/docs/source/source/paddleaudio.features.rst
 create mode 100644 audio/docs/source/source/paddleaudio.functional.functional.rst
 create mode 100644 audio/docs/source/source/paddleaudio.functional.rst
 create mode 100644 audio/docs/source/source/paddleaudio.functional.window.rst
 create mode 100644 audio/docs/source/source/paddleaudio.io.rst
 create mode 100644 audio/docs/source/source/paddleaudio.metric.eer.rst
 create mode 100644 audio/docs/source/source/paddleaudio.metric.rst
 create mode 100644 audio/docs/source/source/paddleaudio.rst
 create mode 100644 audio/docs/source/source/paddleaudio.sox_effects.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.backends.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.backends.sox_backend.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.compliance.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.datasets.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.functional.functional.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.functional.rst
 create mode 100644 docs/source/api/paddlespeech.audio.kaldi.kaldi.rst
 create mode 100644 docs/source/api/paddlespeech.audio.kaldi.rst
 delete mode 100644 docs/source/api/paddlespeech.audio.metric.rst
 create mode 100644 docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.autodecode.rst
 rename docs/source/api/{paddlespeech.audio.datasets.dataset.rst => paddlespeech.audio.streamdata.cache.rst} (50%)
 rename docs/source/api/{paddlespeech.audio.datasets.voxceleb.rst => paddlespeech.audio.streamdata.compat.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.filters.rst
 rename docs/source/api/{paddlespeech.audio.compliance.kaldi.rst => paddlespeech.audio.streamdata.gopen.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.handlers.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.mix.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.pipeline.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.shardlists.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.tariterators.rst
 create mode 100644 docs/source/api/paddlespeech.audio.streamdata.utils.rst
 rename docs/source/api/{paddlespeech.audio.functional.window.rst => paddlespeech.audio.streamdata.writer.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.text.rst
 create mode 100644 docs/source/api/paddlespeech.audio.text.text_featurizer.rst
 create mode 100644 docs/source/api/paddlespeech.audio.text.utility.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.add_deltas.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.channel_selector.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.cmvn.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.functional.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.perturb.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.spec_augment.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.spectrogram.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.transform_interface.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.transformation.rst
 create mode 100644 docs/source/api/paddlespeech.audio.transform.wpe.rst
 rename docs/source/api/{paddlespeech.audio.datasets.hey_snips.rst => paddlespeech.audio.utils.check_kwargs.rst} (50%)
 create mode 100644 docs/source/api/paddlespeech.audio.utils.dynamic_import.rst
 create mode 100644 docs/source/api/paddlespeech.audio.utils.sox_utils.rst
 create mode 100644 docs/source/api/paddlespeech.audio.utils.tensor_utils.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.score.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.mdtc.train.rst
 create mode 100644 docs/source/api/paddlespeech.kws.exps.rst
 create mode 100644 docs/source/api/paddlespeech.resource.model_alias.rst
 create mode 100644 docs/source/api/paddlespeech.resource.pretrained_models.rst
 create mode 100644 docs/source/api/paddlespeech.resource.resource.rst
 create mode 100644 docs/source/api/paddlespeech.resource.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.datasets.sampler.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.train.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
 create mode 100644 docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
 create mode 100644 docs/source/api/paddlespeech.utils.dynamic_import.rst
 create mode 100644 docs/source/api/paddlespeech.utils.env.rst
 create mode 100644 docs/source/api/paddlespeech.utils.rst
 create mode 100644 docs/source/api/paddlespeech.version.rst
 delete mode 100644 paddlespeech/audio/backends/no_backend.py
 delete mode 100644 paddlespeech/audio/backends/soundfile_backend.py
 delete mode 100644 paddlespeech/audio/compliance/__init__.py
 delete mode 100644 paddlespeech/audio/compliance/kaldi.py
 delete mode 100644 paddlespeech/audio/compliance/librosa.py
 delete mode 100644 paddlespeech/audio/datasets/__init__.py
 delete mode 100644 paddlespeech/audio/datasets/dataset.py
 delete mode 100644 paddlespeech/audio/datasets/esc50.py
 delete mode 100644 paddlespeech/audio/datasets/gtzan.py
 delete mode 100644 paddlespeech/audio/datasets/hey_snips.py
 delete mode 100644 paddlespeech/audio/datasets/rirs_noises.py
 delete mode 100644 paddlespeech/audio/datasets/tess.py
 delete mode 100644 paddlespeech/audio/datasets/urban_sound.py
 delete mode 100644 paddlespeech/audio/datasets/voxceleb.py
 delete mode 100644 paddlespeech/audio/features/__init__.py
 delete mode 100644 paddlespeech/audio/features/layers.py
 delete mode 100644 paddlespeech/audio/functional/__init__.py
 delete mode 100644 paddlespeech/audio/functional/functional.py
 delete mode 100644 paddlespeech/audio/functional/window.py
 delete mode 100644 paddlespeech/audio/io/__init__.py
 delete mode 100644 paddlespeech/audio/metric/__init__.py
 delete mode 100644 paddlespeech/audio/metric/eer.py
 delete mode 100644 tests/benchmark/audio/README.md
 delete mode 100644 tests/benchmark/audio/log_melspectrogram.py
 delete mode 100644 tests/benchmark/audio/melspectrogram.py
 delete mode 100644 tests/benchmark/audio/mfcc.py
 delete mode 100644 tests/unit/audio/backends/soundfile/__init__.py
 delete mode 100644 tests/unit/audio/backends/soundfile/common.py
 delete mode 100644 tests/unit/audio/backends/soundfile/info_test.py
 delete mode 100644 tests/unit/audio/backends/soundfile/load_test.py
 delete mode 100644 tests/unit/audio/backends/soundfile/save_test.py
 delete mode 100644 tests/unit/audio/backends/soundfile/test_io.py
 delete mode 100644 tests/unit/audio/features/test_kaldi.py
 delete mode 100644 tests/unit/audio/features/test_librosa.py

diff --git a/audio/docs/Makefile b/audio/docs/Makefile
new file mode 100644
index 000000000..69fe55ecf
--- /dev/null
+++ b/audio/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/audio/docs/README.md b/audio/docs/README.md
new file mode 100644
index 000000000..20626f52b
--- /dev/null
+++ b/audio/docs/README.md
@@ -0,0 +1,24 @@
+# Build docs for PaddleAudio
+
+Execute the following steps in the **current directory**.
+
+## 1. Install
+
+`pip install Sphinx sphinx_rtd_theme`
+
+
+## 2. Generate API docs
+
+Generate API docs from docstrings.
+
+`sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils --templatedir source/_templates`
+
+
+## 3. Build
+
+`sphinx-build source _html`
+
+
+## 4. Preview
+
+Open `_html/index.html` for page preview.
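The four README steps above can be chained into one script. What follows is a minimal sketch of that workflow, assuming it is run from `audio/docs/`; the `sphinx-apidoc` flags and the excluded `../paddleaudio/utils` path are taken verbatim from the README, while the final `python -m http.server` preview step is an optional convenience that the README does not prescribe.

```bash
#!/usr/bin/env bash
# Sketch: build the PaddleAudio docs end to end (run from audio/docs/).
set -e

# 1. Install the documentation toolchain.
pip install Sphinx sphinx_rtd_theme

# 2. Regenerate API pages from docstrings: -f overwrites stale pages,
#    -M lists modules before submodules, -e gives each module its own page,
#    -T skips the generated table-of-contents file. ../paddleaudio/utils is
#    excluded, and the custom templates in source/_templates are applied.
sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils \
    --templatedir source/_templates

# 3. Render the HTML site into _html/.
sphinx-build source _html

# 4. (Optional) serve the result at http://localhost:8000/ for preview.
python -m http.server 8000 --directory _html
```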
diff --git a/audio/docs/images/paddle.png b/audio/docs/images/paddle.png
new file mode 100644
index 0000000000000000000000000000000000000000..bc1135abfab7aa48f29392da4bca614f688314af
GIT binary patch
literal 5043
[base85-encoded binary data for paddle.png omitted]
diff --git a/audio/docs/make.bat b/audio/docs/make.bat
new file mode 100644
--- /dev/null
+++ b/audio/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/audio/docs/source/_static/custom.css b/audio/docs/source/_static/custom.css
new file mode 100644
index 000000000..bb65c51a9
--- /dev/null
+++ b/audio/docs/source/_static/custom.css
@@ -0,0 +1,5 @@
+.wy-nav-content {
+    max-width: 80%;
+}
+.table table{ background:#b9b9b9}
+.table table td{ background:#FFF; }
diff --git a/audio/docs/source/_templates/module.rst_t b/audio/docs/source/_templates/module.rst_t
new file mode 100644
index 000000000..d9a50e6b9
--- /dev/null
+++ b/audio/docs/source/_templates/module.rst_t
@@ -0,0 +1,9 @@
+{%- if show_headings %}
+{{- basename | e | heading }}
+
+{% endif -%}
+.. automodule:: {{ qualname }}
+{%- for option in automodule_options %}
+   :{{ option }}:
+{%- endfor %}
+
diff --git a/audio/docs/source/_templates/package.rst_t b/audio/docs/source/_templates/package.rst_t
new file mode 100644
index 000000000..7239c11b7
--- /dev/null
+++ b/audio/docs/source/_templates/package.rst_t
@@ -0,0 +1,57 @@
+{%- macro automodule(modname, options) -%}
+.. automodule:: {{ modname }}
+{%- for option in options %}
+   :{{ option }}:
+{%- endfor %}
+{%- endmacro %}
+
+{%- macro toctree(docnames) -%}
+.. toctree::
+   :maxdepth: {{ maxdepth }}
+{% for docname in docnames %}
+   {{ docname }}
+{%- endfor %}
+{%- endmacro %}
+
+{%- if is_namespace %}
+{{- [pkgname, "namespace"] | join(" ") | e | heading }}
+{% else %}
+{{- pkgname | e | heading }}
+{% endif %}
+
+{%- if is_namespace %}
+.. py:module:: {{ pkgname }}
+{% endif %}
+
+{%- if modulefirst and not is_namespace %}
+{{ automodule(pkgname, automodule_options) }}
+{% endif %}
+
+{%- if subpackages %}
+Subpackages
+-----------
+
+{{ toctree(subpackages) }}
+{% endif %}
+
+{%- if submodules %}
+Submodules
+----------
+{% if separatemodules %}
+{{ toctree(submodules) }}
+{% else %}
+{%- for submodule in submodules %}
+{% if show_headings %}
+{{- submodule | e | heading(2) }}
+{% endif %}
+{{ automodule(submodule, automodule_options) }}
+{% endfor %}
+{%- endif %}
+{%- endif %}
+
+{%- if not modulefirst and not is_namespace %}
+Module contents
+---------------
+
+{{ automodule(pkgname, automodule_options) }}
+{% endif %}
diff --git a/audio/docs/source/_templates/toc.rst_t b/audio/docs/source/_templates/toc.rst_t
new file mode 100644
index 000000000..f0877eeb2
--- /dev/null
+++ b/audio/docs/source/_templates/toc.rst_t
@@ -0,0 +1,8 @@
+{{ header | heading }}
+
+.. toctree::
+   :maxdepth: {{ maxdepth }}
+{% for docname in docnames %}
+   {{ docname }}
+{%- endfor %}
+
diff --git a/audio/docs/source/conf.py b/audio/docs/source/conf.py
new file mode 100644
index 000000000..09c4f312f
--- /dev/null
+++ b/audio/docs/source/conf.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+# -- Path setup --------------------------------------------------------------
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../..'))
+
+# -- Project information -----------------------------------------------------
+
+project = 'PaddleAudio'
+copyright = '2022, PaddlePaddle'
+author = 'PaddlePaddle'
+
+# The short X.Y version
+version = ''
+# The full version, including alpha/beta/rc tags
+release = '0.2.0'
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.intersphinx',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.napoleon',
+]
+
+napoleon_google_docstring = True
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+
+import sphinx_rtd_theme
+html_theme = 'sphinx_rtd_theme'
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+smartquotes = False
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_logo = '../images/paddle.png'
+html_css_files = [
+    'custom.css',
+]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PaddleAudiodoc'
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'PaddleAudio.tex', 'PaddleAudio Documentation', 'PaddlePaddle',
+     'manual'),
+]
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, 'paddleaudio', 'PaddleAudio Documentation', [author],
+              1)]
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'PaddleAudio', 'PaddleAudio Documentation', author,
+     'PaddleAudio', 'One line description of project.', 'Miscellaneous'),
+]
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# -- Extension configuration -------------------------------------------------
+
+# -- Options for intersphinx extension ---------------------------------------
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'https://docs.python.org/': None}
diff --git a/audio/docs/source/index.rst b/audio/docs/source/index.rst
new file mode 100644
index 000000000..26963308e
--- /dev/null
+++ b/audio/docs/source/index.rst
@@ -0,0 +1,22 @@
+.. PaddleAudio documentation master file, created by
+   sphinx-quickstart on Tue Mar 22 15:57:16 2022.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to PaddleAudio's documentation!
+=======================================
+
+.. toctree::
+   :maxdepth: 1
+
+   Index <self>
+
+
+API References
+--------------
+
+.. toctree::
+   :maxdepth: 2
+   :titlesonly:
+
+   paddleaudio
\ No newline at end of file
diff --git a/audio/docs/source/source/paddleaudio.backends.common.rst b/audio/docs/source/source/paddleaudio.backends.common.rst
new file mode 100644
index 000000000..c936645e6
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.common.rst
@@ -0,0 +1,7 @@
+paddleaudio.backends.common module
+==================================
+
+.. automodule:: paddleaudio.backends.common
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.tess.rst b/audio/docs/source/source/paddleaudio.backends.no_backend.rst
similarity index 51%
rename from docs/source/api/paddlespeech.audio.datasets.tess.rst
rename to audio/docs/source/source/paddleaudio.backends.no_backend.rst
index d845e6d6a..bf01dab2e 100644
--- a/docs/source/api/paddlespeech.audio.datasets.tess.rst
+++ b/audio/docs/source/source/paddleaudio.backends.no_backend.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.datasets.tess module
+paddleaudio.backends.no\_backend module
 =======================================
 
-.. automodule:: paddlespeech.audio.datasets.tess
+.. automodule:: paddleaudio.backends.no_backend
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.backends.rst b/audio/docs/source/source/paddleaudio.backends.rst
new file mode 100644
index 000000000..79907dd2e
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.rst
@@ -0,0 +1,19 @@
+paddleaudio.backends package
+============================
+
+.. automodule:: paddleaudio.backends
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.backends.common
+   paddleaudio.backends.no_backend
+   paddleaudio.backends.soundfile_backend
+   paddleaudio.backends.sox_io_backend
+   paddleaudio.backends.utils
diff --git a/audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst b/audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst
new file mode 100644
index 000000000..6146373cb
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.soundfile_backend.rst
@@ -0,0 +1,7 @@
+paddleaudio.backends.soundfile\_backend module
+==============================================
+
+.. automodule:: paddleaudio.backends.soundfile_backend
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.compliance.librosa.rst b/audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst
similarity index 50%
rename from docs/source/api/paddlespeech.audio.compliance.librosa.rst
rename to audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst
index 85271bee4..04972706d 100644
--- a/docs/source/api/paddlespeech.audio.compliance.librosa.rst
+++ b/audio/docs/source/source/paddleaudio.backends.sox_io_backend.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.compliance.librosa module
+paddleaudio.backends.sox\_io\_backend module
 ============================================
 
-.. automodule:: paddlespeech.audio.compliance.librosa
+.. automodule:: paddleaudio.backends.sox_io_backend
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.backends.utils.rst b/audio/docs/source/source/paddleaudio.backends.utils.rst
new file mode 100644
index 000000000..c4cd5e1ed
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.backends.utils.rst
@@ -0,0 +1,7 @@
+paddleaudio.backends.utils module
+=================================
+
+.. automodule:: paddleaudio.backends.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.compliance.kaldi.rst b/audio/docs/source/source/paddleaudio.compliance.kaldi.rst
new file mode 100644
index 000000000..81bb7d648
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.compliance.kaldi.rst
@@ -0,0 +1,7 @@
+paddleaudio.compliance.kaldi module
+===================================
+
+.. automodule:: paddleaudio.compliance.kaldi
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.compliance.librosa.rst b/audio/docs/source/source/paddleaudio.compliance.librosa.rst
new file mode 100644
index 000000000..553e4d3a5
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.compliance.librosa.rst
@@ -0,0 +1,7 @@
+paddleaudio.compliance.librosa module
+=====================================
+
+.. automodule:: paddleaudio.compliance.librosa
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.compliance.rst b/audio/docs/source/source/paddleaudio.compliance.rst
new file mode 100644
index 000000000..137599bb3
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.compliance.rst
@@ -0,0 +1,16 @@
+paddleaudio.compliance package
+==============================
+
+.. automodule:: paddleaudio.compliance
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.compliance.kaldi
+   paddleaudio.compliance.librosa
diff --git a/audio/docs/source/source/paddleaudio.datasets.dataset.rst b/audio/docs/source/source/paddleaudio.datasets.dataset.rst
new file mode 100644
index 000000000..ebf4ea18a
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.dataset.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.dataset module
+===================================
+
+.. automodule:: paddleaudio.datasets.dataset
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.esc50.rst b/audio/docs/source/source/paddleaudio.datasets.esc50.rst
new file mode 100644
index 000000000..2730fb919
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.esc50.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.esc50 module
+=================================
+
+.. automodule:: paddleaudio.datasets.esc50
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.gtzan.rst b/audio/docs/source/source/paddleaudio.datasets.gtzan.rst
new file mode 100644
index 000000000..da3600cb9
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.gtzan.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.gtzan module
+=================================
+
+.. automodule:: paddleaudio.datasets.gtzan
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.hey_snips.rst b/audio/docs/source/source/paddleaudio.datasets.hey_snips.rst
new file mode 100644
index 000000000..29da9fa88
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.hey_snips.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.hey\_snips module
+======================================
+
+.. automodule:: paddleaudio.datasets.hey_snips
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.gtzan.rst b/audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst
similarity index 51%
rename from docs/source/api/paddlespeech.audio.datasets.gtzan.rst
rename to audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst
index 47252e8d7..26f52346a 100644
--- a/docs/source/api/paddlespeech.audio.datasets.gtzan.rst
+++ b/audio/docs/source/source/paddleaudio.datasets.rirs_noises.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.datasets.gtzan module
+paddleaudio.datasets.rirs\_noises module
 ========================================
 
-.. automodule:: paddlespeech.audio.datasets.gtzan
+.. automodule:: paddleaudio.datasets.rirs_noises
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.datasets.rst b/audio/docs/source/source/paddleaudio.datasets.rst
new file mode 100644
index 000000000..7a0b6f3f7
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.rst
@@ -0,0 +1,22 @@
+paddleaudio.datasets package
+============================
+
+.. automodule:: paddleaudio.datasets
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.datasets.dataset
+   paddleaudio.datasets.esc50
+   paddleaudio.datasets.gtzan
+   paddleaudio.datasets.hey_snips
+   paddleaudio.datasets.rirs_noises
+   paddleaudio.datasets.tess
+   paddleaudio.datasets.urban_sound
+   paddleaudio.datasets.voxceleb
diff --git a/audio/docs/source/source/paddleaudio.datasets.tess.rst b/audio/docs/source/source/paddleaudio.datasets.tess.rst
new file mode 100644
index 000000000..7a4ad62a3
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.datasets.tess.rst
@@ -0,0 +1,7 @@
+paddleaudio.datasets.tess module
+================================
+
+.. automodule:: paddleaudio.datasets.tess
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.esc50.rst b/audio/docs/source/source/paddleaudio.datasets.urban_sound.rst
similarity index 51%
rename from docs/source/api/paddlespeech.audio.datasets.esc50.rst
rename to audio/docs/source/source/paddleaudio.datasets.urban_sound.rst
index 80e4a4187..ee4ad47ec 100644
--- a/docs/source/api/paddlespeech.audio.datasets.esc50.rst
+++ b/audio/docs/source/source/paddleaudio.datasets.urban_sound.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.datasets.esc50 module
+paddleaudio.datasets.urban\_sound module
 ========================================
 
-.. automodule:: paddlespeech.audio.datasets.esc50
+.. automodule:: paddleaudio.datasets.urban_sound
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.metric.eer.rst b/audio/docs/source/source/paddleaudio.datasets.voxceleb.rst
similarity index 52%
rename from docs/source/api/paddlespeech.audio.metric.eer.rst
rename to audio/docs/source/source/paddleaudio.datasets.voxceleb.rst
index bbe881221..b8f903666 100644
--- a/docs/source/api/paddlespeech.audio.metric.eer.rst
+++ b/audio/docs/source/source/paddleaudio.datasets.voxceleb.rst
@@ -1,7 +1,7 @@
-paddlespeech.audio.metric.eer module
+paddleaudio.datasets.voxceleb module
 ====================================
 
-.. automodule:: paddlespeech.audio.metric.eer
+.. automodule:: paddleaudio.datasets.voxceleb
    :members:
    :undoc-members:
    :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.features.layers.rst b/audio/docs/source/source/paddleaudio.features.layers.rst
new file mode 100644
index 000000000..90833e0a8
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.features.layers.rst
@@ -0,0 +1,7 @@
+paddleaudio.features.layers module
+==================================
+
+.. automodule:: paddleaudio.features.layers
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.features.rst b/audio/docs/source/source/paddleaudio.features.rst
new file mode 100644
index 000000000..86ecb5c9c
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.features.rst
@@ -0,0 +1,15 @@
+paddleaudio.features package
+============================
+
+.. automodule:: paddleaudio.features
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.features.layers
diff --git a/audio/docs/source/source/paddleaudio.functional.functional.rst b/audio/docs/source/source/paddleaudio.functional.functional.rst
new file mode 100644
index 000000000..d1f72052d
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.functional.functional.rst
@@ -0,0 +1,7 @@
+paddleaudio.functional.functional module
+========================================
+
+.. automodule:: paddleaudio.functional.functional
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.functional.rst b/audio/docs/source/source/paddleaudio.functional.rst
new file mode 100644
index 000000000..be76de798
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.functional.rst
@@ -0,0 +1,16 @@
+paddleaudio.functional package
+==============================
+
+.. automodule:: paddleaudio.functional
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.functional.functional
+   paddleaudio.functional.window
diff --git a/audio/docs/source/source/paddleaudio.functional.window.rst b/audio/docs/source/source/paddleaudio.functional.window.rst
new file mode 100644
index 000000000..46d89f3fb
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.functional.window.rst
@@ -0,0 +1,7 @@
+paddleaudio.functional.window module
+====================================
+
+.. automodule:: paddleaudio.functional.window
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.io.rst b/audio/docs/source/source/paddleaudio.io.rst
new file mode 100644
index 000000000..9ef75f748
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.io.rst
@@ -0,0 +1,7 @@
+paddleaudio.io package
+======================
+
+.. automodule:: paddleaudio.io
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.metric.eer.rst b/audio/docs/source/source/paddleaudio.metric.eer.rst
new file mode 100644
index 000000000..e4b4f5f34
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.metric.eer.rst
@@ -0,0 +1,7 @@
+paddleaudio.metric.eer module
+=============================
+
+.. automodule:: paddleaudio.metric.eer
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/audio/docs/source/source/paddleaudio.metric.rst b/audio/docs/source/source/paddleaudio.metric.rst
new file mode 100644
index 000000000..0074f0b5b
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.metric.rst
@@ -0,0 +1,15 @@
+paddleaudio.metric package
+==========================
+
+.. automodule:: paddleaudio.metric
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.metric.eer
diff --git a/audio/docs/source/source/paddleaudio.rst b/audio/docs/source/source/paddleaudio.rst
new file mode 100644
index 000000000..9defb2ea8
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.rst
@@ -0,0 +1,22 @@
+paddleaudio package
+===================
+
+.. automodule:: paddleaudio
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Subpackages
+-----------
+
+.. toctree::
+   :maxdepth: 4
+
+   paddleaudio.backends
+   paddleaudio.compliance
+   paddleaudio.datasets
+   paddleaudio.features
+   paddleaudio.functional
+   paddleaudio.io
+   paddleaudio.metric
+   paddleaudio.sox_effects
diff --git a/audio/docs/source/source/paddleaudio.sox_effects.rst b/audio/docs/source/source/paddleaudio.sox_effects.rst
new file mode 100644
index 000000000..48cd27250
--- /dev/null
+++ b/audio/docs/source/source/paddleaudio.sox_effects.rst
@@ -0,0 +1,7 @@
+paddleaudio.sox\_effects package
+================================
+
+.. automodule:: paddleaudio.sox_effects
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.backends.rst b/docs/source/api/paddlespeech.audio.backends.rst
deleted file mode 100644
index e8917897e..000000000
--- a/docs/source/api/paddlespeech.audio.backends.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.audio.backends package
-===================================
-
-.. automodule:: paddlespeech.audio.backends
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.audio.backends.soundfile_backend
-   paddlespeech.audio.backends.sox_backend
diff --git a/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst b/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
deleted file mode 100644
index 5c4ef3881..000000000
--- a/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.backends.soundfile\_backend module
-=====================================================
-
-.. automodule:: paddlespeech.audio.backends.soundfile_backend
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.backends.sox_backend.rst b/docs/source/api/paddlespeech.audio.backends.sox_backend.rst
deleted file mode 100644
index a99c49de8..000000000
--- a/docs/source/api/paddlespeech.audio.backends.sox_backend.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.backends.sox\_backend module
-===============================================
-
-.. automodule:: paddlespeech.audio.backends.sox_backend
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.compliance.rst b/docs/source/api/paddlespeech.audio.compliance.rst
deleted file mode 100644
index 515d25e99..000000000
--- a/docs/source/api/paddlespeech.audio.compliance.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.audio.compliance package
-=====================================
-
-.. automodule:: paddlespeech.audio.compliance
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.audio.compliance.kaldi
-   paddlespeech.audio.compliance.librosa
diff --git a/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst b/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
deleted file mode 100644
index 3015ba9e4..000000000
--- a/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.datasets.rirs\_noises module
-===============================================
-
-.. automodule:: paddlespeech.audio.datasets.rirs_noises
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.datasets.rst b/docs/source/api/paddlespeech.audio.datasets.rst
deleted file mode 100644
index bfc313a70..000000000
--- a/docs/source/api/paddlespeech.audio.datasets.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-paddlespeech.audio.datasets package
-===================================
-
-.. automodule:: paddlespeech.audio.datasets
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.audio.datasets.dataset
-   paddlespeech.audio.datasets.esc50
-   paddlespeech.audio.datasets.gtzan
-   paddlespeech.audio.datasets.hey_snips
-   paddlespeech.audio.datasets.rirs_noises
-   paddlespeech.audio.datasets.tess
-   paddlespeech.audio.datasets.urban_sound
-   paddlespeech.audio.datasets.voxceleb
diff --git a/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst b/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
deleted file mode 100644
index 4efa060a8..000000000
--- a/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.datasets.urban\_sound module
-===============================================
-
-.. automodule:: paddlespeech.audio.datasets.urban_sound
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.functional.functional.rst b/docs/source/api/paddlespeech.audio.functional.functional.rst
deleted file mode 100644
index 80cc5a5a4..000000000
--- a/docs/source/api/paddlespeech.audio.functional.functional.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.audio.functional.functional module
-===============================================
-
-.. automodule:: paddlespeech.audio.functional.functional
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.audio.functional.rst b/docs/source/api/paddlespeech.audio.functional.rst
deleted file mode 100644
index 4e979dd9a..000000000
--- a/docs/source/api/paddlespeech.audio.functional.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.audio.functional package
-=====================================
-
-.. automodule:: paddlespeech.audio.functional
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
toctree:: - :maxdepth: 4 - - paddlespeech.audio.functional.functional - paddlespeech.audio.functional.window diff --git a/docs/source/api/paddlespeech.audio.kaldi.kaldi.rst b/docs/source/api/paddlespeech.audio.kaldi.kaldi.rst new file mode 100644 index 000000000..1c41ac84d --- /dev/null +++ b/docs/source/api/paddlespeech.audio.kaldi.kaldi.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.kaldi.kaldi module +===================================== + +.. automodule:: paddlespeech.audio.kaldi.kaldi + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.kaldi.rst b/docs/source/api/paddlespeech.audio.kaldi.rst new file mode 100644 index 000000000..15d26a74c --- /dev/null +++ b/docs/source/api/paddlespeech.audio.kaldi.rst @@ -0,0 +1,15 @@ +paddlespeech.audio.kaldi package +================================ + +.. automodule:: paddlespeech.audio.kaldi + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.kaldi.kaldi diff --git a/docs/source/api/paddlespeech.audio.metric.rst b/docs/source/api/paddlespeech.audio.metric.rst deleted file mode 100644 index a6d411dd6..000000000 --- a/docs/source/api/paddlespeech.audio.metric.rst +++ /dev/null @@ -1,15 +0,0 @@ -paddlespeech.audio.metric package -================================= - -.. automodule:: paddlespeech.audio.metric - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - paddlespeech.audio.metric.eer diff --git a/docs/source/api/paddlespeech.audio.rst b/docs/source/api/paddlespeech.audio.rst index 5a3867f96..368ffda94 100644 --- a/docs/source/api/paddlespeech.audio.rst +++ b/docs/source/api/paddlespeech.audio.rst @@ -12,12 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 - paddlespeech.audio.backends - paddlespeech.audio.compliance - paddlespeech.audio.datasets paddlespeech.audio.features paddlespeech.audio.functional paddlespeech.audio.io + paddlespeech.audio.kaldi paddlespeech.audio.metric paddlespeech.audio.sox_effects + paddlespeech.audio.streamdata + paddlespeech.audio.text + paddlespeech.audio.transform paddlespeech.audio.utils diff --git a/docs/source/api/paddlespeech.audio.sox_effects.rst b/docs/source/api/paddlespeech.audio.sox_effects.rst index 75f991a16..186b9738f 100644 --- a/docs/source/api/paddlespeech.audio.sox_effects.rst +++ b/docs/source/api/paddlespeech.audio.sox_effects.rst @@ -5,3 +5,11 @@ paddlespeech.audio.sox\_effects package :members: :undoc-members: :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.sox_effects.sox_effects diff --git a/docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst b/docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst new file mode 100644 index 000000000..8232b4391 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.sox_effects.sox_effects.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.sox\_effects.sox\_effects module +=================================================== + +.. automodule:: paddlespeech.audio.sox_effects.sox_effects + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst new file mode 100644 index 000000000..1e45c1373 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.autodecode module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.autodecode + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.datasets.dataset.rst b/docs/source/api/paddlespeech.audio.streamdata.cache.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.datasets.dataset.rst rename to docs/source/api/paddlespeech.audio.streamdata.cache.rst index 41243fb73..393055e54 100644 --- a/docs/source/api/paddlespeech.audio.datasets.dataset.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.cache.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.datasets.dataset module +paddlespeech.audio.streamdata.cache module ========================================== -.. automodule:: paddlespeech.audio.datasets.dataset +.. automodule:: paddlespeech.audio.streamdata.cache :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst b/docs/source/api/paddlespeech.audio.streamdata.compat.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.datasets.voxceleb.rst rename to docs/source/api/paddlespeech.audio.streamdata.compat.rst index 179053dcd..760695b20 100644 --- a/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.compat.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.datasets.voxceleb module +paddlespeech.audio.streamdata.compat module =========================================== -.. automodule:: paddlespeech.audio.datasets.voxceleb +.. automodule:: paddlespeech.audio.streamdata.compat :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst new file mode 100644 index 000000000..74628e963 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.extradatasets module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.extradatasets + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.filters.rst b/docs/source/api/paddlespeech.audio.streamdata.filters.rst new file mode 100644 index 000000000..d26104279 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.filters.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.filters module +============================================ + +.. automodule:: paddlespeech.audio.streamdata.filters + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.compliance.kaldi.rst b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.compliance.kaldi.rst rename to docs/source/api/paddlespeech.audio.streamdata.gopen.rst index f1459cf1a..1cccb7763 100644 --- a/docs/source/api/paddlespeech.audio.compliance.kaldi.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.compliance.kaldi module +paddlespeech.audio.streamdata.gopen module ========================================== -.. automodule:: paddlespeech.audio.compliance.kaldi +.. 
automodule:: paddlespeech.audio.streamdata.gopen :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst new file mode 100644 index 000000000..7a4b3ce8e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.handlers module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.mix.rst b/docs/source/api/paddlespeech.audio.streamdata.mix.rst new file mode 100644 index 000000000..908b35dd1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.mix.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.mix module +======================================== + +.. automodule:: paddlespeech.audio.streamdata.mix + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst new file mode 100644 index 000000000..203343004 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.paddle\_utils module +================================================== + +.. automodule:: paddlespeech.audio.streamdata.paddle_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst new file mode 100644 index 000000000..ae05fbecc --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.pipeline module +============================================= + +.. automodule:: paddlespeech.audio.streamdata.pipeline + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.rst b/docs/source/api/paddlespeech.audio.streamdata.rst new file mode 100644 index 000000000..a1f4560a3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.rst @@ -0,0 +1,28 @@ +paddlespeech.audio.streamdata package +===================================== + +.. automodule:: paddlespeech.audio.streamdata + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.streamdata.autodecode + paddlespeech.audio.streamdata.cache + paddlespeech.audio.streamdata.compat + paddlespeech.audio.streamdata.extradatasets + paddlespeech.audio.streamdata.filters + paddlespeech.audio.streamdata.gopen + paddlespeech.audio.streamdata.handlers + paddlespeech.audio.streamdata.mix + paddlespeech.audio.streamdata.paddle_utils + paddlespeech.audio.streamdata.pipeline + paddlespeech.audio.streamdata.shardlists + paddlespeech.audio.streamdata.tariterators + paddlespeech.audio.streamdata.utils + paddlespeech.audio.streamdata.writer diff --git a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst new file mode 100644 index 000000000..ec1fe8236 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.shardlists module +=============================================== + +.. 
automodule:: paddlespeech.audio.streamdata.shardlists + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst new file mode 100644 index 000000000..b003b2d42 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.tariterators module +================================================= + +.. automodule:: paddlespeech.audio.streamdata.tariterators + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.streamdata.utils.rst b/docs/source/api/paddlespeech.audio.streamdata.utils.rst new file mode 100644 index 000000000..f248b1131 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.streamdata.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.streamdata.utils module +========================================== + +.. automodule:: paddlespeech.audio.streamdata.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.functional.window.rst b/docs/source/api/paddlespeech.audio.streamdata.writer.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.functional.window.rst rename to docs/source/api/paddlespeech.audio.streamdata.writer.rst index 347762751..7437241f3 100644 --- a/docs/source/api/paddlespeech.audio.functional.window.rst +++ b/docs/source/api/paddlespeech.audio.streamdata.writer.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.functional.window module +paddlespeech.audio.streamdata.writer module =========================================== -.. automodule:: paddlespeech.audio.functional.window +.. automodule:: paddlespeech.audio.streamdata.writer :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.rst b/docs/source/api/paddlespeech.audio.text.rst new file mode 100644 index 000000000..a2018050a --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.rst @@ -0,0 +1,16 @@ +paddlespeech.audio.text package +=============================== + +.. automodule:: paddlespeech.audio.text + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.audio.text.text_featurizer + paddlespeech.audio.text.utility diff --git a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst new file mode 100644 index 000000000..1a8262d08 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.text\_featurizer module +=============================================== + +.. automodule:: paddlespeech.audio.text.text_featurizer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.text.utility.rst b/docs/source/api/paddlespeech.audio.text.utility.rst new file mode 100644 index 000000000..90fcb25f6 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.text.utility.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.text.utility module +====================================== + +.. 
automodule:: paddlespeech.audio.text.utility + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst new file mode 100644 index 000000000..b4b596d6e --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.add\_deltas module +=============================================== + +.. automodule:: paddlespeech.audio.transform.add_deltas + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst new file mode 100644 index 000000000..4828b5904 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.channel\_selector module +===================================================== + +.. automodule:: paddlespeech.audio.transform.channel_selector + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.cmvn.rst b/docs/source/api/paddlespeech.audio.transform.cmvn.rst new file mode 100644 index 000000000..44655a1e4 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.cmvn.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.cmvn module +======================================== + +.. automodule:: paddlespeech.audio.transform.cmvn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.functional.rst b/docs/source/api/paddlespeech.audio.transform.functional.rst new file mode 100644 index 000000000..7877d2495 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.functional.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.functional module +============================================== + +.. automodule:: paddlespeech.audio.transform.functional + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.perturb.rst b/docs/source/api/paddlespeech.audio.transform.perturb.rst new file mode 100644 index 000000000..e3615a5d1 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.perturb.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.perturb module +=========================================== + +.. automodule:: paddlespeech.audio.transform.perturb + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.rst b/docs/source/api/paddlespeech.audio.transform.rst new file mode 100644 index 000000000..47a7303b3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.rst @@ -0,0 +1,24 @@ +paddlespeech.audio.transform package +==================================== + +.. automodule:: paddlespeech.audio.transform + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.audio.transform.add_deltas + paddlespeech.audio.transform.channel_selector + paddlespeech.audio.transform.cmvn + paddlespeech.audio.transform.functional + paddlespeech.audio.transform.perturb + paddlespeech.audio.transform.spec_augment + paddlespeech.audio.transform.spectrogram + paddlespeech.audio.transform.transform_interface + paddlespeech.audio.transform.transformation + paddlespeech.audio.transform.wpe diff --git a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst new file mode 100644 index 000000000..f11a32241 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spec\_augment module +================================================= + +.. automodule:: paddlespeech.audio.transform.spec_augment + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst new file mode 100644 index 000000000..6be0c32ee --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.spectrogram module +=============================================== + +.. automodule:: paddlespeech.audio.transform.spectrogram + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst new file mode 100644 index 000000000..ec8b20857 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transform\_interface module +======================================================== + +.. automodule:: paddlespeech.audio.transform.transform_interface + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.transformation.rst b/docs/source/api/paddlespeech.audio.transform.transformation.rst new file mode 100644 index 000000000..94629b9af --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.transformation.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.transformation module +================================================== + +.. automodule:: paddlespeech.audio.transform.transformation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.transform.wpe.rst b/docs/source/api/paddlespeech.audio.transform.wpe.rst new file mode 100644 index 000000000..85c758114 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.transform.wpe.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.transform.wpe module +======================================= + +.. automodule:: paddlespeech.audio.transform.wpe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst similarity index 50% rename from docs/source/api/paddlespeech.audio.datasets.hey_snips.rst rename to docs/source/api/paddlespeech.audio.utils.check_kwargs.rst index ce08b7003..a18f27e65 100644 --- a/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst +++ b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst @@ -1,7 +1,7 @@ -paddlespeech.audio.datasets.hey\_snips module +paddlespeech.audio.utils.check\_kwargs module ============================================= -.. 
automodule:: paddlespeech.audio.datasets.hey_snips +.. automodule:: paddlespeech.audio.utils.check_kwargs :members: :undoc-members: :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst new file mode 100644 index 000000000..5d060ee15 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.dynamic\_import module +=============================================== + +.. automodule:: paddlespeech.audio.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.rst b/docs/source/api/paddlespeech.audio.utils.rst index db15927da..0f1150ff3 100644 --- a/docs/source/api/paddlespeech.audio.utils.rst +++ b/docs/source/api/paddlespeech.audio.utils.rst @@ -12,8 +12,12 @@ Submodules .. toctree:: :maxdepth: 4 + paddlespeech.audio.utils.check_kwargs paddlespeech.audio.utils.download + paddlespeech.audio.utils.dynamic_import paddlespeech.audio.utils.error paddlespeech.audio.utils.log paddlespeech.audio.utils.numeric + paddlespeech.audio.utils.sox_utils + paddlespeech.audio.utils.tensor_utils paddlespeech.audio.utils.time diff --git a/docs/source/api/paddlespeech.audio.utils.sox_utils.rst b/docs/source/api/paddlespeech.audio.utils.sox_utils.rst new file mode 100644 index 000000000..6fd60b7c3 --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.sox_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.sox\_utils module +========================================== + +.. automodule:: paddlespeech.audio.utils.sox_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst new file mode 100644 index 000000000..93a1f70eb --- /dev/null +++ b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst @@ -0,0 +1,7 @@ +paddlespeech.audio.utils.tensor\_utils module +============================================= + +.. automodule:: paddlespeech.audio.utils.tensor_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.cls.exps.panns.rst b/docs/source/api/paddlespeech.cls.exps.panns.rst index 6147b245e..da4cfe897 100644 --- a/docs/source/api/paddlespeech.cls.exps.panns.rst +++ b/docs/source/api/paddlespeech.cls.exps.panns.rst @@ -21,5 +21,9 @@ Submodules :maxdepth: 4 paddlespeech.cls.exps.panns.export_model + paddlespeech.cls.exps.panns.panns paddlespeech.cls.exps.panns.predict paddlespeech.cls.exps.panns.train + paddlespeech.cls.exps.panns.u2 + paddlespeech.cls.exps.panns.u2_st + paddlespeech.cls.exps.panns.util diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst new file mode 100644 index 000000000..b533e8c42 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.collate module +========================================= + +.. 
automodule:: paddlespeech.kws.exps.mdtc.collate + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst new file mode 100644 index 000000000..45e094555 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.compute\_det module +============================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.compute_det + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst new file mode 100644 index 000000000..46a149b0b --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.plot\_det\_curve module +================================================== + +.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst new file mode 100644 index 000000000..f6cad64e3 --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst @@ -0,0 +1,19 @@ +paddlespeech.kws.exps.mdtc package +================================== + +.. automodule:: paddlespeech.kws.exps.mdtc + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc.collate + paddlespeech.kws.exps.mdtc.compute_det + paddlespeech.kws.exps.mdtc.plot_det_curve + paddlespeech.kws.exps.mdtc.score + paddlespeech.kws.exps.mdtc.train diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst new file mode 100644 index 000000000..aa956b4cb --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.score module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.score + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst new file mode 100644 index 000000000..5e4ca401a --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst @@ -0,0 +1,7 @@ +paddlespeech.kws.exps.mdtc.train module +======================================= + +.. automodule:: paddlespeech.kws.exps.mdtc.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.kws.exps.rst b/docs/source/api/paddlespeech.kws.exps.rst new file mode 100644 index 000000000..bf10d2c9f --- /dev/null +++ b/docs/source/api/paddlespeech.kws.exps.rst @@ -0,0 +1,15 @@ +paddlespeech.kws.exps package +============================= + +.. automodule:: paddlespeech.kws.exps + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.kws.exps.mdtc diff --git a/docs/source/api/paddlespeech.kws.rst b/docs/source/api/paddlespeech.kws.rst index c2829a42e..d21d094c7 100644 --- a/docs/source/api/paddlespeech.kws.rst +++ b/docs/source/api/paddlespeech.kws.rst @@ -12,4 +12,5 @@ Subpackages .. 
toctree:: :maxdepth: 4 + paddlespeech.kws.exps paddlespeech.kws.models diff --git a/docs/source/api/paddlespeech.resource.model_alias.rst b/docs/source/api/paddlespeech.resource.model_alias.rst new file mode 100644 index 000000000..b78e643ac --- /dev/null +++ b/docs/source/api/paddlespeech.resource.model_alias.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.model\_alias module +========================================= + +.. automodule:: paddlespeech.resource.model_alias + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.pretrained_models.rst b/docs/source/api/paddlespeech.resource.pretrained_models.rst new file mode 100644 index 000000000..a02061693 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.pretrained_models.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.pretrained\_models module +=============================================== + +.. automodule:: paddlespeech.resource.pretrained_models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.resource.rst b/docs/source/api/paddlespeech.resource.resource.rst new file mode 100644 index 000000000..8b51eda3c --- /dev/null +++ b/docs/source/api/paddlespeech.resource.resource.rst @@ -0,0 +1,7 @@ +paddlespeech.resource.resource module +===================================== + +.. automodule:: paddlespeech.resource.resource + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.resource.rst b/docs/source/api/paddlespeech.resource.rst new file mode 100644 index 000000000..61fdd5317 --- /dev/null +++ b/docs/source/api/paddlespeech.resource.rst @@ -0,0 +1,17 @@ +paddlespeech.resource package +============================= + +.. automodule:: paddlespeech.resource + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.resource.model_alias + paddlespeech.resource.pretrained_models + paddlespeech.resource.resource diff --git a/docs/source/api/paddlespeech.rst b/docs/source/api/paddlespeech.rst index e7a01bf76..70b93ca08 100644 --- a/docs/source/api/paddlespeech.rst +++ b/docs/source/api/paddlespeech.rst @@ -16,8 +16,18 @@ Subpackages paddlespeech.cli paddlespeech.cls paddlespeech.kws + paddlespeech.resource paddlespeech.s2t paddlespeech.server paddlespeech.t2s paddlespeech.text + paddlespeech.utils paddlespeech.vector + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + paddlespeech.version diff --git a/docs/source/api/paddlespeech.s2t.rst b/docs/source/api/paddlespeech.s2t.rst index 4be22cb87..be9ef52f5 100644 --- a/docs/source/api/paddlespeech.s2t.rst +++ b/docs/source/api/paddlespeech.s2t.rst @@ -19,5 +19,4 @@ Subpackages paddlespeech.s2t.models paddlespeech.s2t.modules paddlespeech.s2t.training - paddlespeech.s2t.transform paddlespeech.s2t.utils diff --git a/docs/source/api/paddlespeech.server.utils.rst b/docs/source/api/paddlespeech.server.utils.rst index 9d1166392..b4051aee3 100644 --- a/docs/source/api/paddlespeech.server.utils.rst +++ b/docs/source/api/paddlespeech.server.utils.rst @@ -18,7 +18,6 @@ Submodules paddlespeech.server.utils.config paddlespeech.server.utils.errors paddlespeech.server.utils.exception - paddlespeech.server.utils.log paddlespeech.server.utils.onnx_infer paddlespeech.server.utils.paddle_predictor paddlespeech.server.utils.util diff --git a/docs/source/api/paddlespeech.t2s.datasets.rst b/docs/source/api/paddlespeech.t2s.datasets.rst index b40eb2bf1..dfbdb0b47 100644 --- a/docs/source/api/paddlespeech.t2s.datasets.rst +++ b/docs/source/api/paddlespeech.t2s.datasets.rst @@ -19,4 +19,5 @@ Submodules paddlespeech.t2s.datasets.get_feats paddlespeech.t2s.datasets.ljspeech paddlespeech.t2s.datasets.preprocess_utils + paddlespeech.t2s.datasets.sampler paddlespeech.t2s.datasets.vocoder_batch_fn diff --git a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst new file mode 100644 index 000000000..ed29c28d7 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.datasets.sampler module +======================================== + +.. automodule:: paddlespeech.t2s.datasets.sampler + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst new file mode 100644 index 000000000..a5e07aace --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.align module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.align + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst new file mode 100644 index 000000000..3771311cb --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.normalize module +================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst new file mode 100644 index 000000000..8d4c24ffe --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.preprocess module +================================================== + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst new file mode 100644 index 000000000..a61158420 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst @@ -0,0 +1,21 @@ +paddlespeech.t2s.exps.ernie\_sat package +======================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.ernie_sat.align + paddlespeech.t2s.exps.ernie_sat.normalize + paddlespeech.t2s.exps.ernie_sat.preprocess + paddlespeech.t2s.exps.ernie_sat.synthesize + paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + paddlespeech.t2s.exps.ernie_sat.train + paddlespeech.t2s.exps.ernie_sat.utils diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst new file mode 100644 index 000000000..ecda2a513 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize module +================================================== + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst new file mode 100644 index 000000000..00fc44952 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.synthesize\_e2e module +======================================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst new file mode 100644 index 000000000..ba9a33344 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.train module +============================================= + +.. automodule:: paddlespeech.t2s.exps.ernie_sat.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst new file mode 100644 index 000000000..a2dd26c38 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.ernie\_sat.utils module +============================================= + +.. 
automodule:: paddlespeech.t2s.exps.ernie_sat.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst index 3c98aa882..fad1fd87f 100644 --- a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst @@ -16,3 +16,4 @@ Submodules paddlespeech.t2s.exps.fastspeech2.normalize paddlespeech.t2s.exps.fastspeech2.preprocess paddlespeech.t2s.exps.fastspeech2.train + paddlespeech.t2s.exps.fastspeech2.vc2_infer diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst new file mode 100644 index 000000000..70a9d6e15 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.fastspeech2.vc2\_infer module +=================================================== + +.. automodule:: paddlespeech.t2s.exps.fastspeech2.vc2_infer + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst index a688435eb..bee18a972 100644 --- a/docs/source/api/paddlespeech.t2s.exps.rst +++ b/docs/source/api/paddlespeech.t2s.exps.rst @@ -12,11 +12,13 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.exps.ernie_sat paddlespeech.t2s.exps.fastspeech2 paddlespeech.t2s.exps.gan_vocoder paddlespeech.t2s.exps.speedyspeech paddlespeech.t2s.exps.tacotron2 paddlespeech.t2s.exps.transformer_tts + paddlespeech.t2s.exps.vits paddlespeech.t2s.exps.waveflow paddlespeech.t2s.exps.wavernn @@ -31,6 +33,7 @@ Submodules paddlespeech.t2s.exps.ort_predict paddlespeech.t2s.exps.ort_predict_e2e paddlespeech.t2s.exps.ort_predict_streaming + paddlespeech.t2s.exps.stream_play_tts paddlespeech.t2s.exps.syn_utils paddlespeech.t2s.exps.synthesize paddlespeech.t2s.exps.synthesize_e2e diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst new file mode 100644 index 000000000..cb22dde0c --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.stream\_play\_tts module +============================================== + +.. automodule:: paddlespeech.t2s.exps.stream_play_tts + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst new file mode 100644 index 000000000..c5606f998 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.normalize module +=========================================== + +.. automodule:: paddlespeech.t2s.exps.vits.normalize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst new file mode 100644 index 000000000..50633c621 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.preprocess module +============================================ + +.. 
automodule:: paddlespeech.t2s.exps.vits.preprocess + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.rst b/docs/source/api/paddlespeech.t2s.exps.vits.rst new file mode 100644 index 000000000..51a9418d5 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.rst @@ -0,0 +1,20 @@ +paddlespeech.t2s.exps.vits package +================================== + +.. automodule:: paddlespeech.t2s.exps.vits + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.exps.vits.normalize + paddlespeech.t2s.exps.vits.preprocess + paddlespeech.t2s.exps.vits.synthesize + paddlespeech.t2s.exps.vits.synthesize_e2e + paddlespeech.t2s.exps.vits.train + paddlespeech.t2s.exps.vits.voice_cloning diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst new file mode 100644 index 000000000..4b22d069a --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize module +============================================ + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst new file mode 100644 index 000000000..053ddfc83 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.synthesize\_e2e module +================================================= + +.. automodule:: paddlespeech.t2s.exps.vits.synthesize_e2e + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst new file mode 100644 index 000000000..31bd3a48f --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.train module +======================================= + +.. automodule:: paddlespeech.t2s.exps.vits.train + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst new file mode 100644 index 000000000..d9be0f310 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.exps.vits.voice\_cloning module +================================================ + +.. automodule:: paddlespeech.t2s.exps.vits.voice_cloning + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst new file mode 100644 index 000000000..1635ec284 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.dataset module +============================================= + +.. 
automodule:: paddlespeech.t2s.frontend.g2pw.dataset + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst new file mode 100644 index 000000000..b7d549070 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.onnx\_api module +=============================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.onnx_api + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst new file mode 100644 index 000000000..10a118b76 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst @@ -0,0 +1,17 @@ +paddlespeech.t2s.frontend.g2pw package +====================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.frontend.g2pw.dataset + paddlespeech.t2s.frontend.g2pw.onnx_api + paddlespeech.t2s.frontend.g2pw.utils diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst new file mode 100644 index 000000000..ce9428037 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.g2pw.utils module +=========================================== + +.. automodule:: paddlespeech.t2s.frontend.g2pw.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst new file mode 100644 index 000000000..4505dddba --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.frontend.mix\_frontend module +============================================== + +.. automodule:: paddlespeech.t2s.frontend.mix_frontend + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.rst index 8fbf1e6eb..b61068616 100644 --- a/docs/source/api/paddlespeech.t2s.frontend.rst +++ b/docs/source/api/paddlespeech.t2s.frontend.rst @@ -12,6 +12,7 @@ Subpackages .. toctree:: :maxdepth: 4 + paddlespeech.t2s.frontend.g2pw paddlespeech.t2s.frontend.normalizer paddlespeech.t2s.frontend.zh_normalization @@ -23,6 +24,7 @@ Submodules paddlespeech.t2s.frontend.arpabet paddlespeech.t2s.frontend.generate_lexicon + paddlespeech.t2s.frontend.mix_frontend paddlespeech.t2s.frontend.phonectic paddlespeech.t2s.frontend.punctuation paddlespeech.t2s.frontend.tone_sandhi diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst new file mode 100644 index 000000000..fce5a83cc --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat module +==================================================== + +.. 
automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst new file mode 100644 index 000000000..8a697d6cf --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.ernie\_sat.ernie\_sat\_updater module +============================================================= + +.. automodule:: paddlespeech.t2s.models.ernie_sat.ernie_sat_updater + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst index 680a85dea..aff7489c7 100644 --- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst +++ b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst @@ -12,4 +12,5 @@ Submodules .. toctree:: :maxdepth: 4 - paddlespeech.t2s.models.ernie_sat.mlm + paddlespeech.t2s.models.ernie_sat.ernie_sat + paddlespeech.t2s.models.ernie_sat.ernie_sat_updater diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst new file mode 100644 index 000000000..7aaba7952 --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.core module +========================================================= + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst new file mode 100644 index 000000000..25c819a7e --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst @@ -0,0 +1,16 @@ +paddlespeech.t2s.models.vits.monotonic\_align package +===================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.t2s.models.vits.monotonic_align.core + paddlespeech.t2s.models.vits.monotonic_align.setup diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst new file mode 100644 index 000000000..a93c3b8bf --- /dev/null +++ b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst @@ -0,0 +1,7 @@ +paddlespeech.t2s.models.vits.monotonic\_align.setup module +========================================================== + +.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.dynamic_import.rst b/docs/source/api/paddlespeech.utils.dynamic_import.rst new file mode 100644 index 000000000..daa4e6e78 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.dynamic_import.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.dynamic\_import module +========================================= + +.. 
automodule:: paddlespeech.utils.dynamic_import + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.env.rst b/docs/source/api/paddlespeech.utils.env.rst new file mode 100644 index 000000000..e51278f82 --- /dev/null +++ b/docs/source/api/paddlespeech.utils.env.rst @@ -0,0 +1,7 @@ +paddlespeech.utils.env module +============================= + +.. automodule:: paddlespeech.utils.env + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api/paddlespeech.utils.rst b/docs/source/api/paddlespeech.utils.rst new file mode 100644 index 000000000..3d47626bb --- /dev/null +++ b/docs/source/api/paddlespeech.utils.rst @@ -0,0 +1,16 @@ +paddlespeech.utils package +========================== + +.. automodule:: paddlespeech.utils + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + paddlespeech.utils.dynamic_import + paddlespeech.utils.env diff --git a/docs/source/api/paddlespeech.version.rst b/docs/source/api/paddlespeech.version.rst new file mode 100644 index 000000000..707c5f886 --- /dev/null +++ b/docs/source/api/paddlespeech.version.rst @@ -0,0 +1,7 @@ +paddlespeech.version module +=========================== + +.. automodule:: paddlespeech.version + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/cls/custom_dataset.md b/docs/source/cls/custom_dataset.md index e39dcf12d..0b9b050b0 100644 --- a/docs/source/cls/custom_dataset.md +++ b/docs/source/cls/custom_dataset.md @@ -14,7 +14,7 @@ Assuming you have some wave files that stored in your own directory. You should Here is an example to build your custom dataset in `custom_dataset.py`: ```python -from paddlespeech.audio.datasets.dataset import AudioClassificationDataset +from paddleaudio.datasets.dataset import AudioClassificationDataset class CustomDataset(AudioClassificationDataset): meta_file = '/PATH/TO/META_FILE.txt' @@ -48,7 +48,7 @@ class CustomDataset(AudioClassificationDataset): Then you can build dataset and data loader from `CustomDataset`: ```python import paddle -from paddlespeech.audio.features import LogMelSpectrogram +from paddleaudio.features import LogMelSpectrogram from custom_dataset import CustomDataset diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml index 1f0323f0d..3a9d42aa5 100644 --- a/examples/esc50/cls0/conf/panns.yaml +++ b/examples/esc50/cls0/conf/panns.yaml @@ -1,5 +1,5 @@ data: - dataset: 'paddlespeech.audio.datasets:ESC50' + dataset: 'paddleaudio.datasets:ESC50' num_classes: 50 train: mode: 'train' diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml index 54d059472..857d36d46 100644 --- a/examples/hey_snips/kws0/conf/mdtc.yaml +++ b/examples/hey_snips/kws0/conf/mdtc.yaml @@ -2,7 +2,7 @@ ########################################### # Data # ########################################### -dataset: 'paddlespeech.audio.datasets:HeySnips' +dataset: 'paddleaudio.datasets:HeySnips' data_dir: '../tests/hey_snips_research_6k_en_train_eval_clean_ter' ############################################ diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py index e5a5dff7b..03d054004 100644 --- a/examples/voxceleb/sv0/local/data_prepare.py +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -16,7 +16,7 @@ import argparse import paddle from yacs.config import CfgNode -from paddlespeech.audio.datasets.voxceleb import VoxCeleb +from paddleaudio.datasets.voxceleb import 
VoxCeleb from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.training.seeding import seed_everything diff --git a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py index 233977bae..9aa8a2ebe 100644 --- a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py @@ -23,7 +23,7 @@ from typing import List import tqdm from yacs.config import CfgNode -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio +from paddleaudio.backends import soundfile_load as load_audio from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.vector_utils import get_chunks diff --git a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py index 49c234a43..c39dc66df 100644 --- a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py +++ b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py @@ -24,7 +24,7 @@ import random import tqdm from yacs.config import CfgNode -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio +from paddleaudio.backends import soundfile_load as load_audio from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.utils.vector_utils import get_chunks diff --git a/paddlespeech/audio/backends/no_backend.py b/paddlespeech/audio/backends/no_backend.py deleted file mode 100644 index 157536f46..000000000 --- a/paddlespeech/audio/backends/no_backend.py +++ /dev/null @@ -1,32 +0,0 @@ -from pathlib import Path -from typing import Callable -from typing import Optional -from typing import Tuple -from typing import Union - -from paddle import Tensor - -#code is from: https://github.com/pytorch/audio/blob/main/torchaudio/backend/no_backend.py - - -def load( - filepath: Union[str, Path], - out: Optional[Tensor]=None, - normalization: Union[bool, float, Callable]=True, - channels_first: bool=True, - num_frames: int=0, - offset: int=0, - filetype: Optional[str]=None, ) -> Tuple[Tensor, int]: - raise RuntimeError("No audio I/O backend is available.") - - -def save(filepath: str, - src: Tensor, - sample_rate: int, - precision: int=16, - channels_first: bool=True) -> None: - raise RuntimeError("No audio I/O backend is available.") - - -def info(filepath: str) -> None: - raise RuntimeError("No audio I/O backend is available.") diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py deleted file mode 100644 index 57e06e521..000000000 --- a/paddlespeech/audio/backends/soundfile_backend.py +++ /dev/null @@ -1,662 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import os
-import warnings
-from typing import Optional
-from typing import Tuple
-
-import numpy as np
-import paddle
-import resampy
-import soundfile
-from scipy.io import wavfile
-
-from ..utils import depth_convert
-from ..utils import ParameterError
-from .common import AudioMetaData
-
-__all__ = [
-    'resample',
-    'to_mono',
-    'normalize',
-    'save',
-    'soundfile_save',
-    'load',
-    'soundfile_load',
-    'info',
-]
-NORMALIZE_TYPES = ['linear', 'gaussian']
-MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
-RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
-EPS = 1e-8
-
-
-def resample(y: np.ndarray,
-             src_sr: int,
-             target_sr: int,
-             mode: str='kaiser_fast') -> np.ndarray:
-    """Audio resampling.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        src_sr (int): Source sample rate.
-        target_sr (int): Target sample rate.
-        mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'.
-
-    Returns:
-        np.ndarray: `y` resampled to `target_sr`.
-    """
-
-    if mode == 'kaiser_best':
-        warnings.warn(
-            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. '
-            'This function is pretty slow; we recommend the mode '
-            'kaiser_fast for large-scale audio training.')
-
-    if not isinstance(y, np.ndarray):
-        raise ParameterError(
-            f'Only numpy np.ndarray is supported, but received y of type {type(y)}')
-
-    if mode not in RESAMPLE_MODES:
-        raise ParameterError(f'resample mode must be in {RESAMPLE_MODES}')
-
-    return resampy.resample(y, src_sr, target_sr, filter=mode)
-
-
-def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray:
-    """Convert stereo audio to mono.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'.
-
-    Returns:
-        np.ndarray: `y` with mono channel.
-    """
-
-    if merge_type not in MERGE_TYPES:
-        raise ParameterError(
-            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
-        )
-    if y.ndim > 2:
-        raise ParameterError(
-            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
-    if y.ndim == 1:  # nothing to merge
-        return y
-
-    if merge_type == 'ch0':
-        return y[0]
-    if merge_type == 'ch1':
-        return y[1]
-    if merge_type == 'random':
-        return y[np.random.randint(0, 2)]
-
-    # need to do averaging according to dtype
-
-    if y.dtype == 'float32':
-        y_out = (y[0] + y[1]) * 0.5
-    elif y.dtype == 'int16':
-        y_out = y.astype('int32')
-        y_out = (y_out[0] + y_out[1]) // 2
-        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
-                        np.iinfo(y.dtype).max).astype(y.dtype)
-    elif y.dtype == 'int8':
-        y_out = y.astype('int16')
-        y_out = (y_out[0] + y_out[1]) // 2
-        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
-                        np.iinfo(y.dtype).max).astype(y.dtype)
-    else:
-        raise ParameterError(f'Unsupported dtype: {y.dtype}')
-    return y_out
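For orientation, the two helpers above compose as follows. This is a minimal sketch rather than part of the patch, and it assumes the equivalent `resample` and `to_mono` functions remain importable from `paddleaudio.backends.soundfile_backend`, the replacement module documented earlier in this patch:

```python
# Minimal sketch (assumed import path; not part of this patch).
import numpy as np

from paddleaudio.backends.soundfile_backend import resample, to_mono

# Stand-in stereo waveform: 1 second of int16 noise at 44.1 kHz, shape (2, T).
y = np.random.randint(-32768, 32767, size=(2, 44100), dtype='int16')

y_mono = to_mono(y, merge_type='average')  # -> shape (44100,), still int16
# resampy operates on floats, so convert before resampling to 16 kHz.
y_16k = resample(y_mono.astype('float32'), src_sr=44100, target_sr=16000,
                 mode='kaiser_fast')
print(y_16k.shape)  # roughly (16000,)
```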
- """ - with soundfile.SoundFile(file) as sf_desc: - sr_native = sf_desc.samplerate - if offset: - sf_desc.seek(int(offset * sr_native)) - if duration is not None: - frame_duration = int(duration * sr_native) - else: - frame_duration = -1 - y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T - - return y, sf_desc.samplerate - - -def normalize(y: np.ndarray, norm_type: str='linear', - mul_factor: float=1.0) -> np.ndarray: - """Normalize an input audio with additional multiplier. - - Args: - y (np.ndarray): Input waveform array in 1D or 2D. - norm_type (str, optional): Type of normalization. Defaults to 'linear'. - mul_factor (float, optional): Scaling factor. Defaults to 1.0. - - Returns: - np.ndarray: `y` after normalization. - """ - - if norm_type == 'linear': - amax = np.max(np.abs(y)) - factor = 1.0 / (amax + EPS) - y = y * factor * mul_factor - elif norm_type == 'gaussian': - amean = np.mean(y) - astd = np.std(y) - astd = max(astd, EPS) - y = mul_factor * (y - amean) / astd - else: - raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}') - - return y - - -def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None: - """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. - - Args: - y (np.ndarray): Input waveform array in 1D or 2D. - sr (int): Sample rate. - file (os.PathLike): Path of auido file to save. - """ - if not file.endswith('.wav'): - raise ParameterError( - f'only .wav file supported, but dst file name is: {file}') - - if sr <= 0: - raise ParameterError( - f'Sample rate should be larger than 0, recieved sr = {sr}') - - if y.dtype not in ['int16', 'int8']: - warnings.warn( - f'input data type is {y.dtype}, will convert data to int16 format before saving' - ) - y_out = depth_convert(y, 'int16') - else: - y_out = y - - wavfile.write(file, sr, y_out) - -def soundfile_load( - file: os.PathLike, - sr: Optional[int]=None, - mono: bool=True, - merge_type: str='average', # ch0,ch1,random,average - normal: bool=True, - norm_type: str='linear', - norm_mul_factor: float=1.0, - offset: float=0.0, - duration: Optional[int]=None, - dtype: str='float32', - resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: - """Load audio file from disk. This function loads audio from disk using using audio beackend. - - Args: - file (os.PathLike): Path of auido file to load. - sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. - mono (bool, optional): Return waveform with mono channel. Defaults to True. - merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. - normal (bool, optional): Waveform normalization. Defaults to True. - norm_type (str, optional): Type of normalization. Defaults to 'linear'. - norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. - offset (float, optional): Offset to the start of waveform. Defaults to 0.0. - duration (Optional[int], optional): Duration of waveform to read. Defaults to None. - dtype (str, optional): Data type of waveform. Defaults to 'float32'. - resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. - - Returns: - Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. 
- """ - - y, r = soundfile_load_(file, offset=offset, dtype=dtype, duration=duration) - - if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)): - raise ParameterError(f'audio file {file} looks empty') - - if mono: - y = to_mono(y, merge_type) - - if sr is not None and sr != r: - y = resample(y, r, sr, mode=resample_mode) - r = sr - - if normal: - y = normalize(y, norm_type, norm_mul_factor) - elif dtype in ['int8', 'int16']: - # still need to do normalization, before depth convertion - y = normalize(y, 'linear', 1.0) - - y = depth_convert(y, dtype) - return y, r - -#the code below is form: https://github.com/pytorch/audio/blob/main/torchaudio/backend/soundfile_backend.py - -def _get_subtype_for_wav(dtype: paddle.dtype, encoding: str, bits_per_sample: int): - if not encoding: - if not bits_per_sample: - subtype = { - paddle.uint8: "PCM_U8", - paddle.int16: "PCM_16", - paddle.int32: "PCM_32", - paddle.float32: "FLOAT", - paddle.float64: "DOUBLE", - }.get(dtype) - if not subtype: - raise ValueError(f"Unsupported dtype for wav: {dtype}") - return subtype - if bits_per_sample == 8: - return "PCM_U8" - return f"PCM_{bits_per_sample}" - if encoding == "PCM_S": - if not bits_per_sample: - return "PCM_32" - if bits_per_sample == 8: - raise ValueError("wav does not support 8-bit signed PCM encoding.") - return f"PCM_{bits_per_sample}" - if encoding == "PCM_U": - if bits_per_sample in (None, 8): - return "PCM_U8" - raise ValueError("wav only supports 8-bit unsigned PCM encoding.") - if encoding == "PCM_F": - if bits_per_sample in (None, 32): - return "FLOAT" - if bits_per_sample == 64: - return "DOUBLE" - raise ValueError("wav only supports 32/64-bit float PCM encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("wav only supports 8-bit mu-law encoding.") - if encoding == "ALAW": - if bits_per_sample in (None, 8): - return "ALAW" - raise ValueError("wav only supports 8-bit a-law encoding.") - raise ValueError(f"wav does not support {encoding}.") - - -def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): - if encoding in (None, "PCM_S"): - return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" - if encoding in ("PCM_U", "PCM_F"): - raise ValueError(f"sph does not support {encoding} encoding.") - if encoding == "ULAW": - if bits_per_sample in (None, 8): - return "ULAW" - raise ValueError("sph only supports 8-bit for mu-law encoding.") - if encoding == "ALAW": - return "ALAW" - raise ValueError(f"sph does not support {encoding}.") - - -def _get_subtype(dtype: paddle.dtype, format: str, encoding: str, bits_per_sample: int): - if format == "wav": - return _get_subtype_for_wav(dtype, encoding, bits_per_sample) - if format == "flac": - if encoding: - raise ValueError("flac does not support encoding.") - if not bits_per_sample: - return "PCM_16" - if bits_per_sample > 24: - raise ValueError("flac does not support bits_per_sample > 24.") - return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" - if format in ("ogg", "vorbis"): - if encoding or bits_per_sample: - raise ValueError("ogg/vorbis does not support encoding/bits_per_sample.") - return "VORBIS" - if format == "sph": - return _get_subtype_for_sphere(encoding, bits_per_sample) - if format in ("nis", "nist"): - return "PCM_16" - raise ValueError(f"Unsupported format: {format}") - -def save( - filepath: str, - src: paddle.Tensor, - sample_rate: int, - channels_first: bool = True, - compression: Optional[float] = None, - format: Optional[str] = 
None, - encoding: Optional[str] = None, - bits_per_sample: Optional[int] = None, -): - """Save audio data to file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - - Args: - filepath (str or pathlib.Path): Path to audio file. - src (paddle.Tensor): Audio data to save. must be 2D tensor. - sample_rate (int): sampling rate - channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, - otherwise `[time, channel]`. - compression (float of None, optional): Not used. - It is here only for interface compatibility reson with "sox_io" backend. - format (str or None, optional): Override the audio format. - When ``filepath`` argument is path-like object, audio format is - inferred from file extension. If the file extension is missing or - different, you can specify the correct format with this argument. - - When ``filepath`` argument is file-like object, - this argument is required. - - Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, - ``"flac"`` and ``"sph"``. - encoding (str or None, optional): Changes the encoding for supported formats. - This argument is effective only for supported formats, sush as - ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are; - - - ``"PCM_S"`` (signed integer Linear PCM) - - ``"PCM_U"`` (unsigned integer Linear PCM) - - ``"PCM_F"`` (floating point PCM) - - ``"ULAW"`` (mu-law) - - ``"ALAW"`` (a-law) - - bits_per_sample (int or None, optional): Changes the bit depth for the - supported formats. - When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, - you can change the bit depth. - Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. - - Supported formats/encodings/bit depth/compression are: - - ``"wav"`` - - 32-bit floating-point PCM - - 32-bit signed integer PCM - - 24-bit signed integer PCM - - 16-bit signed integer PCM - - 8-bit unsigned integer PCM - - 8-bit mu-law - - 8-bit a-law - - Note: - Default encoding/bit depth is determined by the dtype of - the input Tensor. - - ``"flac"`` - - 8-bit - - 16-bit (default) - - 24-bit - - ``"ogg"``, ``"vorbis"`` - - Doesn't accept changing configuration. - - ``"sph"`` - - 8-bit signed integer PCM - - 16-bit signed integer PCM - - 24-bit signed integer PCM - - 32-bit signed integer PCM (default) - - 8-bit mu-law - - 8-bit a-law - - 16-bit a-law - - 24-bit a-law - - 32-bit a-law - - """ - if src.ndim != 2: - raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") - if compression is not None: - warnings.warn( - '`save` function of "soundfile" backend does not support "compression" parameter. ' - "The argument is silently ignored." - ) - if hasattr(filepath, "write"): - if format is None: - raise RuntimeError("`format` is required when saving to file object.") - ext = format.lower() - else: - ext = str(filepath).split(".")[-1].lower() - - if bits_per_sample not in (None, 8, 16, 24, 32, 64): - raise ValueError("Invalid bits_per_sample.") - if bits_per_sample == 24: - warnings.warn( - "Saving audio with 24 bits per sample might warp samples near -1. " - "Using 16 bits per sample might be able to avoid this." 
- ) - subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) - - # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, - # so we extend the extensions manually here - if ext in ["nis", "nist", "sph"] and format is None: - format = "NIST" - - if channels_first: - src = src.t() - - soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format) - -_SUBTYPE2DTYPE = { - "PCM_S8": "int8", - "PCM_U8": "uint8", - "PCM_16": "int16", - "PCM_32": "int32", - "FLOAT": "float32", - "DOUBLE": "float64", -} - -def load( - filepath: str, - frame_offset: int = 0, - num_frames: int = -1, - normalize: bool = True, - channels_first: bool = True, - format: Optional[str] = None, -) -> Tuple[paddle.Tensor, int]: - """Load audio data from file. - - Note: - The formats this function can handle depend on the soundfile installation. - This function is tested on the following formats; - - * WAV - - * 32-bit floating-point - * 32-bit signed integer - * 16-bit signed integer - * 8-bit unsigned integer - - * FLAC - * OGG/VORBIS - * SPHERE - - By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with - ``float32`` dtype and the shape of `[channel, time]`. - The samples are normalized to fit in the range of ``[-1.0, 1.0]``. - - When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit - signed integer and 8-bit unsigned integer (24-bit signed integer is not supported), - by providing ``normalize=False``, this function can return integer Tensor, where the samples - are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor - for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. - - ``normalize`` parameter has no effect on 32-bit floating-point WAV and other formats, such as - ``flac`` and ``mp3``. - For these formats, this function always returns ``float32`` Tensor with values normalized to - ``[-1.0, 1.0]``. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend. - - Args: - filepath (path-like object or file-like object): - Source of audio data. - frame_offset (int, optional): - Number of frames to skip before start reading data. - num_frames (int, optional): - Maximum number of frames to read. ``-1`` reads all the remaining samples, - starting from ``frame_offset``. - This function may return the less number of frames if there is not enough - frames in the given file. - normalize (bool, optional): - When ``True``, this function always return ``float32``, and sample values are - normalized to ``[-1.0, 1.0]``. - If input file is integer WAV, giving ``False`` will change the resulting Tensor type to - integer type. - This argument has no effect for formats other than integer WAV type. - channels_first (bool, optional): - When True, the returned Tensor has dimension `[channel, time]`. - Otherwise, the returned Tensor's dimension is `[time, channel]`. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - (paddle.Tensor, int): Resulting Tensor and sample rate. - If the input file has integer wav format and normalization is off, then it has - integer type, else ``float32`` type. If ``channels_first=True``, it has - `[channel, time]` else `[time, channel]`. 
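A short usage sketch for the save() API above, assuming paddle and soundfile are installed; file names are placeholders:

    import paddle

    # Hypothetical two-channel signal in the default [channel, time] layout.
    wav = paddle.uniform([2, 16000], min=-1.0, max=1.0)

    save("out.wav", wav, sample_rate=16000)  # float32 dtype -> FLOAT subtype
    save("out16.wav", wav, 16000, encoding="PCM_S", bits_per_sample=16)  # 16-bit PCM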
- """ - with soundfile.SoundFile(filepath, "r") as file_: - if file_.format != "WAV" or normalize: - dtype = "float32" - elif file_.subtype not in _SUBTYPE2DTYPE: - raise ValueError(f"Unsupported subtype: {file_.subtype}") - else: - dtype = _SUBTYPE2DTYPE[file_.subtype] - - frames = file_._prepare_read(frame_offset, None, num_frames) - waveform = file_.read(frames, dtype, always_2d=True) - sample_rate = file_.samplerate - - waveform = paddle.to_tensor(waveform) - if channels_first: - waveform = paddle.transpose(waveform, perm=[1,0]) - return waveform, sample_rate - - -# Mapping from soundfile subtype to number of bits per sample. -# This is mostly heuristical and the value is set to 0 when it is irrelevant -# (lossy formats) or when it can't be inferred. -# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: -# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, -# the default seems to be 8 bits but it can be compressed further to 4 bits. -# The dict is inspired from -# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 -_SUBTYPE_TO_BITS_PER_SAMPLE = { - "PCM_S8": 8, # Signed 8 bit data - "PCM_16": 16, # Signed 16 bit data - "PCM_24": 24, # Signed 24 bit data - "PCM_32": 32, # Signed 32 bit data - "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) - "FLOAT": 32, # 32 bit float data - "DOUBLE": 64, # 64 bit float data - "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types - "IMA_ADPCM": 0, # IMA ADPCM. - "MS_ADPCM": 0, # Microsoft ADPCM. - "GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) - "VOX_ADPCM": 0, # OKI / Dialogix ADPCM - "G721_32": 0, # 32kbs G721 ADPCM encoding. - "G723_24": 0, # 24kbs G723 ADPCM encoding. - "G723_40": 0, # 40kbs G723 ADPCM encoding. - "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. - "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. - "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. - "DWVW_N": 0, # N bit Delta Width Variable Word encoding. - "DPCM_8": 8, # 8 bit differential PCM (XI only) - "DPCM_16": 16, # 16 bit differential PCM (XI only) - "VORBIS": 0, # Xiph Vorbis encoding. (lossy) - "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). - "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). - "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). - "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). -} - -def _get_bit_depth(subtype): - if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: - warnings.warn( - f"The {subtype} subtype is unknown to PaddleAudio. As a result, the bits_per_sample " - "attribute will be set to 0. If you are seeing this warning, please " - "report by opening an issue on github (after checking for existing/closed ones). " - "You may otherwise ignore this warning." 
- ) - return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) - -_SUBTYPE_TO_ENCODING = { - "PCM_S8": "PCM_S", - "PCM_16": "PCM_S", - "PCM_24": "PCM_S", - "PCM_32": "PCM_S", - "PCM_U8": "PCM_U", - "FLOAT": "PCM_F", - "DOUBLE": "PCM_F", - "ULAW": "ULAW", - "ALAW": "ALAW", - "VORBIS": "VORBIS", -} - -def _get_encoding(format: str, subtype: str): - if format == "FLAC": - return "FLAC" - return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") - -def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: - """Get signal information of an audio file. - - Note: - ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts - ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, - - Args: - filepath (path-like object or file-like object): - Source of audio data. - format (str or None, optional): - Not used. PySoundFile does not accept format hint. - - Returns: - AudioMetaData: meta data of the given audio. - - """ - sinfo = soundfile.info(filepath) - return AudioMetaData( - sinfo.samplerate, - sinfo.frames, - sinfo.channels, - bits_per_sample=_get_bit_depth(sinfo.subtype), - encoding=_get_encoding(sinfo.format, sinfo.subtype), - ) diff --git a/paddlespeech/audio/compliance/__init__.py b/paddlespeech/audio/compliance/__init__.py deleted file mode 100644 index c08f9ab11..000000000 --- a/paddlespeech/audio/compliance/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from . import kaldi -from . import librosa diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py deleted file mode 100644 index 538be0196..000000000 --- a/paddlespeech/audio/compliance/kaldi.py +++ /dev/null @@ -1,638 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
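And a matching sketch for load() and info() above; the AudioMetaData attribute names (sample_rate, num_frames, num_channels, bits_per_sample, encoding) are assumed from the .common module, which this patch does not show:

    wav_f32, sr = load("out.wav")                     # float32, [channel, time]
    wav_i16, sr = load("out16.wav", normalize=False)  # int16 for 16-bit PCM WAV
    tail, sr = load("out.wav", frame_offset=8000, num_frames=4000)

    meta = info("out16.wav")
    print(meta.sample_rate, meta.num_frames, meta.num_channels,
          meta.bits_per_sample, meta.encoding)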
-# Modified from torchaudio(https://github.com/pytorch/audio) -import math -from typing import Tuple - -import paddle -from paddle import Tensor - -from ..functional import create_dct -from ..functional.window import get_window - -__all__ = [ - 'spectrogram', - 'fbank', - 'mfcc', -] - -# window types -HANNING = 'hann' -HAMMING = 'hamming' -POVEY = 'povey' -RECTANGULAR = 'rect' -BLACKMAN = 'blackman' - - -def _get_epsilon(dtype): - return paddle.to_tensor(1e-07, dtype=dtype) - - -def _next_power_of_2(x: int) -> int: - return 1 if x == 0 else 2**(x - 1).bit_length() - - -def _get_strided(waveform: Tensor, - window_size: int, - window_shift: int, - snip_edges: bool) -> Tensor: - assert waveform.dim() == 1 - num_samples = waveform.shape[0] - - if snip_edges: - if num_samples < window_size: - return paddle.empty((0, 0), dtype=waveform.dtype) - else: - m = 1 + (num_samples - window_size) // window_shift - else: - reversed_waveform = paddle.flip(waveform, [0]) - m = (num_samples + (window_shift // 2)) // window_shift - pad = window_size // 2 - window_shift // 2 - pad_right = reversed_waveform - if pad > 0: - pad_left = reversed_waveform[-pad:] - waveform = paddle.concat((pad_left, waveform, pad_right), axis=0) - else: - waveform = paddle.concat((waveform[-pad:], pad_right), axis=0) - - return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T - - -def _feature_window_function( - window_type: str, - window_size: int, - blackman_coeff: float, - dtype: int, ) -> Tensor: - if window_type == HANNING: - return get_window('hann', window_size, fftbins=False, dtype=dtype) - elif window_type == HAMMING: - return get_window('hamming', window_size, fftbins=False, dtype=dtype) - elif window_type == POVEY: - return get_window( - 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) - elif window_type == RECTANGULAR: - return paddle.ones([window_size], dtype=dtype) - elif window_type == BLACKMAN: - a = 2 * math.pi / (window_size - 1) - window_function = paddle.arange(window_size, dtype=dtype) - return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + - (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function) - ).astype(dtype) - else: - raise Exception('Invalid window type ' + window_type) - - -def _get_log_energy(strided_input: Tensor, epsilon: Tensor, - energy_floor: float) -> Tensor: - log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log() - if energy_floor == 0.0: - return log_energy - return paddle.maximum( - log_energy, - paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype)) - - -def _get_waveform_and_window_properties( - waveform: Tensor, - channel: int, - sr: int, - frame_shift: float, - frame_length: float, - round_to_power_of_two: bool, - preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]: - channel = max(channel, 0) - assert channel < waveform.shape[0], ( - 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) - waveform = waveform[channel, :] # size (n) - window_shift = int( - sr * frame_shift * - 0.001) # pass frame_shift and frame_length in milliseconds - window_size = int(sr * frame_length * 0.001) - padded_window_size = _next_power_of_2( - window_size) if round_to_power_of_two else window_size - - assert 2 <= window_size <= len(waveform), ( - 'choose a window size {} that is [2, {}]'.format(window_size, - len(waveform))) - assert 0 < window_shift, '`window_shift` must be greater than 0' - assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' 
\ - ' use `round_to_power_of_two` or change `frame_length`' - assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' - assert sr > 0, '`sr` must be greater than zero' - return waveform, window_shift, window_size, padded_window_size - - -def _get_window(waveform: Tensor, - padded_window_size: int, - window_size: int, - window_shift: int, - window_type: str, - blackman_coeff: float, - snip_edges: bool, - raw_energy: bool, - energy_floor: float, - dither: float, - remove_dc_offset: bool, - preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]: - dtype = waveform.dtype - epsilon = _get_epsilon(dtype) - - # (m, window_size) - strided_input = _get_strided(waveform, window_size, window_shift, - snip_edges) - - if dither != 0.0: - x = paddle.maximum(epsilon, - paddle.rand(strided_input.shape, dtype=dtype)) - rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) - strided_input = strided_input + rand_gauss * dither - - if remove_dc_offset: - row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1) - strided_input = strided_input - row_means - - if raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) # (m) - - if preemphasis_coefficient != 0.0: - offset_strided_input = paddle.nn.functional.pad( - strided_input.unsqueeze(0), (1, 0), - data_format='NCL', - mode='replicate').squeeze(0) # (m, window_size + 1) - strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : - -1] - - window_function = _feature_window_function( - window_type, window_size, blackman_coeff, - dtype).unsqueeze(0) # (1, window_size) - strided_input = strided_input * window_function # (m, window_size) - - # (m, padded_window_size) - if padded_window_size != window_size: - padding_right = padded_window_size - window_size - strided_input = paddle.nn.functional.pad( - strided_input.unsqueeze(0), (0, padding_right), - data_format='NCL', - mode='constant', - value=0).squeeze(0) - - if not raw_energy: - signal_log_energy = _get_log_energy(strided_input, epsilon, - energy_floor) # size (m) - - return strided_input, signal_log_energy - - -def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: - if subtract_mean: - col_means = paddle.mean(tensor, axis=0).unsqueeze(0) - tensor = tensor - col_means - return tensor - - -def spectrogram(waveform: Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sr: int=16000, - snip_edges: bool=True, - subtract_mean: bool=False, - window_type: str=POVEY) -> Tensor: - """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. - - Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. - blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. - channel (int, optional): Select the channel of waveform. Defaults to -1. - dither (float, optional): Dithering constant . Defaults to 0.0. - energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. - frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. - frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. - preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. 
- raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. - remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. - round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input - to FFT. Defaults to True. - sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it - is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. - subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. - - Returns: - Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames - depends on frame_length and frame_shift. - """ - dtype = waveform.dtype - epsilon = _get_epsilon(dtype) - - waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, - preemphasis_coefficient) - - strided_input, signal_log_energy = _get_window( - waveform, padded_window_size, window_size, window_shift, window_type, - blackman_coeff, snip_edges, raw_energy, energy_floor, dither, - remove_dc_offset, preemphasis_coefficient) - - # (m, padded_window_size // 2 + 1, 2) - fft = paddle.fft.rfft(strided_input) - - power_spectrum = paddle.maximum( - fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1) - power_spectrum[:, 0] = signal_log_energy - - power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) - return power_spectrum - - -def _inverse_mel_scale_scalar(mel_freq: float) -> float: - return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) - - -def _inverse_mel_scale(mel_freq: Tensor) -> Tensor: - return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) - - -def _mel_scale_scalar(freq: float) -> float: - return 1127.0 * math.log(1.0 + freq / 700.0) - - -def _mel_scale(freq: Tensor) -> Tensor: - return 1127.0 * (1.0 + freq / 700.0).log() - - -def _vtln_warp_freq(vtln_low_cutoff: float, - vtln_high_cutoff: float, - low_freq: float, - high_freq: float, - vtln_warp_factor: float, - freq: Tensor) -> Tensor: - assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq' - assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]' - l = vtln_low_cutoff * max(1.0, vtln_warp_factor) - h = vtln_high_cutoff * min(1.0, vtln_warp_factor) - scale = 1.0 / vtln_warp_factor - Fl = scale * l - Fh = scale * h - assert l > low_freq and h < high_freq - scale_left = (Fl - low_freq) / (l - low_freq) - scale_right = (high_freq - Fh) / (high_freq - h) - res = paddle.empty_like(freq) - - outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \ - | paddle.greater_than(freq, paddle.to_tensor(high_freq)) - before_l = paddle.less_than(freq, paddle.to_tensor(l)) - before_h = paddle.less_than(freq, paddle.to_tensor(h)) - after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) - - res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) - res[before_h] = scale * freq[before_h] - res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) - res[outside_low_high_freq] = freq[outside_low_high_freq] - - return res - - -def _vtln_warp_mel_freq(vtln_low_cutoff: float, - 
vtln_high_cutoff: float, - low_freq, - high_freq: float, - vtln_warp_factor: float, - mel_freq: Tensor) -> Tensor: - return _mel_scale( - _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, - vtln_warp_factor, _inverse_mel_scale(mel_freq))) - - -def _get_mel_banks(num_bins: int, - window_length_padded: int, - sample_freq: float, - low_freq: float, - high_freq: float, - vtln_low: float, - vtln_high: float, - vtln_warp_factor: float) -> Tuple[Tensor, Tensor]: - assert num_bins > 3, 'Must have at least 3 mel bins' - assert window_length_padded % 2 == 0 - num_fft_bins = window_length_padded / 2 - nyquist = 0.5 * sample_freq - - if high_freq <= 0.0: - high_freq += nyquist - - assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ - ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) - - fft_bin_width = sample_freq / window_length_padded - mel_low_freq = _mel_scale_scalar(low_freq) - mel_high_freq = _mel_scale_scalar(high_freq) - - mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) - - if vtln_high < 0.0: - vtln_high += nyquist - - assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and - (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \ - ('Bad values in options: vtln-low {} and vtln-high {}, versus ' - 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) - - bin = paddle.arange(num_bins).unsqueeze(1) - left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) - center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) - right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) - - if vtln_warp_factor != 1.0: - left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, - vtln_warp_factor, left_mel) - center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, - high_freq, vtln_warp_factor, - center_mel) - right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, - high_freq, vtln_warp_factor, right_mel) - - center_freqs = _inverse_mel_scale(center_mel) # (num_bins) - # (1, num_fft_bins) - mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) - - # (num_bins, num_fft_bins) - up_slope = (mel - left_mel) / (center_mel - left_mel) - down_slope = (right_mel - mel) / (right_mel - center_mel) - - if vtln_warp_factor == 1.0: - bins = paddle.maximum( - paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) - else: - bins = paddle.zeros_like(up_slope) - up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( - mel, center_mel) - down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( - mel, right_mel) - bins[up_idx] = up_slope[up_idx] - bins[down_idx] = down_slope[down_idx] - - return bins, center_freqs - - -def fbank(waveform: Tensor, - blackman_coeff: float=0.42, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - n_mels: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sr: int=16000, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - use_log_fbank: bool=True, - use_power: bool=True, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: - """Compute and return filter banks from a waveform. 
The output is identical to Kaldi's. - - Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. - blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. - channel (int, optional): Select the channel of waveform. Defaults to -1. - dither (float, optional): Dithering constant . Defaults to 0.0. - energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. - frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. - frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. - high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. - htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. - low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. - n_mels (int, optional): Number of output mel bins. Defaults to 23. - preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. - raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. - remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. - round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input - to FFT. Defaults to True. - sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it - is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. - subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. - use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True. - use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True. - vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. - vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. - vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. - - Returns: - Tensor: A filter banks tensor with shape `(m, n_mels)`. - """ - dtype = waveform.dtype - - waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( - waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, - preemphasis_coefficient) - - strided_input, signal_log_energy = _get_window( - waveform, padded_window_size, window_size, window_shift, window_type, - blackman_coeff, snip_edges, raw_energy, energy_floor, dither, - remove_dc_offset, preemphasis_coefficient) - - # (m, padded_window_size // 2 + 1) - spectrum = paddle.fft.rfft(strided_input).abs() - if use_power: - spectrum = spectrum.pow(2.) 
- - # (n_mels, padded_window_size // 2) - mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, - high_freq, vtln_low, vtln_high, vtln_warp) - mel_energies = mel_energies.astype(dtype) - - # (n_mels, padded_window_size // 2 + 1) - mel_energies = paddle.nn.functional.pad( - mel_energies.unsqueeze(0), (0, 1), - data_format='NCL', - mode='constant', - value=0).squeeze(0) - - # (m, n_mels) - mel_energies = paddle.mm(spectrum, mel_energies.T) - if use_log_fbank: - mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() - - if use_energy: - signal_log_energy = signal_log_energy.unsqueeze(1) - if htk_compat: - mel_energies = paddle.concat( - (mel_energies, signal_log_energy), axis=1) - else: - mel_energies = paddle.concat( - (signal_log_energy, mel_energies), axis=1) - - # (m, n_mels + 1) - mel_energies = _subtract_column_mean(mel_energies, subtract_mean) - return mel_energies - - -def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor: - dct_matrix = create_dct(n_mels, n_mels, 'ortho') - dct_matrix[:, 0] = math.sqrt(1 / float(n_mels)) - dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc) - return dct_matrix - - -def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor: - i = paddle.arange(n_mfcc) - return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / - cepstral_lifter) - - -def mfcc(waveform: Tensor, - blackman_coeff: float=0.42, - cepstral_lifter: float=22.0, - channel: int=-1, - dither: float=0.0, - energy_floor: float=1.0, - frame_length: float=25.0, - frame_shift: float=10.0, - high_freq: float=0.0, - htk_compat: bool=False, - low_freq: float=20.0, - n_mfcc: int=13, - n_mels: int=23, - preemphasis_coefficient: float=0.97, - raw_energy: bool=True, - remove_dc_offset: bool=True, - round_to_power_of_two: bool=True, - sr: int=16000, - snip_edges: bool=True, - subtract_mean: bool=False, - use_energy: bool=False, - vtln_high: float=-500.0, - vtln_low: float=100.0, - vtln_warp: float=1.0, - window_type: str=POVEY) -> Tensor: - """Compute and return mel frequency cepstral coefficients from a waveform. The output is - identical to Kaldi's. - - Args: - waveform (Tensor): A waveform tensor with shape `(C, T)`. - blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. - cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. - channel (int, optional): Select the channel of waveform. Defaults to -1. - dither (float, optional): Dithering constant . Defaults to 0.0. - energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. - frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. - frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. - high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. - htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. - low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13. - n_mels (int, optional): Number of output mel bins. Defaults to 23. - preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. - raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. - remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. 
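For orientation, a hedged sketch of the kaldi-compatible features defined in this file; the mfcc wrapper deleted just below composes fbank with a DCT:

    import paddle

    # Synthetic 1-second mono waveform, shape (C, T).
    wav = paddle.uniform([1, 16000], min=-1.0, max=1.0)

    # (num_frames, n_fft//2 + 1) log-power spectrogram, energy in column 0.
    spec = spectrogram(wav, sr=16000)
    # (num_frames, 23) log mel filterbank features.
    feats = fbank(wav, sr=16000, n_mels=23)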
- round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input - to FFT. Defaults to True. - sr (int, optional): Sample rate of input waveform. Defaults to 16000. - snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it - is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. - subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. - use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. - vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. - vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. - vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. - window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. - - Returns: - Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`. - """ - assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( - n_mfcc, n_mels) - - dtype = waveform.dtype - - # (m, n_mels + use_energy) - feature = fbank( - waveform=waveform, - blackman_coeff=blackman_coeff, - channel=channel, - dither=dither, - energy_floor=energy_floor, - frame_length=frame_length, - frame_shift=frame_shift, - high_freq=high_freq, - htk_compat=htk_compat, - low_freq=low_freq, - n_mels=n_mels, - preemphasis_coefficient=preemphasis_coefficient, - raw_energy=raw_energy, - remove_dc_offset=remove_dc_offset, - round_to_power_of_two=round_to_power_of_two, - sr=sr, - snip_edges=snip_edges, - subtract_mean=False, - use_energy=use_energy, - use_log_fbank=True, - use_power=True, - vtln_high=vtln_high, - vtln_low=vtln_low, - vtln_warp=vtln_warp, - window_type=window_type) - - if use_energy: - # (m) - signal_log_energy = feature[:, n_mels if htk_compat else 0] - mel_offset = int(not htk_compat) - feature = feature[:, mel_offset:(n_mels + mel_offset)] - - # (n_mels, n_mfcc) - dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype) - - # (m, n_mfcc) - feature = feature.matmul(dct_matrix) - - if cepstral_lifter != 0.0: - # (1, n_mfcc) - lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0) - feature *= lifter_coeffs.astype(dtype=dtype) - - if use_energy: - feature[:, 0] = signal_log_energy - - if htk_compat: - energy = feature[:, 0].unsqueeze(1) # (m, 1) - feature = feature[:, 1:] # (m, n_mfcc - 1) - if not use_energy: - energy *= math.sqrt(2) - - feature = paddle.concat((feature, energy), axis=1) - - feature = _subtract_column_mean(feature, subtract_mean) - return feature diff --git a/paddlespeech/audio/compliance/librosa.py b/paddlespeech/audio/compliance/librosa.py deleted file mode 100644 index 17ad51b41..000000000 --- a/paddlespeech/audio/compliance/librosa.py +++ /dev/null @@ -1,788 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# Modified from librosa(https://github.com/librosa/librosa) -import warnings -from typing import List -from typing import Optional -from typing import Union - -import numpy as np -import scipy -from numpy.lib.stride_tricks import as_strided -from scipy import signal - -from ..utils import depth_convert -from ..utils import ParameterError - -__all__ = [ - # dsp - 'stft', - 'mfcc', - 'hz_to_mel', - 'mel_to_hz', - 'mel_frequencies', - 'power_to_db', - 'compute_fbank_matrix', - 'melspectrogram', - 'spectrogram', - 'mu_encode', - 'mu_decode', - # augmentation - 'depth_augment', - 'spect_augment', - 'random_crop1d', - 'random_crop2d', - 'adaptive_spect_augment', -] - - -def _pad_center(data: np.ndarray, size: int, axis: int=-1, - **kwargs) -> np.ndarray: - """Pad an array to a target length along a target axis. - - This differs from `np.pad` by centering the data prior to padding, - analogous to `str.center` - """ - - kwargs.setdefault("mode", "constant") - n = data.shape[axis] - lpad = int((size - n) // 2) - lengths = [(0, 0)] * data.ndim - lengths[axis] = (lpad, int(size - n - lpad)) - - if lpad < 0: - raise ParameterError(("Target size ({size:d}) must be " - "at least input size ({n:d})")) - - return np.pad(data, lengths, **kwargs) - - -def _split_frames(x: np.ndarray, - frame_length: int, - hop_length: int, - axis: int=-1) -> np.ndarray: - """Slice a data array into (overlapping) frames. - - This function is aligned with librosa.frame - """ - - if not isinstance(x, np.ndarray): - raise ParameterError( - f"Input must be of type numpy.ndarray, given type(x)={type(x)}") - - if x.shape[axis] < frame_length: - raise ParameterError(f"Input is too short (n={x.shape[axis]:d})" - f" for frame_length={frame_length:d}") - - if hop_length < 1: - raise ParameterError(f"Invalid hop_length: {hop_length:d}") - - if axis == -1 and not x.flags["F_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.asfortranarray(x) - elif axis == 0 and not x.flags["C_CONTIGUOUS"]: - warnings.warn(f"librosa.util.frame called with axis={axis} " - "on a non-contiguous input. This will result in a copy.") - x = np.ascontiguousarray(x) - - n_frames = 1 + (x.shape[axis] - frame_length) // hop_length - strides = np.asarray(x.strides) - - new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize - - if axis == -1: - shape = list(x.shape)[:-1] + [frame_length, n_frames] - strides = list(strides) + [hop_length * new_stride] - - elif axis == 0: - shape = [n_frames, frame_length] + list(x.shape)[1:] - strides = [hop_length * new_stride] + list(strides) - - else: - raise ParameterError(f"Frame axis={axis} must be either 0 or -1") - - return as_strided(x, shape=shape, strides=strides) - - -def _check_audio(y, mono=True) -> bool: - """Determine whether a variable contains valid audio data. 
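Illustrative only: how the _split_frames helper above strides a signal into overlapping frames (values assume the default axis=-1, where each column is one frame):

    import numpy as np

    sig = np.arange(10, dtype=np.float32)
    frames = _split_frames(sig, frame_length=4, hop_length=2)
    # frames.shape == (4, 4); frames[:, 1] == [2., 3., 4., 5.]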
- - The audio y must be a np.ndarray, ether 1-channel or two channel - """ - if not isinstance(y, np.ndarray): - raise ParameterError("Audio data must be of type numpy.ndarray") - if y.ndim > 2: - raise ParameterError( - f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}") - - if mono and y.ndim == 2: - raise ParameterError( - f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}") - - if (mono and len(y) == 0) or (not mono and y.shape[1] < 0): - raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}") - - if not np.issubdtype(y.dtype, np.floating): - raise ParameterError("Audio data must be floating-point") - - if not np.isfinite(y).all(): - raise ParameterError("Audio buffer is not finite everywhere") - - return True - - -def hz_to_mel(frequencies: Union[float, List[float], np.ndarray], - htk: bool=False) -> np.ndarray: - """Convert Hz to Mels. - - Args: - frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - np.ndarray: Frequency in mels. - """ - freq = np.asanyarray(frequencies) - - if htk: - return 2595.0 * np.log10(1.0 + freq / 700.0) - - # Fill in the linear part - f_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - f_min) / f_sp - - # Fill in the log-scale part - - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - if freq.ndim: - # If we have array data, vectorize - log_t = freq >= min_log_hz - mels[log_t] = min_log_mel + \ - np.log(freq[log_t] / min_log_hz) / logstep - elif freq >= min_log_hz: - # If we have scalar data, heck directly - mels = min_log_mel + np.log(freq / min_log_hz) / logstep - - return mels - - -def mel_to_hz(mels: Union[float, List[float], np.ndarray], - htk: int=False) -> np.ndarray: - """Convert mel bin numbers to frequencies. - - Args: - mels (Union[float, List[float], np.ndarray]): Frequency in mels. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - np.ndarray: Frequencies in Hz. - """ - mel_array = np.asanyarray(mels) - - if htk: - return 700.0 * (10.0**(mel_array / 2595.0) - 1.0) - - # Fill in the linear scale - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel_array - - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = np.log(6.4) / 27.0 # step size for log region - - if mel_array.ndim: - # If we have vector data, vectorize - log_t = mel_array >= min_log_mel - freqs[log_t] = min_log_hz * \ - np.exp(logstep * (mel_array[log_t] - min_log_mel)) - elif mel_array >= min_log_mel: - # If we have scalar data, check directly - freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel)) - - return freqs - - -def mel_frequencies(n_mels: int=128, - fmin: float=0.0, - fmax: float=11025.0, - htk: bool=False) -> np.ndarray: - """Compute mel frequencies. - - Args: - n_mels (int, optional): Number of mel bins. Defaults to 128. - fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`. 
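A quick check of the two mel conversions above; the default (htk=False) is the Slaney scale, linear below 1 kHz and logarithmic above:

    import numpy as np

    freqs = np.array([440.0, 1000.0, 4000.0])
    mels = hz_to_mel(freqs)          # Slaney scale by default
    back = mel_to_hz(mels)
    assert np.allclose(back, freqs)  # the two maps are inverses

    hz_to_mel(440.0, htk=True)       # 2595 * log10(1 + 440/700) ~= 549.6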
- """ - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(fmin, htk=htk) - max_mel = hz_to_mel(fmax, htk=htk) - - mels = np.linspace(min_mel, max_mel, n_mels) - - return mel_to_hz(mels, htk=htk) - - -def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: - """Compute fourier frequencies. - - Args: - sr (int): Sample rate. - n_fft (int): FFT size. - - Returns: - np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. - """ - return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int=128, - fmin: float=0.0, - fmax: Optional[float]=None, - htk: bool=False, - norm: str="slaney", - dtype: type=np.float32) -> np.ndarray: - """Compute fbank matrix. - - Args: - sr (int): Sample rate. - n_fft (int): FFT size. - n_mels (int, optional): Number of mel bins. Defaults to 128. - fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - htk (bool, optional): Use htk scaling. Defaults to False. - norm (str, optional): Type of normalization. Defaults to "slaney". - dtype (type, optional): Data type. Defaults to np.float32. - - - Returns: - np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. - """ - if norm != "slaney": - raise ParameterError('norm must set to slaney') - - if fmax is None: - fmax = float(sr) / 2 - - # Initialize the weights - n_mels = int(n_mels) - weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) - - fdiff = np.diff(mel_f) - ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = np.maximum(0, np.minimum(lower, upper)) - - if norm == "slaney": - # Slaney-style mel is scaled to be approx constant energy per channel - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm[:, np.newaxis] - - # Only check weights if f_mel[0] is positive - if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): - # This means we have an empty channel somewhere - warnings.warn("Empty filters detected in mel frequency basis. " - "Some channels will produce empty responses. " - "Try increasing your sampling rate (and fmax) or " - "reducing n_mels.") - - return weights - - -def stft(x: np.ndarray, - n_fft: int=2048, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str="hann", - center: bool=True, - dtype: type=np.complex64, - pad_mode: str="reflect") -> np.ndarray: - """Short-time Fourier transform (STFT). - - Args: - x (np.ndarray): Input waveform in one dimension. - n_fft (int, optional): FFT size. Defaults to 2048. - hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. - win_length (Optional[int], optional): The size of window. Defaults to None. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - dtype (type, optional): Data type of STFT results. Defaults to np.complex64. 
- pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - - Returns: - np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. - """ - _check_audio(x) - - # By default, use the entire frame - if win_length is None: - win_length = n_fft - - # Set the default hop, if it's not already specified - if hop_length is None: - hop_length = int(win_length // 4) - - fft_window = signal.get_window(window, win_length, fftbins=True) - - # Pad the window out to n_fft size - fft_window = _pad_center(fft_window, n_fft) - - # Reshape so that the window can be broadcast - fft_window = fft_window.reshape((-1, 1)) - - # Pad the time series so that frames are centered - if center: - if n_fft > x.shape[-1]: - warnings.warn( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - x = np.pad(x, int(n_fft // 2), mode=pad_mode) - - elif n_fft > x.shape[-1]: - raise ParameterError( - f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}" - ) - - # Window the time series. - x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) - # Pre-allocate the STFT matrix - stft_matrix = np.empty( - (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") - fft = np.fft # use numpy fft as default - # Constrain STFT block sizes to 256 KB - MAX_MEM_BLOCK = 2**8 * 2**10 - # how many columns can we fit within MAX_MEM_BLOCK? - n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize) - n_columns = max(n_columns, 1) - - for bl_s in range(0, stft_matrix.shape[1], n_columns): - bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) - stft_matrix[:, bl_s:bl_t] = fft.rfft( - fft_window * x_frames[:, bl_s:bl_t], axis=0) - - return stft_matrix - - -def power_to_db(spect: np.ndarray, - ref: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=80.0) -> np.ndarray: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. - - Args: - spect (np.ndarray): STFT power spectrogram of an input waveform. - ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. - - Returns: - np.ndarray: Power spectrogram in db scale. - """ - spect = np.asarray(spect) - - if amin <= 0: - raise ParameterError("amin must be strictly positive") - - if np.issubdtype(spect.dtype, np.complexfloating): - warnings.warn( - "power_to_db was called on complex input so phase " - "information will be discarded. 
To suppress this warning, " - "call power_to_db(np.abs(D)**2) instead.") - magnitude = np.abs(spect) - else: - magnitude = spect - - if callable(ref): - # User supplied a function to calculate reference power - ref_value = ref(magnitude) - else: - ref_value = np.abs(ref) - - log_spec = 10.0 * np.log10(np.maximum(amin, magnitude)) - log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) - - if top_db is not None: - if top_db < 0: - raise ParameterError("top_db must be non-negative") - log_spec = np.maximum(log_spec, log_spec.max() - top_db) - - return log_spec - - -def mfcc(x: np.ndarray, - sr: int=16000, - spect: Optional[np.ndarray]=None, - n_mfcc: int=20, - dct_type: int=2, - norm: str="ortho", - lifter: int=0, - **kwargs) -> np.ndarray: - """Mel-frequency cepstral coefficients (MFCCs) - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. - n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. - dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. - norm (str, optional): Type of normalization. Defaults to "ortho". - lifter (int, optional): Cepstral filtering. Defaults to 0. - - Returns: - np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. - """ - if spect is None: - spect = melspectrogram(x, sr=sr, **kwargs) - - M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc] - - if lifter > 0: - factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) / - lifter) - return M * factor[:, np.newaxis] - elif lifter == 0: - return M - else: - raise ParameterError( - f"MFCC lifter={lifter} must be a non-negative number") - - -def melspectrogram(x: np.ndarray, - sr: int=16000, - window_size: int=512, - hop_length: int=320, - n_mels: int=64, - fmin: float=50.0, - fmax: Optional[float]=None, - window: str='hann', - center: bool=True, - pad_mode: str='reflect', - power: float=2.0, - to_db: bool=True, - ref: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> np.ndarray: - """Compute mel-spectrogram. - - Args: - x (np.ndarray): Input waveform in one dimension. - sr (int, optional): Sample rate. Defaults to 16000. - window_size (int, optional): Size of FFT and window length. Defaults to 512. - hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. - n_mels (int, optional): Number of mel bins. Defaults to 64. - fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. - fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - window (str, optional): A string of window specification. Defaults to "hann". - center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. - pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". - power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - to_db (bool, optional): Enable db scale. Defaults to True. - ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. 
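A hedged sketch of the stft/power_to_db pair above on a synthetic tone; the melspectrogram defined next wires these together with compute_fbank_matrix:

    import numpy as np

    sr = 16000
    t = np.arange(sr, dtype=np.float32) / sr
    tone = np.sin(2 * np.pi * 1000 * t)

    s = stft(tone, n_fft=512, hop_length=160)    # complex64, (257, num_frames)
    db = power_to_db(np.abs(s)**2, top_db=80.0)  # log-power spectrogram in dB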
-
-    Returns:
-        np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`.
-    """
-    _check_audio(x, mono=True)
-    if len(x) <= 0:
-        raise ParameterError('The input waveform is empty')
-
-    if fmax is None:
-        fmax = sr // 2
-    if fmin < 0 or fmin >= fmax:
-        raise ParameterError('fmin and fmax must satisfy 0 <= fmin < fmax')
-
-    s = stft(
-        x,
-        n_fft=window_size,
-        hop_length=hop_length,
-        win_length=window_size,
-        window=window,
-        center=center,
-        pad_mode=pad_mode)
-
-    spect = np.abs(s)**power
-    fb_matrix = compute_fbank_matrix(
-        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
-    mel_spect = np.matmul(fb_matrix, spect)
-    if to_db:
-        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
-    else:
-        return mel_spect
-
-
-def spectrogram(x: np.ndarray,
-                sr: int=16000,
-                window_size: int=512,
-                hop_length: int=320,
-                window: str='hann',
-                center: bool=True,
-                pad_mode: str='reflect',
-                power: float=2.0) -> np.ndarray:
-    """Compute spectrogram.
-
-    Args:
-        x (np.ndarray): Input waveform in one dimension.
-        sr (int, optional): Sample rate. Defaults to 16000.
-        window_size (int, optional): Size of FFT and window length. Defaults to 512.
-        hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
-        window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-
-    Returns:
-        np.ndarray: The STFT spectrogram in power scale with shape `(n_fft//2 + 1, num_frames)`.
-    """
-
-    s = stft(
-        x,
-        n_fft=window_size,
-        hop_length=hop_length,
-        win_length=window_size,
-        window=window,
-        center=center,
-        pad_mode=pad_mode)
-
-    return np.abs(s)**power
-
-
-def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
-    """Mu-law encoding. Encode waveform based on mu-law companding. When `quantized` is True, the result will be converted to integers in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`.
-
-    Args:
-        x (np.ndarray): The input waveform to encode.
-        mu (int, optional): The encoding parameter. Defaults to 255.
-        quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True.
-
-    Returns:
-        np.ndarray: The mu-law encoded waveform.
-    """
-    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
-    if quantized:
-        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0, mu-1]
-    return y
-
-
-def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray:
-    """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when `quantized` is True and `[-1,1]` otherwise.
-
-    Args:
-        y (np.ndarray): The encoded waveform.
-        mu (int, optional): The encoding parameter. Defaults to 255.
-        quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True.
-
-    Returns:
-        np.ndarray: The mu-law decoded waveform.
-    """
-    if mu < 1:
-        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
-
-    mu = mu - 1
-    if quantized:  # undo the quantization
-        y = y * 2 / mu - 1
-    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
-    return x
-
-
-def _randint(high: int) -> int:
-    """Generate one random integer in range [0, high).
-
-    This is a helper function for random data augmentation.
-    """
-    return int(np.random.randint(0, high=high))
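A minimal usage sketch for the feature and mu-law helpers above. It assumes these functions are importable as `paddlespeech.audio.compliance.librosa`; the synthetic sine wave is only a stand-in for real audio.

    import numpy as np

    from paddlespeech.audio.compliance.librosa import (melspectrogram, mfcc,
                                                       mu_decode, mu_encode)

    sr = 16000
    t = np.linspace(0, 1.0, sr, endpoint=False, dtype=np.float32)
    x = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # one second of a 440 Hz tone

    mel = melspectrogram(x, sr=sr, n_mels=64, to_db=True)  # (64, num_frames)
    coef = mfcc(x, sr=sr, n_mfcc=20)                       # (20, num_frames)

    # Mu-law round trip: encode to integer codes, then decode back.
    codes = mu_encode(x, mu=255, quantized=True)
    x_hat = mu_decode(codes, mu=255, quantized=True)
    print(mel.shape, coef.shape, float(np.abs(x - x_hat).max()))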
-
-
-def depth_augment(y: np.ndarray,
-                  choices: List=['int8', 'int16'],
-                  probs: List[float]=[0.5, 0.5]) -> np.ndarray:
-    """Audio bit-depth augmentation. Convert the waveform to a randomly chosen bit depth and back, to simulate the distortion introduced by quantization.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D or 2D.
-        choices (List, optional): Candidate data types for the depth conversion. Defaults to ['int8', 'int16'].
-        probs (List[float], optional): Sampling probabilities of the candidate data types. Defaults to [0.5, 0.5].
-
-    Returns:
-        np.ndarray: The augmented waveform.
-    """
-    assert len(probs) == len(
-        choices
-    ), 'number of choices {} must be equal to size of probs {}'.format(
-        len(choices), len(probs))
-    depth = np.random.choice(choices, p=probs)
-    src_depth = y.dtype
-    y1 = depth_convert(y, depth)
-    y2 = depth_convert(y1, src_depth)
-
-    return y2
-
-
-def adaptive_spect_augment(spect: np.ndarray,
-                           tempo_axis: int=0,
-                           level: float=0.1) -> np.ndarray:
-    """Do adaptive spectrogram augmentation. The strength of the augmentation is governed by the parameter `level`, ranging from 0 to 1, where 0 means no augmentation.
-
-    Args:
-        spect (np.ndarray): Input spectrogram.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-        level (float, optional): The level factor of masking. Defaults to 0.1.
-
-    Returns:
-        np.ndarray: The augmented spectrogram.
-    """
-    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-
-    time_mask_width = int(nt * level * 0.5)
-    freq_mask_width = int(nf * level * 0.5)
-
-    num_time_mask = int(10 * level)
-    num_freq_mask = int(10 * level)
-
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-
-    return spect
-
-
-def spect_augment(spect: np.ndarray,
-                  tempo_axis: int=0,
-                  max_time_mask: int=3,
-                  max_freq_mask: int=3,
-                  max_time_mask_width: int=30,
-                  max_freq_mask_width: int=20) -> np.ndarray:
-    """Do spectrogram augmentation along both the time and frequency axes.
-
-    Args:
-        spect (np.ndarray): Input spectrogram.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-        max_time_mask (int, optional): Maximum number of time masks. Defaults to 3.
-        max_freq_mask (int, optional): Maximum number of frequency masks. Defaults to 3.
-        max_time_mask_width (int, optional): Maximum width of each time mask. Defaults to 30.
-        max_freq_mask_width (int, optional): Maximum width of each frequency mask. Defaults to 20.
-
-    Returns:
-        np.ndarray: The augmented spectrogram.
-    """
-    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
-    if tempo_axis == 0:
-        nt, nf = spect.shape
-    else:
-        nf, nt = spect.shape
-
-    num_time_mask = _randint(max_time_mask)
-    num_freq_mask = _randint(max_freq_mask)
-
-    time_mask_width = _randint(max_time_mask_width)
-    freq_mask_width = _randint(max_freq_mask_width)
-
-    if tempo_axis == 0:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[start:start + time_mask_width, :] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[:, start:start + freq_mask_width] = 0
-    else:
-        for _ in range(num_time_mask):
-            start = _randint(nt - time_mask_width)
-            spect[:, start:start + time_mask_width] = 0
-        for _ in range(num_freq_mask):
-            start = _randint(nf - freq_mask_width)
-            spect[start:start + freq_mask_width, :] = 0
-
-    return spect
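A small sketch of the masking helper above (same import-path assumption; the random matrix stands in for a real spectrogram, and `spect_augment` modifies its input in place, so a copy is passed):

    import numpy as np

    from paddlespeech.audio.compliance.librosa import spect_augment

    spect = np.random.rand(100, 64).astype(np.float32)  # (time, freq), tempo_axis=0
    masked = spect_augment(spect.copy(), tempo_axis=0,
                           max_time_mask=2, max_freq_mask=2,
                           max_time_mask_width=10, max_freq_mask_width=8)
    print(int((masked == 0).sum()), "bins zeroed")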
-
-
-def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray:
-    """Random cropping of an input waveform.
-
-    Args:
-        y (np.ndarray): Input waveform array in 1D.
-        crop_len (int): Length of waveform to crop.
-
-    Returns:
-        np.ndarray: The cropped waveform.
-    """
-    if y.ndim != 1:
-        raise ParameterError('only accepts a 1d tensor or numpy array')
-    n = len(y)
-    idx = _randint(n - crop_len)
-    return y[idx:idx + crop_len]
-
-
-def random_crop2d(s: np.ndarray, crop_len: int,
-                  tempo_axis: int=0) -> np.ndarray:
-    """Random cropping on a spectrogram.
-
-    Args:
-        s (np.ndarray): Input spectrogram in 2D.
-        crop_len (int): Length of spectrogram to crop.
-        tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0.
-
-    Returns:
-        np.ndarray: The cropped spectrogram.
-    """
-    if tempo_axis >= s.ndim:
-        raise ParameterError('axis out of range')
-
-    n = s.shape[tempo_axis]
-    idx = _randint(high=n - crop_len)
-    sli = [slice(None) for _ in range(s.ndim)]
-    sli[tempo_axis] = slice(idx, idx + crop_len)
-    out = s[tuple(sli)]
-    return out
diff --git a/paddlespeech/audio/datasets/__init__.py b/paddlespeech/audio/datasets/__init__.py
deleted file mode 100644
index f95fad305..000000000
--- a/paddlespeech/audio/datasets/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .esc50 import ESC50
-from .gtzan import GTZAN
-from .hey_snips import HeySnips
-from .rirs_noises import OpenRIRNoise
-from .tess import TESS
-from .urban_sound import UrbanSound8K
-from .voxceleb import VoxCeleb
diff --git a/paddlespeech/audio/datasets/dataset.py b/paddlespeech/audio/datasets/dataset.py
deleted file mode 100644
index 81e6bdf5e..000000000
--- a/paddlespeech/audio/datasets/dataset.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List

-import numpy as np
-import paddle
-import paddlespeech.audio  # needed by the paddlespeech.audio.load() calls below
-
-from ..compliance.kaldi import fbank as kaldi_fbank
-from ..compliance.kaldi import mfcc as kaldi_mfcc
-from ..compliance.librosa import melspectrogram
-from ..compliance.librosa import mfcc
-
-feat_funcs = {
-    'raw': None,
-    'melspectrogram': melspectrogram,
-    'mfcc': mfcc,
-    'kaldi_fbank': kaldi_fbank,
-    'kaldi_mfcc': kaldi_mfcc,
-}
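# The table above lets subclasses select a feature extractor by name; 'raw'
# maps to None, which means the waveform is returned untouched. A hypothetical
# lookup (assuming a 1-D numpy waveform) looks like this:
#
#     feat_func = feat_funcs['melspectrogram']
#     feat = feat_func(waveform, sr=16000) if feat_func else waveform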
-
-
-class AudioClassificationDataset(paddle.io.Dataset):
-    """
-    Base class of audio classification dataset.
-    """
-
-    def __init__(self,
-                 files: List[str],
-                 labels: List[int],
-                 feat_type: str='raw',
-                 sample_rate: int=None,
-                 **kwargs):
-        """
-        Args:
-            files (:obj:`List[str]`): A list of absolute paths of audio files.
-            labels (:obj:`List[int]`): Labels of audio files.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                The feature type to extract from an audio file.
-        """
-        super(AudioClassificationDataset, self).__init__()
-
-        if feat_type not in feat_funcs.keys():
-            raise RuntimeError(
-                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
-            )
-
-        self.files = files
-        self.labels = labels
-
-        self.feat_type = feat_type
-        self.sample_rate = sample_rate
-        self.feat_config = kwargs  # Pass keyword arguments to customize feature config
-
-    def _get_data(self, input_file: str):
-        raise NotImplementedError
-
-    def _convert_to_record(self, idx):
-        file, label = self.files[idx], self.labels[idx]
-
-        if self.sample_rate is None:
-            waveform, sample_rate = paddlespeech.audio.load(file)
-        else:
-            waveform, sample_rate = paddlespeech.audio.load(
-                file, sr=self.sample_rate)
-
-        feat_func = feat_funcs[self.feat_type]
-
-        record = {}
-        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
-            waveform = paddle.to_tensor(waveform).unsqueeze(0)  # (C, T)
-            record['feat'] = feat_func(
-                waveform=waveform, sr=self.sample_rate, **self.feat_config)
-        else:
-            record['feat'] = feat_func(
-                waveform, sample_rate,
-                **self.feat_config) if feat_func else waveform
-        record['label'] = label
-        return record
-
-    def __getitem__(self, idx):
-        record = self._convert_to_record(idx)
-        if self.feat_type in ['kaldi_fbank', 'kaldi_mfcc']:
-            return self.keys[idx], record['feat'], record['label']
-        else:
-            return np.array(record['feat']).transpose(), np.array(
-                record['label'], dtype=np.int64)
-
-    def __len__(self):
-        return len(self.files)
diff --git a/paddlespeech/audio/datasets/esc50.py b/paddlespeech/audio/datasets/esc50.py
deleted file mode 100644
index f5c7050f3..000000000
--- a/paddlespeech/audio/datasets/esc50.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['ESC50']
-
-
-class ESC50(AudioClassificationDataset):
-    """
-    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
-    suitable for benchmarking methods of environmental sound classification. The dataset
The dataset - consists of 5-second-long recordings organized into 50 semantical classes (with - 40 examples per class) - - Reference: - ESC: Dataset for Environmental Sound Classification - http://dx.doi.org/10.1145/2733373.2806390 - """ - - archieves = [ - { - 'url': - 'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip', - 'md5': '7771e4b9d86d0945acce719c7a59305a', - }, - ] - label_list = [ - # Animals - 'Dog', - 'Rooster', - 'Pig', - 'Cow', - 'Frog', - 'Cat', - 'Hen', - 'Insects (flying)', - 'Sheep', - 'Crow', - # Natural soundscapes & water sounds - 'Rain', - 'Sea waves', - 'Crackling fire', - 'Crickets', - 'Chirping birds', - 'Water drops', - 'Wind', - 'Pouring water', - 'Toilet flush', - 'Thunderstorm', - # Human, non-speech sounds - 'Crying baby', - 'Sneezing', - 'Clapping', - 'Breathing', - 'Coughing', - 'Footsteps', - 'Laughing', - 'Brushing teeth', - 'Snoring', - 'Drinking, sipping', - # Interior/domestic sounds - 'Door knock', - 'Mouse click', - 'Keyboard typing', - 'Door, wood creaks', - 'Can opening', - 'Washing machine', - 'Vacuum cleaner', - 'Clock alarm', - 'Clock tick', - 'Glass breaking', - # Exterior/urban noises - 'Helicopter', - 'Chainsaw', - 'Siren', - 'Car horn', - 'Engine', - 'Train', - 'Church bells', - 'Airplane', - 'Fireworks', - 'Hand saw', - ] - meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv') - meta_info = collections.namedtuple( - 'META_INFO', - ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take')) - audio_path = os.path.join('ESC-50-master', 'audio') - - def __init__(self, - mode: str='train', - split: int=1, - feat_type: str='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - files, labels = self._get_data(mode, split) - super(ESC50, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self) -> List[collections.namedtuple]: - ret = [] - with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: - for line in rf.readlines()[1:]: - ret.append(self.meta_info(*line.strip().split(','))) - return ret - - def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info() - - files = [] - labels = [] - for sample in meta_info: - filename, fold, target, _, _, _, _ = sample - if mode == 'train' and int(fold) != split: - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - if mode != 'train' and int(fold) == split: - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels diff --git a/paddlespeech/audio/datasets/gtzan.py b/paddlespeech/audio/datasets/gtzan.py deleted file mode 100644 index 1f6835a5a..000000000 --- a/paddlespeech/audio/datasets/gtzan.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-import random
-from typing import List
-from typing import Tuple
-
-from ..utils import DATA_HOME
-from ..utils.download import download_and_decompress
-from .dataset import AudioClassificationDataset
-
-__all__ = ['GTZAN']
-
-
-class GTZAN(AudioClassificationDataset):
-    """
-    The GTZAN dataset consists of 1000 audio tracks, each 30 seconds long. It contains 10 genres,
-    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
-    in machine listening research for music genre recognition (MGR).
-
-    Reference:
-        Musical genre classification of audio signals
-        https://ieeexplore.ieee.org/document/1021072/
-    """
-
-    archieves = [
-        {
-            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
-            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
-        },
-    ]
-    label_list = [
-        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
-        'pop', 'reggae', 'rock'
-    ]
-    meta = os.path.join('genres', 'input.mf')
-    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
-    audio_path = 'genres'
-
-    def __init__(self,
-                 mode='train',
-                 seed=0,
-                 n_folds=5,
-                 split=1,
-                 feat_type='raw',
-                 **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            seed (:obj:`int`, `optional`, defaults to 0):
-                Set the random seed to shuffle samples.
-            n_folds (:obj:`int`, `optional`, defaults to 5):
-                Split the dataset into n folds, with 1 fold for the dev dataset and n-1 for the train dataset.
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specifies the fold used as the dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                The feature type to extract from an audio file.
- """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(GTZAN, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self) -> List[collections.namedtuple]: - ret = [] - with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: - for line in rf.readlines(): - ret.append(self.meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info() - random.seed(seed) # shuffle samples to split data - random.shuffle( - meta_info - ) # make sure using the same seed to create train and dev dataset - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - file_path, label = sample - filename = os.path.basename(file_path) - target = self.label_list.index(label) - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append( - os.path.join(DATA_HOME, self.audio_path, label, filename)) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append( - os.path.join(DATA_HOME, self.audio_path, label, filename)) - labels.append(target) - - return files, labels diff --git a/paddlespeech/audio/datasets/hey_snips.py b/paddlespeech/audio/datasets/hey_snips.py deleted file mode 100644 index 7a67b843b..000000000 --- a/paddlespeech/audio/datasets/hey_snips.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import collections -import json -import os -from typing import List -from typing import Tuple - -from .dataset import AudioClassificationDataset - -__all__ = ['HeySnips'] - - -class HeySnips(AudioClassificationDataset): - meta_info = collections.namedtuple('META_INFO', - ('key', 'label', 'duration', 'wav')) - - def __init__(self, - data_dir: os.PathLike, - mode: str='train', - feat_type: str='kaldi_fbank', - sample_rate: int=16000, - **kwargs): - self.data_dir = data_dir - files, labels = self._get_data(mode) - super(HeySnips, self).__init__( - files=files, - labels=labels, - feat_type=feat_type, - sample_rate=sample_rate, - **kwargs) - - def _get_meta_info(self, mode) -> List[collections.namedtuple]: - ret = [] - with open(os.path.join(self.data_dir, '{}.json'.format(mode)), - 'r') as f: - data = json.load(f) - for item in data: - sample = collections.OrderedDict() - if item['duration'] > 0: - sample['key'] = item['id'] - sample['label'] = 0 if item['is_hotword'] == 1 else -1 - sample['duration'] = item['duration'] - sample['wav'] = os.path.join(self.data_dir, - item['audio_file_path']) - ret.append(self.meta_info(*sample.values())) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - meta_info = self._get_meta_info(mode) - - files = [] - labels = [] - self.keys = [] - self.durations = [] - for sample in meta_info: - key, target, duration, wav = sample - files.append(wav) - labels.append(int(target)) - self.keys.append(key) - self.durations.append(float(duration)) - - return files, labels diff --git a/paddlespeech/audio/datasets/rirs_noises.py b/paddlespeech/audio/datasets/rirs_noises.py deleted file mode 100644 index 61bbf72a2..000000000 --- a/paddlespeech/audio/datasets/rirs_noises.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
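A hypothetical usage sketch for the HeySnips reader above; 'path/to/hey_snips' stands in for a real data_dir containing the train/dev/test json files read by _get_meta_info, and labels follow the convention in _get_meta_info (0 for the hotword, -1 for negatives):

    train_ds = HeySnips(data_dir='path/to/hey_snips', mode='train',
                        feat_type='kaldi_fbank', sample_rate=16000)
    key, feat, label = train_ds[0]  # kaldi feat types also return the utterance key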
-import collections -import csv -import os -import random -from typing import List - -from paddle.io import Dataset -from tqdm import tqdm - -from ..utils import DATA_HOME -from ..utils.download import download_and_decompress -from .dataset import feat_funcs - -__all__ = ['OpenRIRNoise'] - - -class OpenRIRNoise(Dataset): - archieves = [ - { - 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip', - 'md5': 'e6f48e257286e05de56413b4779d8ffb', - }, - ] - - sample_rate = 16000 - meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) - base_path = os.path.join(DATA_HOME, 'open_rir_noise') - wav_path = os.path.join(base_path, 'RIRS_NOISES') - csv_path = os.path.join(base_path, 'csv') - subsets = ['rir', 'noise'] - - def __init__(self, - subset: str='rir', - feat_type: str='raw', - target_dir=None, - random_chunk: bool=True, - chunk_duration: float=3.0, - seed: int=0, - **kwargs): - - assert subset in self.subsets, \ - 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) - - self.subset = subset - self.feat_type = feat_type - self.feat_config = kwargs - self.random_chunk = random_chunk - self.chunk_duration = chunk_duration - - OpenRIRNoise.csv_path = os.path.join( - target_dir, "open_rir_noise", - "csv") if target_dir else self.csv_path - self._data = self._get_data() - super(OpenRIRNoise, self).__init__() - - # Set up a seed to reproduce training or predicting result. - # random.seed(seed) - - def _get_data(self): - # Download audio files. - print(f"rirs noises base path: {self.base_path}") - if not os.path.isdir(self.base_path): - download_and_decompress( - self.archieves, self.base_path, decompress=True) - else: - print( - f"{self.base_path} already exists, we will not download and decompress again" - ) - - # Data preparation. - print(f"prepare the csv to {self.csv_path}") - if not os.path.isdir(self.csv_path): - os.makedirs(self.csv_path) - self.prepare_data() - - data = [] - with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: - for line in rf.readlines()[1:]: - audio_id, duration, wav = line.strip().split(',') - data.append(self.meta_info(audio_id, float(duration), wav)) - - random.shuffle(data) - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = paddlespeech.audio.load(record['wav']) - - assert self.feat_type in feat_funcs.keys(), \ - f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sr=sr, **self.feat_config) if feat_func else waveform - - record.update({'feat': feat}) - return record - - @staticmethod - def _get_chunks(seg_dur, audio_id, audio_duration): - num_chunks = int(audio_duration / seg_dur) # all in milliseconds - - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst - - def _get_audio_info(self, wav_file: str, - split_chunks: bool) -> List[List[str]]: - waveform, sr = paddlespeech.audio.load(wav_file) - audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] - audio_duration = waveform.shape[0] / sr - - ret = [] - if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds. 
- uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, - audio_duration) - - for idx, chunk in enumerate(uniq_chunks_list): - s, e = chunk.split("_")[-2:] # Timestamps of start and end - start_sample = int(float(s) * sr) - end_sample = int(float(e) * sr) - new_wav_file = os.path.join(self.base_path, - audio_id + f'_chunk_{idx+1:02}.wav') - paddlespeech.audio.save(waveform[start_sample:end_sample], sr, - new_wav_file) - # id, duration, new_wav - ret.append([chunk, self.chunk_duration, new_wav_file]) - else: # Keep whole audio. - ret.append([audio_id, audio_duration, wav_file]) - return ret - - def generate_csv(self, - wav_files: List[str], - output_file: str, - split_chunks: bool=True): - print(f'Generating csv: {output_file}') - header = ["id", "duration", "wav"] - - infos = list( - tqdm( - map(self._get_audio_info, wav_files, [split_chunks] * len( - wav_files)), - total=len(wav_files))) - - csv_lines = [] - for info in infos: - csv_lines.extend(info) - - with open(output_file, mode="w") as csv_f: - csv_writer = csv.writer( - csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) - csv_writer.writerow(header) - for line in csv_lines: - csv_writer.writerow(line) - - def prepare_data(self): - rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises", - "rir_list") - rir_files = [] - with open(rir_list, 'r') as f: - for line in f.readlines(): - rir_file = line.strip().split(' ')[-1] - rir_files.append(os.path.join(self.base_path, rir_file)) - - noise_list = os.path.join(self.wav_path, "pointsource_noises", - "noise_list") - noise_files = [] - with open(noise_list, 'r') as f: - for line in f.readlines(): - noise_file = line.strip().split(' ')[-1] - noise_files.append(os.path.join(self.base_path, noise_file)) - - self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv')) - self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv')) - - def __getitem__(self, idx): - return self._convert_to_record(idx) - - def __len__(self): - return len(self._data) diff --git a/paddlespeech/audio/datasets/tess.py b/paddlespeech/audio/datasets/tess.py deleted file mode 100644 index 1469fa5e2..000000000 --- a/paddlespeech/audio/datasets/tess.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -import random -from typing import List -from typing import Tuple - -from ..utils import DATA_HOME -from ..utils.download import download_and_decompress -from .dataset import AudioClassificationDataset - -__all__ = ['TESS'] - - -class TESS(AudioClassificationDataset): - """ - TESS is a set of 200 target words were spoken in the carrier phrase - "Say the word _____' by two actresses (aged 26 and 64 years) and - recordings were made of the set portraying each of seven emotions(anger, - disgust, fear, happiness, pleasant surprise, sadness, and neutral). - There are 2800 stimuli in total. 
- - Reference: - Toronto emotional speech set (TESS) - https://doi.org/10.5683/SP2/E8H2MF - """ - - archieves = [ - { - 'url': - 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip', - 'md5': - '1465311b24d1de704c4c63e4ccc470c7', - }, - ] - label_list = [ - 'angry', - 'disgust', - 'fear', - 'happy', - 'neutral', - 'ps', # pleasant surprise - 'sad', - ] - meta_info = collections.namedtuple('META_INFO', - ('speaker', 'word', 'emotion')) - audio_path = 'TESS_Toronto_emotional_speech_set' - - def __init__(self, - mode='train', - seed=0, - n_folds=5, - split=1, - feat_type='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - seed (:obj:`int`, `optional`, defaults to 0): - Set the random seed to shuffle samples. - n_folds (:obj:`int`, `optional`, defaults to 5): - Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(TESS, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, files) -> List[collections.namedtuple]: - ret = [] - for file in files: - basename_without_extend = os.path.basename(file)[:-4] - ret.append(self.meta_info(*basename_without_extend.split('_'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)): - download_and_decompress(self.archieves, DATA_HOME) - - wav_files = [] - for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - random.seed(seed) # shuffle samples to split data - random.shuffle( - wav_files - ) # make sure using the same seed to create train and dev dataset - meta_info = self._get_meta_info(wav_files) - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - _, _, emotion = sample - target = self.label_list.index(emotion) - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append(wav_files[idx]) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append(wav_files[idx]) - labels.append(target) - - return files, labels diff --git a/paddlespeech/audio/datasets/urban_sound.py b/paddlespeech/audio/datasets/urban_sound.py deleted file mode 100644 index 0389cd5f9..000000000 --- a/paddlespeech/audio/datasets/urban_sound.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -from typing import List -from typing import Tuple - -from ..utils import DATA_HOME -from ..utils.download import download_and_decompress -from .dataset import AudioClassificationDataset - -__all__ = ['UrbanSound8K'] - - -class UrbanSound8K(AudioClassificationDataset): - """ - UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban - sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark, - drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The - classes are drawn from the urban sound taxonomy. - - Reference: - A Dataset and Taxonomy for Urban Sound Research - https://dl.acm.org/doi/10.1145/2647868.2655045 - """ - - archieves = [ - { - 'url': - 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz', - 'md5': '9aa69802bbf37fb986f71ec1483a196e', - }, - ] - label_list = [ - "air_conditioner", "car_horn", "children_playing", "dog_bark", - "drilling", "engine_idling", "gun_shot", "jackhammer", "siren", - "street_music" - ] - meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv') - meta_info = collections.namedtuple( - 'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold', - 'class_id', 'label')) - audio_path = os.path.join('UrbanSound8K', 'audio') - - def __init__(self, - mode: str='train', - split: int=1, - feat_type: str='raw', - **kwargs): - files, labels = self._get_data(mode, split) - super(UrbanSound8K, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - - def _get_meta_info(self): - ret = [] - with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: - for line in rf.readlines()[1:]: - ret.append(self.meta_info(*line.strip().split(','))) - return ret - - def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info() - - files = [] - labels = [] - for sample in meta_info: - filename, _, _, _, _, fold, target, _ = sample - if mode == 'train' and int(fold) != split: - files.append( - os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', - filename)) - labels.append(int(target)) - - if mode != 'train' and int(fold) == split: - files.append( - os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', - filename)) - labels.append(int(target)) - - return files, labels diff --git a/paddlespeech/audio/datasets/voxceleb.py b/paddlespeech/audio/datasets/voxceleb.py deleted file mode 100644 index e1a8aa38b..000000000 --- a/paddlespeech/audio/datasets/voxceleb.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import csv -import glob -import os -import random -from multiprocessing import cpu_count -from typing import List - -from paddle.io import Dataset -from pathos.multiprocessing import Pool -from tqdm import tqdm - -from ..utils import DATA_HOME -from ..utils import decompress -from ..utils.download import download_and_decompress -from .dataset import feat_funcs - -__all__ = ['VoxCeleb'] - - -class VoxCeleb(Dataset): - source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' - archieves_audio_dev = [ - { - 'url': source_url + 'vox1_dev_wav_partaa', - 'md5': 'e395d020928bc15670b570a21695ed96', - }, - { - 'url': source_url + 'vox1_dev_wav_partab', - 'md5': 'bbfaaccefab65d82b21903e81a8a8020', - }, - { - 'url': source_url + 'vox1_dev_wav_partac', - 'md5': '017d579a2a96a077f40042ec33e51512', - }, - { - 'url': source_url + 'vox1_dev_wav_partad', - 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', - }, - ] - archieves_audio_test = [ - { - 'url': source_url + 'vox1_test_wav.zip', - 'md5': '185fdc63c3c739954633d50379a3d102', - }, - ] - archieves_meta = [ - { - 'url': - 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', - 'md5': - 'b73110731c9223c1461fe49cb48dddfc', - }, - ] - - num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 - sample_rate = 16000 - meta_info = collections.namedtuple( - 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) - base_path = os.path.join(DATA_HOME, 'vox1') - wav_path = os.path.join(base_path, 'wav') - meta_path = os.path.join(base_path, 'meta') - veri_test_file = os.path.join(meta_path, 'veri_test2.txt') - csv_path = os.path.join(base_path, 'csv') - subsets = ['train', 'dev', 'enroll', 'test'] - - def __init__( - self, - subset: str='train', - feat_type: str='raw', - random_chunk: bool=True, - chunk_duration: float=3.0, # seconds - split_ratio: float=0.9, # train split ratio - seed: int=0, - target_dir: str=None, - vox2_base_path=None, - **kwargs): - """VoxCeleb data prepare and get the specific dataset audio info - - Args: - subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'. - feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'. - random_chunk (bool, optional): random select a duration from audio. Defaults to True. - chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0. - target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None. - vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None. 
- """ - assert subset in self.subsets, \ - 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) - - self.subset = subset - self.spk_id2label = {} - self.feat_type = feat_type - self.feat_config = kwargs - self.random_chunk = random_chunk - self.chunk_duration = chunk_duration - self.split_ratio = split_ratio - self.target_dir = target_dir if target_dir else VoxCeleb.base_path - self.vox2_base_path = vox2_base_path - - # if we set the target dir, we will change the vox data info data from base path to target dir - VoxCeleb.csv_path = os.path.join( - target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path - VoxCeleb.meta_path = os.path.join( - target_dir, "voxceleb", - 'meta') if target_dir else VoxCeleb.meta_path - VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path, - 'veri_test2.txt') - # self._data = self._get_data()[:1000] # KP: Small dataset test. - self._data = self._get_data() - super(VoxCeleb, self).__init__() - - # Set up a seed to reproduce training or predicting result. - # random.seed(seed) - - def _get_data(self): - # Download audio files. - # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir - # so, we check the vox1/wav dir status - print(f"wav base path: {self.wav_path}") - if not os.path.isdir(self.wav_path): - print("start to download the voxceleb1 dataset") - download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip - self.archieves_audio_dev, - self.base_path, - decompress=False) - download_and_decompress( # download the vox1_test_wav.zip and unzip - self.archieves_audio_test, - self.base_path, - decompress=True) - - # Download all parts and concatenate the files into one zip file. - dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') - print(f'Concatenating all parts to: {dev_zipfile}') - os.system( - f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' - ) - - # Extract all audio files of dev and test set. - decompress(dev_zipfile, self.base_path) - - # Download meta files. - if not os.path.isdir(self.meta_path): - print("prepare the meta data") - download_and_decompress( - self.archieves_meta, self.meta_path, decompress=False) - - # Data preparation. 
- if not os.path.isdir(self.csv_path): - os.makedirs(self.csv_path) - self.prepare_data() - - data = [] - print( - f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}" - ) - with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: - for line in rf.readlines()[1:]: - audio_id, duration, wav, start, stop, spk_id = line.strip( - ).split(',') - data.append( - self.meta_info(audio_id, - float(duration), wav, - int(start), int(stop), spk_id)) - - with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: - for line in f.readlines(): - spk_id, label = line.strip().split(' ') - self.spk_id2label[spk_id] = int(label) - - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = paddlespeech.audio.load(record['wav']) - - # random select a chunk audio samples from the audio - if self.random_chunk: - num_wav_samples = waveform.shape[0] - num_chunk_samples = int(self.chunk_duration * sr) - start = random.randint(0, num_wav_samples - num_chunk_samples - 1) - stop = start + num_chunk_samples - else: - start = record['start'] - stop = record['stop'] - - waveform = waveform[start:stop] - - assert self.feat_type in feat_funcs.keys(), \ - f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sr=sr, **self.feat_config) if feat_func else waveform - - record.update({'feat': feat}) - if self.subset in ['train', - 'dev']: # Labels are available in train and dev. - record.update({'label': self.spk_id2label[record['spk_id']]}) - - return record - - @staticmethod - def _get_chunks(seg_dur, audio_id, audio_duration): - num_chunks = int(audio_duration / seg_dur) # all in milliseconds - - chunk_lst = [ - audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) - for i in range(num_chunks) - ] - return chunk_lst - - def _get_audio_info(self, wav_file: str, - split_chunks: bool) -> List[List[str]]: - waveform, sr = paddlespeech.audio.load(wav_file) - spk_id, sess_id, utt_id = wav_file.split("/")[-3:] - audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) - audio_duration = waveform.shape[0] / sr - - ret = [] - if split_chunks: # Split into pieces of self.chunk_duration seconds. - uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, - audio_duration) - - for chunk in uniq_chunks_list: - s, e = chunk.split("_")[-2:] # Timestamps of start and end - start_sample = int(float(s) * sr) - end_sample = int(float(e) * sr) - # id, duration, wav, start, stop, spk_id - ret.append([ - chunk, audio_duration, wav_file, start_sample, end_sample, - spk_id - ]) - else: # Keep whole audio. 
-            ret.append([
-                audio_id, audio_duration, wav_file, 0, waveform.shape[0],
-                spk_id
-            ])
-        return ret
-
-    def generate_csv(self,
-                     wav_files: List[str],
-                     output_file: str,
-                     split_chunks: bool=True):
-        print(f'Generating csv: {output_file}')
-        header = ["id", "duration", "wav", "start", "stop", "spk_id"]
-        # Note: this may raise a C++ exception, but the program still
-        # executes correctly, so the exception can be ignored.
-        with Pool(cpu_count()) as p:
-            infos = list(
-                tqdm(
-                    p.imap(lambda x: self._get_audio_info(x, split_chunks),
-                           wav_files),
-                    total=len(wav_files)))
-
-        csv_lines = []
-        for info in infos:
-            csv_lines.extend(info)
-
-        with open(output_file, mode="w") as csv_f:
-            csv_writer = csv.writer(
-                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
-            csv_writer.writerow(header)
-            for line in csv_lines:
-                csv_writer.writerow(line)
-
-    def prepare_data(self):
-        # Audio of speakers in veri_test_file should not be included in the training set.
-        print("start to prepare the data csv file")
-        enroll_files = set()
-        test_files = set()
-        # get the enroll and test audio file paths
-        with open(self.veri_test_file, 'r') as f:
-            for line in f.readlines():
-                _, enrol_file, test_file = line.strip().split(' ')
-                enroll_files.add(os.path.join(self.wav_path, enrol_file))
-                test_files.add(os.path.join(self.wav_path, test_file))
-        enroll_files = sorted(enroll_files)
-        test_files = sorted(test_files)
-
-        # get the enroll and test speakers
-        test_spks = set()
-        for file in (enroll_files + test_files):
-            spk = file.split('/wav/')[1].split('/')[0]
-            test_spks.add(spk)
-
-        # get all the train and dev audio file paths
-        audio_files = []
-        speakers = set()
-        print("Getting file list...")
-        for path in [self.wav_path, self.vox2_base_path]:
-            # if the vox2 directory is not set or does not exist,
-            # skip it and use vox1 only
-            if not path or not os.path.exists(path):
-                print(f"{path} is an invalid path, please check again, "
-                      "and we will ignore the vox2 base path")
-                continue
-            for file in glob.glob(
-                    os.path.join(path, "**", "*.wav"), recursive=True):
-                spk = file.split('/wav/')[1].split('/')[0]
-                if spk in test_spks:
-                    continue
-                speakers.add(spk)
-                audio_files.append(file)
-
-        print(
-            f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
-        )
-        # encode the train and dev speaker labels into spk_id2label.txt
-        with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
-            for label, spk_id in enumerate(
-                    sorted(speakers)):  # 1211 vox1, 5994 vox2, 7205 vox1+2
-                f.write(f'{spk_id} {label}\n')
-
-        audio_files = sorted(audio_files)
-        random.shuffle(audio_files)
-        split_idx = int(self.split_ratio * len(audio_files))
-        # split_ratio to train
-        train_files, dev_files = audio_files[:split_idx], audio_files[
-            split_idx:]
-
-        self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
-        self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
-
-        self.generate_csv(
-            enroll_files,
-            os.path.join(self.csv_path, 'enroll.csv'),
-            split_chunks=False)
-        self.generate_csv(
-            test_files,
-            os.path.join(self.csv_path, 'test.csv'),
-            split_chunks=False)
-
-    def __getitem__(self, idx):
-        return self._convert_to_record(idx)
-
-    def __len__(self):
-        return len(self._data)
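A hypothetical end-to-end sketch for the VoxCeleb reader above (the target_dir is illustrative; the first run downloads and prepares the data, which can take a long time):

    train_ds = VoxCeleb(subset='train', feat_type='melspectrogram',
                        target_dir='/data/vox')
    record = train_ds[0]  # dict with 'feat', 'label', 'wav', 'start', 'stop', ...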
diff --git a/paddlespeech/audio/features/__init__.py b/paddlespeech/audio/features/__init__.py
deleted file mode 100644
index 00781397f..000000000
--- a/paddlespeech/audio/features/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from .layers import LogMelSpectrogram
-from .layers import MelSpectrogram
-from .layers import MFCC
-from .layers import Spectrogram
diff --git a/paddlespeech/audio/features/layers.py b/paddlespeech/audio/features/layers.py
deleted file mode 100644
index 292363e64..000000000
--- a/paddlespeech/audio/features/layers.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from functools import partial
-from typing import Optional
-from typing import Union
-
-import paddle
-import paddle.nn as nn
-from paddle import Tensor
-
-from ..functional import compute_fbank_matrix
-from ..functional import create_dct
-from ..functional import power_to_db
-from ..functional.window import get_window
-
-__all__ = [
-    'Spectrogram',
-    'MelSpectrogram',
-    'LogMelSpectrogram',
-    'MFCC',
-]
-
-
-class Spectrogram(nn.Layer):
-    """Compute spectrogram of given signals, typically audio waveforms.
-    The spectrogram is defined as the complex norm of the short-time Fourier transform.
-
-    Args:
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the `t`-th frame is centered at `x[t * hop_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 dtype: str='float32') -> None:
-        super(Spectrogram, self).__init__()
-
-        assert power > 0, 'Power of spectrogram must be > 0.'
-        self.power = power
-
-        if win_length is None:
-            win_length = n_fft
-
-        self.fft_window = get_window(
-            window, win_length, fftbins=True, dtype=dtype)
-        self._stft = partial(
-            paddle.signal.stft,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=self.fft_window,
-            center=center,
-            pad_mode=pad_mode)
-        self.register_buffer('fft_window', self.fft_window)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`.
-
-        Returns:
-            Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`.
-        """
-        stft = self._stft(x)
-        spectrogram = paddle.pow(paddle.abs(stft), self.power)
-        return spectrogram
-
-
-class MelSpectrogram(nn.Layer):
-    """Compute the mel spectrogram of given signals, typically audio waveforms. It is computed by multiplying the spectrogram by a mel filter-bank matrix.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to the same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the :math:`t`-th frame is centered at :math:`x[t \times hop\_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use the HTK formula in computing the fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing the fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 dtype: str='float32') -> None:
-        super(MelSpectrogram, self).__init__()
-
-        self._spectrogram = Spectrogram(
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            dtype=dtype)
-        self.n_mels = n_mels
-        self.f_min = f_min
-        self.f_max = f_max
-        self.htk = htk
-        self.norm = norm
-        if f_max is None:
-            f_max = sr // 2
-        self.fbank_matrix = compute_fbank_matrix(
-            sr=sr,
-            n_fft=n_fft,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            dtype=dtype)  # shape (n_mels, n_fft//2 + 1)
-        self.register_buffer('fbank_matrix', self.fbank_matrix)
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`.
-
-        Returns:
-            Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`.
-        """
-        spect_feature = self._spectrogram(x)
-        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
-        return mel_feature
-
-
-class LogMelSpectrogram(nn.Layer):
-    """Compute the log-mel spectrogram of given signals, typically audio waveforms.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to the same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the :math:`t`-th frame is centered at :math:`x[t \times hop\_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use the HTK formula in computing the fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing the fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the dB level of the signal is raised accordingly; otherwise it is lowered. Defaults to 1.0.
-        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
-        top_db (Optional[float], optional): The maximum dB value of the spectrogram. Defaults to None.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
-    """
-
-    def __init__(self,
-                 sr: int=22050,
-                 n_fft: int=512,
-                 hop_length: Optional[int]=None,
-                 win_length: Optional[int]=None,
-                 window: str='hann',
-                 power: float=2.0,
-                 center: bool=True,
-                 pad_mode: str='reflect',
-                 n_mels: int=64,
-                 f_min: float=50.0,
-                 f_max: Optional[float]=None,
-                 htk: bool=False,
-                 norm: Union[str, float]='slaney',
-                 ref_value: float=1.0,
-                 amin: float=1e-10,
-                 top_db: Optional[float]=None,
-                 dtype: str='float32') -> None:
-        super(LogMelSpectrogram, self).__init__()
-
-        self._melspectrogram = MelSpectrogram(
-            sr=sr,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            win_length=win_length,
-            window=window,
-            power=power,
-            center=center,
-            pad_mode=pad_mode,
-            n_mels=n_mels,
-            f_min=f_min,
-            f_max=f_max,
-            htk=htk,
-            norm=norm,
-            dtype=dtype)
-
-        self.ref_value = ref_value
-        self.amin = amin
-        self.top_db = top_db
-
-    def forward(self, x: Tensor) -> Tensor:
-        """
-        Args:
-            x (Tensor): Tensor of waveforms with shape `(N, T)`.
-
-        Returns:
-            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
-        """
-        mel_feature = self._melspectrogram(x)
-        log_mel_feature = power_to_db(
-            mel_feature,
-            ref_value=self.ref_value,
-            amin=self.amin,
-            top_db=self.top_db)
-        return log_mel_feature
-
-
-class MFCC(nn.Layer):
-    """Compute mel-frequency cepstral coefficient (MFCC) features of given waveforms.
-
-    Args:
-        sr (int, optional): Sample rate. Defaults to 22050.
-        n_mfcc (int, optional): Number of MFCC coefficients to retain. Defaults to 40.
-        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
-        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
-        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to the same as `n_fft`. Defaults to None.
-        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
-        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` so that the :math:`t`-th frame is centered at :math:`x[t \times hop\_length]`. Defaults to True.
-        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
-        n_mels (int, optional): Number of mel bins. Defaults to 64.
-        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
-        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
-        htk (bool, optional): Use the HTK formula in computing the fbank matrix. Defaults to False.
-        norm (Union[str, float], optional): Type of normalization in computing the fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
-        ref_value (float, optional): The reference value. If smaller than 1.0, the dB level of the signal is raised accordingly; otherwise it is lowered. Defaults to 1.0.
-        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
-        top_db (Optional[float], optional): The maximum dB value of the spectrogram. Defaults to None.
-        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
- """ - - def __init__(self, - sr: int=22050, - n_mfcc: int=40, - n_fft: int=512, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str='hann', - power: float=2.0, - center: bool=True, - pad_mode: str='reflect', - n_mels: int=64, - f_min: float=50.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None, - dtype: str=paddle.float32) -> None: - super(MFCC, self).__init__() - assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( - n_mfcc, n_mels) - self._log_melspectrogram = LogMelSpectrogram( - sr=sr, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - power=power, - center=center, - pad_mode=pad_mode, - n_mels=n_mels, - f_min=f_min, - f_max=f_max, - htk=htk, - norm=norm, - ref_value=ref_value, - amin=amin, - top_db=top_db, - dtype=dtype) - self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) - self.register_buffer('dct_matrix', self.dct_matrix) - - def forward(self, x: Tensor) -> Tensor: - """ - Args: - x (Tensor): Tensor of waveforms with shape `(N, T)` - - Returns: - Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. - """ - log_mel_feature = self._log_melspectrogram(x) - mfcc = paddle.matmul( - log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( - (0, 2, 1)) # (B, n_mels, L) - return mfcc diff --git a/paddlespeech/audio/functional/__init__.py b/paddlespeech/audio/functional/__init__.py deleted file mode 100644 index c85232df1..000000000 --- a/paddlespeech/audio/functional/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .functional import compute_fbank_matrix -from .functional import create_dct -from .functional import fft_frequencies -from .functional import hz_to_mel -from .functional import mel_frequencies -from .functional import mel_to_hz -from .functional import power_to_db diff --git a/paddlespeech/audio/functional/functional.py b/paddlespeech/audio/functional/functional.py deleted file mode 100644 index 19c63a9ae..000000000 --- a/paddlespeech/audio/functional/functional.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
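
For orientation, here is a minimal usage sketch of the feature layers removed above. The import path is an assumption based on the `paddleaudio.features` docs this patch adds; treat it as illustrative rather than authoritative.

```python
# Sketch (not part of the patch): exercising the feature layers shown above.
# The `paddleaudio.features` import path is assumed from the new docs added
# by this patch.
import paddle
from paddleaudio.features import LogMelSpectrogram, MFCC

waveform = paddle.randn([2, 22050])  # (N, T): two one-second 22.05 kHz signals

log_mel = LogMelSpectrogram(sr=22050, n_fft=512, n_mels=64)
mfcc = MFCC(sr=22050, n_mfcc=20, n_mels=64)

print(log_mel(waveform).shape)  # [2, 64, num_frames]
print(mfcc(waveform).shape)     # [2, 20, num_frames]
```
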
-# Modified from librosa(https://github.com/librosa/librosa) -import math -from typing import Optional -from typing import Union - -import paddle -from paddle import Tensor - -__all__ = [ - 'hz_to_mel', - 'mel_to_hz', - 'mel_frequencies', - 'fft_frequencies', - 'compute_fbank_matrix', - 'power_to_db', - 'create_dct', -] - - -def hz_to_mel(freq: Union[Tensor, float], - htk: bool=False) -> Union[Tensor, float]: - """Convert Hz to Mels. - - Args: - freq (Union[Tensor, float]): The input tensor with arbitrary shape. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - Union[Tensor, float]: Frequency in mels. - """ - - if htk: - if isinstance(freq, Tensor): - return 2595.0 * paddle.log10(1.0 + freq / 700.0) - else: - return 2595.0 * math.log10(1.0 + freq / 700.0) - - # Fill in the linear part - f_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - f_min) / f_sp - - # Fill in the log-scale part - - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - - if isinstance(freq, Tensor): - target = min_log_mel + paddle.log( - freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 - mask = (freq > min_log_hz).astype(freq.dtype) - mels = target * mask + mels * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if freq >= min_log_hz: - mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep - - return mels - - -def mel_to_hz(mel: Union[float, Tensor], - htk: bool=False) -> Union[float, Tensor]: - """Convert mel bin numbers to frequencies. - - Args: - mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. - htk (bool, optional): Use htk scaling. Defaults to False. - - Returns: - Union[float, Tensor]: Frequencies in Hz. - """ - if htk: - return 700.0 * (10.0**(mel / 2595.0) - 1.0) - - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, Tensor): - target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) - mask = (mel > min_log_mel).astype(mel.dtype) - freqs = target * mask + freqs * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if mel >= min_log_mel: - freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) - - return freqs - - -def mel_frequencies(n_mels: int=64, - f_min: float=0.0, - f_max: float=11025.0, - htk: bool=False, - dtype: str='float32') -> Tensor: - """Compute mel frequencies. - - Args: - n_mels (int, optional): Number of mel bins. Defaults to 64. - f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. - fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. - htk (bool, optional): Use htk scaling. Defaults to False. - dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. - - Returns: - Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. - """ - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(f_min, htk=htk) - max_mel = hz_to_mel(f_max, htk=htk) - mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) - freqs = mel_to_hz(mels, htk=htk) - return freqs - - -def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: - """Compute fourier frequencies. - - Args: - sr (int): Sample rate. 
- n_fft (int): Number of fft bins. - dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. - - Returns: - Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. - """ - return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int=64, - f_min: float=0.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - dtype: str='float32') -> Tensor: - """Compute fbank matrix. - - Args: - sr (int): Sample rate. - n_fft (int): Number of fft bins. - n_mels (int, optional): Number of mel bins. Defaults to 64. - f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. - f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. - htk (bool, optional): Use htk scaling. Defaults to False. - norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. - dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. - - Returns: - Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. - """ - - if f_max is None: - f_max = float(sr) / 2 - - # Initialize the weights - weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies( - n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) - - fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) - ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) - #ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = paddle.maximum( - paddle.zeros_like(lower), paddle.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - if norm == 'slaney': - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm.unsqueeze(1) - elif isinstance(norm, int) or isinstance(norm, float): - weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) - - return weights - - -def power_to_db(spect: Tensor, - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. - - Args: - spect (Tensor): STFT power spectrogram. - ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. - amin (float, optional): Minimum threshold. Defaults to 1e-10. - top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. - - Returns: - Tensor: Power spectrogram in db scale. 
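
To make the two helpers above concrete, a short sketch; the `paddleaudio.functional` import path is assumed from this patch's new docs.

```python
# Sketch (not part of the patch): mel filter bank + dB conversion.
import paddle
from paddleaudio.functional import compute_fbank_matrix, power_to_db  # assumed path

fbank = compute_fbank_matrix(sr=16000, n_fft=512, n_mels=64)
print(fbank.shape)  # [64, 257] == (n_mels, n_fft//2 + 1)

# With the default ref_value=1.0, the scaling reduces to 10*log10(max(amin, x)).
x = paddle.to_tensor([1.0, 0.5, 1e-12])
print(power_to_db(x))  # ~[0.0, -3.01, -100.0]; 1e-12 is clipped at amin=1e-10
```
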
- """ - if amin <= 0: - raise Exception("amin must be strictly positive") - - if ref_value <= 0: - raise Exception("ref_value must be strictly positive") - - ones = paddle.ones_like(spect) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) - log_spec -= 10.0 * math.log10(max(ref_value, amin)) - - if top_db is not None: - if top_db < 0: - raise Exception("top_db must be non-negative") - log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) - - return log_spec - - -def create_dct(n_mfcc: int, - n_mels: int, - norm: Optional[str]='ortho', - dtype: str='float32') -> Tensor: - """Create a discrete cosine transform(DCT) matrix. - - Args: - n_mfcc (int): Number of mel frequency cepstral coefficients. - n_mels (int): Number of mel filterbanks. - norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. - dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. - - Returns: - Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. - """ - n = paddle.arange(n_mels, dtype=dtype) - k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) - dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * - k) # size (n_mfcc, n_mels) - if norm is None: - dct *= 2.0 - else: - assert norm == "ortho" - dct[0] *= 1.0 / math.sqrt(2.0) - dct *= math.sqrt(2.0 / float(n_mels)) - return dct.T diff --git a/paddlespeech/audio/functional/window.py b/paddlespeech/audio/functional/window.py deleted file mode 100644 index c99d50462..000000000 --- a/paddlespeech/audio/functional/window.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -import math -from typing import List -from typing import Tuple -from typing import Union - -import paddle -from paddle import Tensor - -__all__ = [ - 'get_window', -] - - -def _cat(x: List[Tensor], data_type: str) -> Tensor: - l = [paddle.to_tensor(_, data_type) for _ in x] - return paddle.concat(l) - - -def _acosh(x: Union[Tensor, float]) -> Tensor: - if isinstance(x, float): - return math.log(x + math.sqrt(x**2 - 1)) - return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) - - -def _extend(M: int, sym: bool) -> bool: - """Extend window by 1 sample if needed for DFT-even symmetry. """ - if not sym: - return M + 1, True - else: - return M, False - - -def _len_guards(M: int) -> bool: - """Handle small or incorrect window lengths. """ - if int(M) != M or M < 0: - raise ValueError('Window length M must be a non-negative integer') - - return M <= 1 - - -def _truncate(w: Tensor, needed: bool) -> Tensor: - """Truncate window by 1 sample if needed for DFT-even symmetry. """ - if needed: - return w[:-1] - else: - return w - - -def _general_gaussian(M: int, p, sig, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a window with a generalized Gaussian shape. - This function is consistent with scipy.signal.windows.general_gaussian(). 
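
A quick sanity check for `create_dct` above (a sketch, under the same assumed import path): with `norm='ortho'` the retained DCT rows are orthonormal, so the Gram matrix of the returned basis is close to the identity.

```python
# Sketch (not part of the patch): the 'ortho' DCT basis is orthonormal.
import paddle
from paddleaudio.functional import create_dct  # assumed path

dct = create_dct(n_mfcc=20, n_mels=64)            # shape (n_mels, n_mfcc)
gram = paddle.matmul(dct, dct, transpose_x=True)  # dct.T @ dct -> (20, 20)
print(paddle.allclose(gram, paddle.eye(20), atol=1e-5))  # ~True
```
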
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 - w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) - - return _truncate(w, needs_trunc) - - -def _general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a generic weighted sum of cosine terms window. - This function is consistent with scipy.signal.windows.general_cosine(). - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) - w = paddle.zeros((M, ), dtype=dtype) - for k in range(len(a)): - w += a[k] * paddle.cos(k * fac) - return _truncate(w, needs_trunc) - - -def _general_hamming(M: int, alpha: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a generalized Hamming window. - This function is consistent with scipy.signal.windows.general_hamming() - """ - return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) - - -def _taylor(M: int, - nbar=4, - sll=30, - norm=True, - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a Taylor window. - The Taylor window taper function approximates the Dolph-Chebyshev window's - constant sidelobe level for a parameterized number of near-in sidelobes. - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - # Original text uses a negative sidelobe level parameter and then negates - # it in the calculation of B. To keep consistent with other methods we - # assume the sidelobe level parameter to be positive. - B = 10**(sll / 20) - A = _acosh(B) / math.pi - s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) - ma = paddle.arange(1, nbar, dtype=dtype) - - Fm = paddle.empty((nbar - 1, ), dtype=dtype) - signs = paddle.empty_like(ma) - signs[::2] = 1 - signs[1::2] = -1 - m2 = ma * ma - for mi in range(len(ma)): - numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 - )) - if mi == 0: - denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) - elif mi == len(ma) - 1: - denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) - else: - denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ - mi] / m2[mi + 1:]) - - Fm[mi] = numer / denom - - def W(n): - return 1 + 2 * paddle.matmul( - Fm.unsqueeze(0), - paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) - - w = W(paddle.arange(0, M, dtype=dtype)) - - # normalize (Note that this is not described in the original text [1]) - if norm: - scale = 1.0 / W((M - 1) / 2) - w *= scale - w = w.squeeze() - return _truncate(w, needs_trunc) - - -def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Hamming window. - The Hamming window is a taper formed by using a raised cosine with - non-zero endpoints, optimized to minimize the nearest side lobe. - """ - return _general_hamming(M, 0.54, sym, dtype=dtype) - - -def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Hann window. - The Hann window is a taper formed by using a raised cosine or sine-squared - with ends that touch zero. - """ - return _general_hamming(M, 0.5, sym, dtype=dtype) - - -def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Tukey window. - The Tukey window is also known as a tapered cosine window. 
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - - if alpha <= 0: - return paddle.ones((M, ), dtype=dtype) - elif alpha >= 1.0: - return hann(M, sym=sym) - - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(0, M, dtype=dtype) - width = int(alpha * (M - 1) / 2.0) - n1 = n[0:width + 1] - n2 = n[width + 1:M - width - 1] - n3 = n[M - width - 1:] - - w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) - w2 = paddle.ones(n2.shape, dtype=dtype) - w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / - (M - 1)))) - w = paddle.concat([w1, w2, w3]) - - return _truncate(w, needs_trunc) - - -def _kaiser(M: int, beta: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a Kaiser window. - The Kaiser window is a taper formed by using a Bessel function. - """ - raise NotImplementedError() - - -def _gaussian(M: int, std: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a Gaussian window. - The Gaussian widows has a Gaussian shape defined by the standard deviation(std). - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 - sig2 = 2 * std * std - w = paddle.exp(-n**2 / sig2) - - return _truncate(w, needs_trunc) - - -def _exponential(M: int, - center=None, - tau=1., - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute an exponential (or Poisson) window. """ - if sym and center is not None: - raise ValueError("If sym==True, center must be None.") - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - if center is None: - center = (M - 1) / 2 - - n = paddle.arange(0, M, dtype=dtype) - w = paddle.exp(-paddle.abs(n - center) / tau) - - return _truncate(w, needs_trunc) - - -def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a triangular window. - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) - if M % 2 == 0: - w = (2 * n - 1.0) / M - w = paddle.concat([w, w[::-1]]) - else: - w = 2 * n / (M + 1.0) - w = paddle.concat([w, w[-2::-1]]) - - return _truncate(w, needs_trunc) - - -def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Bohman window. - The Bohman window is the autocorrelation of a cosine window. - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - - fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) - w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( - math.pi * fac) - w = _cat([0, w, 0], dtype) - - return _truncate(w, needs_trunc) - - -def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a Blackman window. - The Blackman window is a taper formed by using the first three terms of - a summation of cosines. It was designed to have close to the minimal - leakage possible. It is close to optimal, only slightly worse than a - Kaiser window. - """ - return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) - - -def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: - """Compute a window with a simple cosine shape. 
- """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5)) - - return _truncate(w, needs_trunc) - - -def get_window(window: Union[str, Tuple[str, float]], - win_length: int, - fftbins: bool=True, - dtype: str='float64') -> Tensor: - """Return a window of a given length and type. - - Args: - window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. - win_length (int): Number of samples. - fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. - dtype (str, optional): The data type of the return window. Defaults to 'float64'. - - Returns: - Tensor: The window represented as a tensor. - """ - sym = not fftbins - - args = () - if isinstance(window, tuple): - winstr = window[0] - if len(window) > 1: - args = window[1:] - elif isinstance(window, str): - if window in ['gaussian', 'exponential']: - raise ValueError("The '" + window + "' window needs one or " - "more parameters -- pass a tuple.") - else: - winstr = window - else: - raise ValueError("%s as window type is not supported." % - str(type(window))) - - try: - winfunc = eval('_' + winstr) - except KeyError as e: - raise ValueError("Unknown window type.") from e - - params = (win_length, ) + args - kwargs = {'sym': sym} - return winfunc(*params, dtype=dtype, **kwargs) diff --git a/paddlespeech/audio/io/__init__.py b/paddlespeech/audio/io/__init__.py deleted file mode 100644 index 185a92b8d..000000000 --- a/paddlespeech/audio/io/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/paddlespeech/audio/metric/__init__.py b/paddlespeech/audio/metric/__init__.py deleted file mode 100644 index 7ce6f5cff..000000000 --- a/paddlespeech/audio/metric/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
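
The `get_window` dispatcher shown earlier mirrors `scipy.signal.get_window`: plain names for parameter-free windows, a `(name, param)` tuple otherwise. A usage sketch, with the import path assumed from this patch's `paddleaudio.functional.window` docs:

```python
# Sketch (not part of the patch): requesting windows from get_window above.
from paddleaudio.functional.window import get_window  # assumed path

hann = get_window('hann', 512)                       # periodic (fftbins=True), for STFT
gauss = get_window(('gaussian', 7.0), 512)           # parameterized: std = 7.0
hamming = get_window('hamming', 512, fftbins=False)  # symmetric, for filter design
```
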
-from .eer import compute_eer -from .eer import compute_minDCF diff --git a/paddlespeech/audio/metric/eer.py b/paddlespeech/audio/metric/eer.py deleted file mode 100644 index a1166d3f9..000000000 --- a/paddlespeech/audio/metric/eer.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import List - -import numpy as np -import paddle -from sklearn.metrics import roc_curve - - -def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]: - """Compute EER and return score threshold. - - Args: - labels (np.ndarray): the trial label, shape: [N], one-dimention, N refer to the samples num - scores (np.ndarray): the trial scores, shape: [N], one-dimention, N refer to the samples num - - Returns: - List[float]: eer and the specific threshold - """ - fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores) - fnr = 1 - tpr - eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))] - eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))] - return eer, eer_threshold - - -def compute_minDCF(positive_scores, - negative_scores, - c_miss=1.0, - c_fa=1.0, - p_target=0.01): - """ - This is modified from SpeechBrain - https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509 - Computes the minDCF metric normally used to evaluate speaker verification - systems. The min_DCF is the minimum of the following C_det function computed - within the defined threshold range: - - C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 -p_target) - - where p_miss is the missing probability and p_fa is the probability of having - a false alarm. - - Args: - positive_scores (Paddle.Tensor): The scores from entries of the same class. - negative_scores (Paddle.Tensor): The scores from entries of different classes. - c_miss (float, optional): Cost assigned to a missing error (default 1.0). - c_fa (float, optional): Cost assigned to a false alarm (default 1.0). - p_target (float, optional): Prior probability of having a target (default 0.01). 
- - Returns: - List[float]: min dcf and the specific threshold - """ - # Computing candidate thresholds - if len(positive_scores.shape) > 1: - positive_scores = positive_scores.squeeze() - - if len(negative_scores.shape) > 1: - negative_scores = negative_scores.squeeze() - - thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores])) - thresholds = paddle.unique(thresholds) - - # Adding intermediate thresholds - interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2 - thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds])) - - # Computing False Rejection Rate (miss detection) - positive_scores = paddle.concat( - len(thresholds) * [positive_scores.unsqueeze(0)]) - pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds - p_miss = (pos_scores_threshold.sum(0) - ).astype("float32") / positive_scores.shape[1] - del positive_scores - del pos_scores_threshold - - # Computing False Acceptance Rate (false alarm) - negative_scores = paddle.concat( - len(thresholds) * [negative_scores.unsqueeze(0)]) - neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds - p_fa = (neg_scores_threshold.sum(0) - ).astype("float32") / negative_scores.shape[1] - del negative_scores - del neg_scores_threshold - - c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target) - c_min = paddle.min(c_det, axis=0) - min_index = paddle.argmin(c_det, axis=0) - return float(c_min), float(thresholds[min_index]) diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py index d7f7937bd..c3ff148f8 100644 --- a/paddlespeech/audio/streamdata/autodecode.py +++ b/paddlespeech/audio/streamdata/autodecode.py @@ -310,7 +310,7 @@ def paddle_audio(key, data): fname = os.path.join(dirname, f"file.{extension}") with open(fname, "wb") as stream: stream.write(data) - return paddlespeech.audio.load(fname) + return paddleaudio.backends.soundfile_load(fname) ################################################################ diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py index 79b81c0ce..39dbea621 100644 --- a/paddlespeech/audio/streamdata/tariterators.py +++ b/paddlespeech/audio/streamdata/tariterators.py @@ -111,7 +111,7 @@ def tar_file_iterator(fileobj, assert pos > 0 prefix, postfix = name[:pos], name[pos + 1:] if postfix == 'wav': - waveform, sample_rate = paddlespeech.audio.load( + waveform, sample_rate = paddleaudio.backends.soundfile_load( stream.extractfile(tarinfo), normal=False) result = dict( fname=prefix, wav=waveform, sample_rate=sample_rate) @@ -163,7 +163,7 @@ def tar_file_and_group_iterator(fileobj, if postfix == 'txt': example['txt'] = file_obj.read().decode('utf8').strip() elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = paddlespeech.audio.load( + waveform, sample_rate = paddleaudio.backends.soundfile_load( file_obj, normal=False) waveform = paddle.to_tensor( np.expand_dims(np.array(waveform), 0), diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 5ace7fe0d..5e9b5acec 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -25,8 +25,8 @@ import yaml from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.soundfile_backend import soundfile_load as load -from paddlespeech.audio.features import LogMelSpectrogram +from paddleaudio.backends import soundfile_load as load +from paddleaudio.features import LogMelSpectrogram 
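
The metric helpers deleted above move to `paddleaudio.metric`, as the import rewrites later in this patch show (for example in `paddlespeech/vector/exps/ecapa_tdnn/test.py`). A usage sketch:

```python
# Sketch (not part of the patch): speaker-verification metrics from above.
# compute_minDCF is assumed to be exported alongside compute_eer.
import numpy as np
import paddle
from paddleaudio.metric import compute_eer, compute_minDCF

labels = np.array([1, 1, 1, 0, 0, 0])               # target / non-target trials
scores = np.array([0.9, 0.8, 0.35, 0.4, 0.2, 0.1])  # higher = more target-like
eer, eer_threshold = compute_eer(labels, scores)

min_dcf, dcf_threshold = compute_minDCF(
    positive_scores=paddle.to_tensor([0.9, 0.8, 0.35]),
    negative_scores=paddle.to_tensor([0.4, 0.2, 0.1]),
    p_target=0.01)
```
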
__all__ = ['CLSExecutor'] diff --git a/paddlespeech/cli/kws/infer.py b/paddlespeech/cli/kws/infer.py index bd15e80e6..17482f653 100644 --- a/paddlespeech/cli/kws/infer.py +++ b/paddlespeech/cli/kws/infer.py @@ -24,8 +24,8 @@ import yaml from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.kaldi import fbank as kaldi_fbank +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.kaldi import fbank as kaldi_fbank __all__ = ['KWSExecutor'] diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 5a66b4861..b1335f281 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -27,8 +27,8 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import stats_wrapper -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py index 93eee74b9..b13d037f5 100644 --- a/paddlespeech/cls/exps/panns/deploy/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -18,9 +18,9 @@ import numpy as np from paddle import inference from scipy.special import softmax -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.datasets import ESC50 -from paddlespeech.audio.features import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import melspectrogram # yapf: disable parser = argparse.ArgumentParser() diff --git a/paddlespeech/cls/exps/panns/export_model.py b/paddlespeech/cls/exps/panns/export_model.py index e62d58f02..c295c6a33 100644 --- a/paddlespeech/cls/exps/panns/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -16,7 +16,7 @@ import os import paddle -from paddlespeech.audio.datasets import ESC50 +from paddleaudio.datasets import ESC50 from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py index 97759a89d..8064ab0d4 100644 --- a/paddlespeech/cls/exps/panns/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -18,9 +18,9 @@ import paddle import paddle.nn.functional as F import yaml -from paddlespeech.audio.backends import load as load_audio -from paddlespeech.audio.features import LogMelSpectrogram -from paddlespeech.audio.utils import logger +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.utils import logger from paddlespeech.cls.models import SoundClassifier from paddlespeech.utils.dynamic_import import dynamic_import diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index fba38a01c..56082bd77 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,88 +11,97 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse import os import paddle -import yaml +from yacs.config import CfgNode -from paddlespeech.audio.features import LogMelSpectrogram -from paddlespeech.audio.utils import logger -from paddlespeech.audio.utils import Timer -from paddlespeech.cls.models import SoundClassifier -from paddlespeech.utils.dynamic_import import dynamic_import +from paddleaudio.utils import logger +from paddleaudio.utils import Timer +from paddlespeech.kws.exps.mdtc.collate import collate_features +from paddlespeech.kws.models.loss import max_pooling_loss +from paddlespeech.kws.models.mdtc import KWSModel +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.dynamic_import import dynamic_import -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--cfg_path", type=str, required=True) -args = parser.parse_args() -# yapf: enable +if __name__ == '__main__': + parser = default_argument_parser() + args = parser.parse_args() + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) -if __name__ == "__main__": nranks = paddle.distributed.get_world_size() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) - with open(args.cfg_path, 'r') as f: - config = yaml.safe_load(f) - - model_conf = config['model'] - data_conf = config['data'] - feat_conf = config['feature'] - training_conf = config['training'] - # Dataset - ds_class = dynamic_import(data_conf['dataset']) - train_ds = ds_class(**data_conf['train']) - dev_ds = ds_class(**data_conf['dev']) + ds_class = dynamic_import(config['dataset']) + train_ds = ds_class( + data_dir=config['data_dir'], + mode='train', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + dev_ds = ds_class( + data_dir=config['data_dir'], + mode='dev', + feat_type=config['feat_type'], + sample_rate=config['sample_rate'], + frame_shift=config['frame_shift'], + frame_length=config['frame_length'], + n_mels=config['n_mels'], ) + train_sampler = paddle.io.DistributedBatchSampler( train_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=True, drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=training_conf['num_workers'], + num_workers=config['num_workers'], return_list=True, - use_buffer_reader=True, ) - - # Feature - feature_extractor = LogMelSpectrogram(**feat_conf) + use_buffer_reader=True, + collate_fn=collate_features, ) # Model - backbone_class = dynamic_import(model_conf['backbone']) - backbone = backbone_class(pretrained=True, extract_embedding=True) - model = SoundClassifier(backbone, num_class=data_conf['num_classes']) + backbone_class = dynamic_import(config['backbone']) + backbone = backbone_class( + stack_num=config['stack_num'], + stack_size=config['stack_size'], + 
in_channels=config['in_channels'], + res_channels=config['res_channels'], + kernel_size=config['kernel_size'], ) + model = KWSModel(backbone=backbone, num_keywords=config['num_keywords']) model = paddle.DataParallel(model) + clip = paddle.nn.ClipGradByGlobalNorm(config['grad_clip']) optimizer = paddle.optimizer.Adam( - learning_rate=training_conf['learning_rate'], - parameters=model.parameters()) - criterion = paddle.nn.loss.CrossEntropyLoss() + learning_rate=config['learning_rate'], + weight_decay=config['weight_decay'], + parameters=model.parameters(), + grad_clip=clip) + criterion = max_pooling_loss steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * training_conf['epochs']) + timer = Timer(steps_per_epoch * config['epochs']) timer.start() - for epoch in range(1, training_conf['epochs'] + 1): + for epoch in range(1, config['epochs'] + 1): model.train() avg_loss = 0 num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - waveforms, labels = batch - feats = feature_extractor( - waveforms - ) # Need a padding when lengths of waveforms differ in a batch. - feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] - + keys, feats, labels, lengths = batch logits = model(feats) - - loss = criterion(logits, labels) + loss, corrects, acc = criterion(logits, labels, lengths) loss.backward() optimizer.step() if isinstance(optimizer._learning_rate, @@ -104,21 +113,18 @@ if __name__ == "__main__": avg_loss += loss.numpy()[0] # Calculate metrics - preds = paddle.argmax(logits, axis=1) - num_corrects += (preds == labels).numpy().sum() + num_corrects += corrects num_samples += feats.shape[0] timer.count() - if (batch_idx + 1 - ) % training_conf['log_freq'] == 0 and local_rank == 0: + if (batch_idx + 1) % config['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= training_conf['log_freq'] + avg_loss /= config['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, training_conf['epochs'], batch_idx + 1, - steps_per_epoch) + epoch, config['epochs'], batch_idx + 1, steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -129,41 +135,40 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 - if epoch % training_conf[ + if epoch % config[ 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=training_conf['batch_size'], + batch_size=config['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=training_conf['num_workers'], - return_list=True, ) + num_workers=config['num_workers'], + return_list=True, + use_buffer_reader=True, + collate_fn=collate_features, ) model.eval() num_corrects = 0 num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - waveforms, labels = batch - feats = feature_extractor(waveforms) - feats = paddle.transpose(feats, [0, 2, 1]) - + keys, feats, labels, lengths = batch logits = model(feats) - - preds = paddle.argmax(logits, axis=1) - num_corrects += (preds == labels).numpy().sum() + loss, corrects, acc = criterion(logits, labels, lengths) + num_corrects += corrects num_samples += feats.shape[0] + eval_acc = num_corrects / num_samples print_msg = '[Evaluation result]' - print_msg += ' dev_acc={:.4f}'.format(num_corrects 
/ num_samples) + print_msg += ' dev_acc={:.4f}'.format(eval_acc) logger.eval(print_msg) # Save model - save_dir = os.path.join(training_conf['checkpoint_dir'], + save_dir = os.path.join(config['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), diff --git a/paddlespeech/cls/models/panns/panns.py b/paddlespeech/cls/models/panns/panns.py index 37deae80c..feefecbe1 100644 --- a/paddlespeech/cls/models/panns/panns.py +++ b/paddlespeech/cls/models/panns/panns.py @@ -16,7 +16,7 @@ import os import paddle.nn as nn import paddle.nn.functional as F -from paddlespeech.audio.utils.download import load_state_dict_from_url +from paddleaudio.utils.download import load_state_dict_from_url from paddlespeech.utils.env import MODEL_HOME __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py index 94e45d590..56082bd77 100644 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -16,8 +16,8 @@ import os import paddle from yacs.config import CfgNode -from paddlespeech.audio.utils import logger -from paddlespeech.audio.utils import Timer +from paddleaudio.utils import logger +from paddleaudio.utils import Timer from paddlespeech.kws.exps.mdtc.collate import collate_features from paddlespeech.kws.models.loss import max_pooling_loss from paddlespeech.kws.models.mdtc import KWSModel diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py index ac5720fd5..12e8a2966 100644 --- a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py +++ b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py @@ -17,7 +17,7 @@ import paddle from python_speech_features import delta from python_speech_features import mfcc -import paddlespeech.audio.compliance.kaldi as kaldi +import paddleaudio.compliance.kaldi as kaldi class AudioFeaturizer(): diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 8a9849492..5c2fa3071 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -29,9 +29,9 @@ import paddle from paddle import jit from paddle import nn -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import pad_sequence -from paddlespeech.audio.utils.tensor_utils import th_accuracy +from paddleaudio.utils.tensor_utils import add_sos_eos +from paddleaudio.utils.tensor_utils import pad_sequence +from paddleaudio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0d..1ba313c46 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -25,8 +25,8 @@ import paddle from paddle import jit from paddle import nn -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import th_accuracy +from paddleaudio.utils.tensor_utils import add_sos_eos +from paddleaudio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.modules.cmvn import GlobalCMVN diff --git 
a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index e617c3650..ecbdbfa5a 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -17,8 +17,8 @@ from collections import OrderedDict import numpy as np import paddle -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.cli.log import logger from paddlespeech.cli.vector.infer import VectorExecutor from paddlespeech.server.engine.base_engine import BaseEngine diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py index 32546a330..ac92cf666 100644 --- a/paddlespeech/server/util.py +++ b/paddlespeech/server/util.py @@ -28,7 +28,7 @@ import requests import yaml from paddle.framework import load -import paddlespeech.audio +import paddleaudio from .entry import client_commands from .entry import server_commands from paddlespeech.cli import download @@ -289,7 +289,7 @@ def _note_one_stat(cls_name, params={}): if 'audio_file' in params: try: - _, sr = paddlespeech.audio.load(params['audio_file']) + _, sr = paddleaudio.backends.soundfile_load(params['audio_file']) except Exception: sr = -1 diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py index e9203ef99..790a4eb67 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -18,8 +18,8 @@ import time import paddle from yacs.config import CfgNode -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py index 6c87dbe7b..1b38075d6 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/test.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py @@ -21,7 +21,7 @@ from paddle.io import DataLoader from tqdm import tqdm from yacs.config import CfgNode -from paddlespeech.audio.metric import compute_eer +from paddleaudio.metric import compute_eer from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.batch import batch_feature_normalize from paddlespeech.vector.io.dataset import CSVDataset diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py index 961b75e29..73da16dc7 100644 --- a/paddlespeech/vector/exps/ecapa_tdnn/train.py +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -22,7 +22,7 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.io.augment import waveform_augment diff --git a/paddlespeech/vector/io/dataset.py 
b/paddlespeech/vector/io/dataset.py index c9d56b5ea..1fa8b6b99 100644 --- a/paddlespeech/vector/io/dataset.py +++ b/paddlespeech/vector/io/dataset.py @@ -16,9 +16,9 @@ from dataclasses import fields from paddle.io import Dataset -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio +from paddleaudio.backends import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram +from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() diff --git a/paddlespeech/vector/io/dataset_from_json.py b/paddlespeech/vector/io/dataset_from_json.py index 32960e456..39b92af66 100644 --- a/paddlespeech/vector/io/dataset_from_json.py +++ b/paddlespeech/vector/io/dataset_from_json.py @@ -17,9 +17,9 @@ from dataclasses import fields from paddle.io import Dataset -from paddlespeech.audio.soundfile_backend import soundfile_load as load_audio -from paddlespeech.audio.compliance.librosa import melspectrogram -from paddlespeech.audio.compliance.librosa import mfcc +from paddleaudio.backends import soundfile_load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddleaudio.compliance.librosa import mfcc @dataclass diff --git a/tests/benchmark/audio/README.md b/tests/benchmark/audio/README.md deleted file mode 100644 index 9cade74e0..000000000 --- a/tests/benchmark/audio/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# 1. Prepare -First, install `pytest-benchmark` via pip. -```sh -pip install pytest-benchmark -``` - -# 2. Run -Run the specific script for profiling. -```sh -pytest melspectrogram.py -``` - -Result: -```sh -========================================================================== test session starts ========================================================================== -platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0 -benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) -plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0 -collected 4 items - -melspectrogram.py .... 
[100%] - - --------------------------------------------------------------------------------------------------- benchmark: 4 tests ------------------------------------------------------------------------------------------------- -Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ -test_melspect_gpu_torchaudio 202.0765 (1.0) 360.6230 (1.0) 218.1168 (1.0) 16.3022 (1.0) 214.2871 (1.0) 21.8451 (1.0) 40;3 4,584.7001 (1.0) 286 1 -test_melspect_gpu 657.8509 (3.26) 908.0470 (2.52) 724.2545 (3.32) 106.5771 (6.54) 669.9096 (3.13) 113.4719 (5.19) 1;0 1,380.7300 (0.30) 5 1 -test_melspect_cpu_torchaudio 1,247.6053 (6.17) 2,892.5799 (8.02) 1,443.2853 (6.62) 345.3732 (21.19) 1,262.7263 (5.89) 221.6385 (10.15) 56;53 692.8637 (0.15) 399 1 -test_melspect_cpu 20,326.2549 (100.59) 20,607.8682 (57.15) 20,473.4125 (93.86) 63.8654 (3.92) 20,467.0429 (95.51) 68.4294 (3.13) 8;1 48.8438 (0.01) 29 1 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ - -Legend: - Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. - OPS: Operations Per Second, computed as 1 / Mean -========================================================================== 4 passed in 21.12s =========================================================================== - -``` diff --git a/tests/benchmark/audio/log_melspectrogram.py b/tests/benchmark/audio/log_melspectrogram.py deleted file mode 100644 index c85fcecfb..000000000 --- a/tests/benchmark/audio/log_melspectrogram.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
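
The deleted benchmark scripts below still call the pre-patch loader. For reference, the replacement pattern this patch applies elsewhere (see the `autodecode.py`, `tariterators.py`, and `server/util.py` hunks above):

```python
# Before this patch:
#   import paddlespeech.audio
#   waveform, sr = paddlespeech.audio.load('zh.wav')
# After this patch:
import paddleaudio

waveform, sr = paddleaudio.backends.soundfile_load('zh.wav')
```
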
-import os -import urllib.request - -import librosa -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio - -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -if not os.path.isfile(os.path.basename(wav_url)): - urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) - -waveform, sr = paddlespeech.audio.load( - os.path.abspath(os.path.basename(wav_url))) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) -waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) - -# Feature conf -mel_conf = { - 'sr': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, -} - -mel_conf_torchaudio = { - 'sample_rate': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, - 'norm': 'slaney', - 'mel_scale': 'slaney', -} - - -def enable_cpu_device(): - paddle.set_device('cpu') - - -def enable_gpu_device(): - paddle.set_device('gpu') - - -log_mel_extractor = paddlespeech.audio.features.LogMelSpectrogram( - **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype) - - -def log_melspectrogram(): - return log_mel_extractor(waveform_tensor).squeeze(0) - - -def test_log_melspect_cpu(benchmark): - enable_cpu_device() - feature_audio = benchmark(log_melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_log_melspect_gpu(benchmark): - enable_gpu_device() - feature_audio = benchmark(log_melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=2) - - -mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( - **mel_conf_torchaudio, f_min=0.0) -amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db=80.0) - - -def melspectrogram_torchaudio(): - return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) - - -def log_melspectrogram_torchaudio(): - mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch) - return amplitude_to_DB(mel_specgram).squeeze(0) - - -def test_log_melspect_cpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB - - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') - waveform_tensor_torch = waveform_tensor_torch.to('cpu') - amplitude_to_DB = amplitude_to_DB.to('cpu') - - feature_audio = benchmark(log_melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_log_melspect_gpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB - - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') - waveform_tensor_torch = waveform_tensor_torch.to('cuda') - amplitude_to_DB = amplitude_to_DB.to('cuda') - - feature_torchaudio = benchmark(log_melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) - np.testing.assert_array_almost_equal( - feature_librosa, feature_torchaudio.cpu(), decimal=2) diff --git a/tests/benchmark/audio/melspectrogram.py b/tests/benchmark/audio/melspectrogram.py 
deleted file mode 100644 index 498158941..000000000 --- a/tests/benchmark/audio/melspectrogram.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import urllib.request - -import librosa -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio - -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -if not os.path.isfile(os.path.basename(wav_url)): - urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) - -waveform, sr = paddlespeech.audio.load( - os.path.abspath(os.path.basename(wav_url))) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) -waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) - -# Feature conf -mel_conf = { - 'sr': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, -} - -mel_conf_torchaudio = { - 'sample_rate': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, - 'norm': 'slaney', - 'mel_scale': 'slaney', -} - - -def enable_cpu_device(): - paddle.set_device('cpu') - - -def enable_gpu_device(): - paddle.set_device('gpu') - - -mel_extractor = paddlespeech.audio.features.MelSpectrogram( - **mel_conf, f_min=0.0, dtype=waveform_tensor.dtype) - - -def melspectrogram(): - return mel_extractor(waveform_tensor).squeeze(0) - - -def test_melspect_cpu(benchmark): - enable_cpu_device() - feature_audio = benchmark(melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_melspect_gpu(benchmark): - enable_gpu_device() - feature_audio = benchmark(melspectrogram) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( - **mel_conf_torchaudio, f_min=0.0) - - -def melspectrogram_torchaudio(): - return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) - - -def test_melspect_cpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') - waveform_tensor_torch = waveform_tensor_torch.to('cpu') - feature_audio = benchmark(melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_melspect_gpu_torchaudio(benchmark): - global waveform_tensor_torch, mel_extractor_torchaudio - mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') - waveform_tensor_torch = waveform_tensor_torch.to('cuda') - feature_torchaudio = benchmark(melspectrogram_torchaudio) - feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_torchaudio.cpu(), decimal=3) diff --git a/tests/benchmark/audio/mfcc.py 
b/tests/benchmark/audio/mfcc.py deleted file mode 100644 index 4e286de90..000000000 --- a/tests/benchmark/audio/mfcc.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import urllib.request - -import librosa -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio - -wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' -if not os.path.isfile(os.path.basename(wav_url)): - urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) - -waveform, sr = paddlespeech.audio.load( - os.path.abspath(os.path.basename(wav_url))) -waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) -waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) - -# Feature conf -mel_conf = { - 'sr': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, -} -mfcc_conf = { - 'n_mfcc': 20, - 'top_db': 80.0, -} -mfcc_conf.update(mel_conf) - -mel_conf_torchaudio = { - 'sample_rate': sr, - 'n_fft': 512, - 'hop_length': 128, - 'n_mels': 40, - 'norm': 'slaney', - 'mel_scale': 'slaney', -} -mfcc_conf_torchaudio = { - 'sample_rate': sr, - 'n_mfcc': 20, -} - - -def enable_cpu_device(): - paddle.set_device('cpu') - - -def enable_gpu_device(): - paddle.set_device('gpu') - - -mfcc_extractor = paddlespeech.audio.features.MFCC( - **mfcc_conf, f_min=0.0, dtype=waveform_tensor.dtype) - - -def mfcc(): - return mfcc_extractor(waveform_tensor).squeeze(0) - - -def test_mfcc_cpu(benchmark): - enable_cpu_device() - feature_audio = benchmark(mfcc) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_mfcc_gpu(benchmark): - enable_gpu_device() - feature_audio = benchmark(mfcc) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -del mel_conf_torchaudio['sample_rate'] -mfcc_extractor_torchaudio = torchaudio.transforms.MFCC( - **mfcc_conf_torchaudio, melkwargs=mel_conf_torchaudio) - - -def mfcc_torchaudio(): - return mfcc_extractor_torchaudio(waveform_tensor_torch).squeeze(0) - - -def test_mfcc_cpu_torchaudio(benchmark): - global waveform_tensor_torch, mfcc_extractor_torchaudio - - mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cpu') - waveform_tensor_torch = waveform_tensor_torch.to('cpu') - - feature_audio = benchmark(mfcc_torchaudio) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - feature_librosa, feature_audio, decimal=3) - - -def test_mfcc_gpu_torchaudio(benchmark): - global waveform_tensor_torch, mfcc_extractor_torchaudio - - mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cuda') - waveform_tensor_torch = waveform_tensor_torch.to('cuda') - - feature_torchaudio = benchmark(mfcc_torchaudio) - feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) - np.testing.assert_array_almost_equal( - 
feature_librosa, feature_torchaudio.cpu(), decimal=3) diff --git a/tests/unit/audio/backends/soundfile/__init__.py b/tests/unit/audio/backends/soundfile/__init__.py deleted file mode 100644 index 97043fd7b..000000000 --- a/tests/unit/audio/backends/soundfile/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/unit/audio/backends/soundfile/common.py b/tests/unit/audio/backends/soundfile/common.py deleted file mode 100644 index 7067b4a98..000000000 --- a/tests/unit/audio/backends/soundfile/common.py +++ /dev/null @@ -1,57 +0,0 @@ -import itertools -from unittest import skipIf - -from parameterized import parameterized -from paddlespeech.audio._internal.module_utils import is_module_available - - -def name_func(func, _, params): - return f'{func.__name__}_{"_".join(str(arg) for arg in params.args)}' - - -def dtype2subtype(dtype): - return { - "float64": "DOUBLE", - "float32": "FLOAT", - "int32": "PCM_32", - "int16": "PCM_16", - "uint8": "PCM_U8", - "int8": "PCM_S8", - }[dtype] - - -def skipIfFormatNotSupported(fmt): - fmts = [] - if is_module_available("soundfile"): - import soundfile - - fmts = soundfile.available_formats() - return skipIf(fmt not in fmts, f'"{fmt}" is not supported by soundfile') - return skipIf(True, '"soundfile" not available.') - - -def parameterize(*params): - return parameterized.expand(list(itertools.product(*params)), name_func=name_func) - - -def fetch_wav_subtype(dtype, encoding, bits_per_sample): - subtype = { - (None, None): dtype2subtype(dtype), - (None, 8): "PCM_U8", - ("PCM_U", None): "PCM_U8", - ("PCM_U", 8): "PCM_U8", - ("PCM_S", None): "PCM_32", - ("PCM_S", 16): "PCM_16", - ("PCM_S", 32): "PCM_32", - ("PCM_F", None): "FLOAT", - ("PCM_F", 32): "FLOAT", - ("PCM_F", 64): "DOUBLE", - ("ULAW", None): "ULAW", - ("ULAW", 8): "ULAW", - ("ALAW", None): "ALAW", - ("ALAW", 8): "ALAW", - }.get((encoding, bits_per_sample)) - if subtype: - return subtype - raise ValueError(f"wav does not support ({encoding}, {bits_per_sample}).") - diff --git a/tests/unit/audio/backends/soundfile/info_test.py b/tests/unit/audio/backends/soundfile/info_test.py deleted file mode 100644 index c94826858..000000000 --- a/tests/unit/audio/backends/soundfile/info_test.py +++ /dev/null @@ -1,199 +0,0 @@ -#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/info_test.py - -import tarfile -import warnings -import unittest -from unittest.mock import patch - -import paddle -from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.audio.backends import soundfile_backend -from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding -from tests.unit.common_utils import ( - get_wav_data, - nested_params, - save_wav, - TempDirMixin, -) - -from common import parameterize, skipIfFormatNotSupported - -import soundfile - - -class TestInfo(TempDirMixin, unittest.TestCase): - @parameterize( - 
["float32", "int32"], - [8000, 16000], - [1, 2], - ) - def test_wav(self, dtype, sample_rate, num_channels): - """`soundfile_backend.info` can check wav file correctly""" - duration = 1 - path = self.get_temp_path("data.wav") - data = get_wav_data(dtype, num_channels, normalize=False, num_frames=duration * sample_rate) - save_wav(path, data, sample_rate) - info = soundfile_backend.info(path) - assert info.sample_rate == sample_rate - assert info.num_frames == sample_rate * duration - assert info.num_channels == num_channels - assert info.bits_per_sample == get_bits_per_sample("wav", dtype) - assert info.encoding == get_encoding("wav", dtype) - - @parameterize([8000, 16000], [1, 2]) - @skipIfFormatNotSupported("FLAC") - def test_flac(self, sample_rate, num_channels): - """`soundfile_backend.info` can check flac file correctly""" - duration = 1 - num_frames = sample_rate * duration - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - - path = self.get_temp_path("data.flac") - soundfile.write(path, data, sample_rate) - - info = soundfile_backend.info(path) - assert info.sample_rate == sample_rate - assert info.num_frames == num_frames - assert info.num_channels == num_channels - assert info.bits_per_sample == 16 - assert info.encoding == "FLAC" - - #@parameterize([8000, 16000], [1, 2]) - #@skipIfFormatNotSupported("OGG") - #def test_ogg(self, sample_rate, num_channels): - #"""`soundfile_backend.info` can check ogg file correctly""" - #duration = 1 - #num_frames = sample_rate * duration - ##data = torch.randn(num_frames, num_channels).numpy() - #data = paddle.randn(shape=[num_frames, num_channels]).numpy() - #print(len(data)) - #path = self.get_temp_path("data.ogg") - #soundfile.write(path, data, sample_rate) - - #info = soundfile_backend.info(path) - #print(info) - #assert info.sample_rate == sample_rate - #print("info") - #print(info.num_frames) - #print("jiji") - #print(sample_rate*duration) - ##assert info.num_frames == sample_rate * duration - #assert info.num_channels == num_channels - #assert info.bits_per_sample == 0 - #assert info.encoding == "VORBIS" - - @nested_params( - [8000, 16000], - [1, 2], - [("PCM_24", 24), ("PCM_32", 32)], - ) - @skipIfFormatNotSupported("NIST") - def test_sphere(self, sample_rate, num_channels, subtype_and_bit_depth): - """`soundfile_backend.info` can check sph file correctly""" - duration = 1 - num_frames = sample_rate * duration - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - path = self.get_temp_path("data.nist") - subtype, bits_per_sample = subtype_and_bit_depth - soundfile.write(path, data, sample_rate, subtype=subtype) - - info = soundfile_backend.info(path) - assert info.sample_rate == sample_rate - assert info.num_frames == sample_rate * duration - assert info.num_channels == num_channels - assert info.bits_per_sample == bits_per_sample - assert info.encoding == "PCM_S" - - def test_unknown_subtype_warning(self): - """soundfile_backend.info issues a warning when the subtype is unknown - - This will happen if a new subtype is supported in SoundFile: the _SUBTYPE_TO_BITS_PER_SAMPLE - dict should be updated. 
- """ - - def _mock_info_func(_): - class MockSoundFileInfo: - samplerate = 8000 - frames = 356 - channels = 2 - subtype = "UNSEEN_SUBTYPE" - format = "UNKNOWN" - - return MockSoundFileInfo() - - with patch("soundfile.info", _mock_info_func): - with warnings.catch_warnings(record=True) as w: - info = soundfile_backend.info("foo") - assert len(w) == 1 - assert "UNSEEN_SUBTYPE subtype is unknown to PaddleAudio" in str(w[-1].message) - assert info.bits_per_sample == 0 - - -class TestFileObject(TempDirMixin, unittest.TestCase): - def _test_fileobj(self, ext, subtype, bits_per_sample): - """Query audio via file-like object works""" - duration = 2 - sample_rate = 16000 - num_channels = 2 - num_frames = sample_rate * duration - path = self.get_temp_path(f"test.{ext}") - - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - soundfile.write(path, data, sample_rate, subtype=subtype) - - with open(path, "rb") as fileobj: - info = soundfile_backend.info(fileobj) - assert info.sample_rate == sample_rate - assert info.num_frames == num_frames - assert info.num_channels == num_channels - assert info.bits_per_sample == bits_per_sample - assert info.encoding == "FLAC" if ext == "flac" else "PCM_S" - - def test_fileobj_wav(self): - """Loading audio via file-like object works""" - self._test_fileobj("wav", "PCM_16", 16) - - @skipIfFormatNotSupported("FLAC") - def test_fileobj_flac(self): - """Loading audio via file-like object works""" - self._test_fileobj("flac", "PCM_16", 16) - - def _test_tarobj(self, ext, subtype, bits_per_sample): - """Query compressed audio via file-like object works""" - duration = 2 - sample_rate = 16000 - num_channels = 2 - num_frames = sample_rate * duration - audio_file = f"test.{ext}" - audio_path = self.get_temp_path(audio_file) - archive_path = self.get_temp_path("archive.tar.gz") - - #data = torch.randn(num_frames, num_channels).numpy() - data = paddle.randn(shape=[num_frames, num_channels]).numpy() - soundfile.write(audio_path, data, sample_rate, subtype=subtype) - - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(audio_path, arcname=audio_file) - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - info = soundfile_backend.info(fileobj) - assert info.sample_rate == sample_rate - assert info.num_frames == num_frames - assert info.num_channels == num_channels - assert info.bits_per_sample == bits_per_sample - assert info.encoding == "FLAC" if ext == "flac" else "PCM_S" - - def test_tarobj_wav(self): - """Query compressed audio via file-like object works""" - self._test_tarobj("wav", "PCM_16", 16) - - @skipIfFormatNotSupported("FLAC") - def test_tarobj_flac(self): - """Query compressed audio via file-like object works""" - self._test_tarobj("flac", "PCM_16", 16) - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/soundfile/load_test.py b/tests/unit/audio/backends/soundfile/load_test.py deleted file mode 100644 index 626009382..000000000 --- a/tests/unit/audio/backends/soundfile/load_test.py +++ /dev/null @@ -1,369 +0,0 @@ -#this code is from: https://github.com/pytorch/audio/blob/main/test/torchaudio_unittest/backend/soundfile/load_test.py - -import os -import tarfile -import unittest -from unittest.mock import patch -import numpy as np - -from parameterized import parameterized -import paddle -from paddlespeech.audio._internal import module_utils as _mod_utils -from 
paddlespeech.audio.backends import soundfile_backend -from tests.unit.audio.backends.common import get_bits_per_sample, get_encoding -from tests.unit.common_utils import ( - get_wav_data, - load_wav, - nested_params, - normalize_wav, - save_wav, - TempDirMixin, -) - -from common import dtype2subtype, parameterize, skipIfFormatNotSupported - -import soundfile - - -def _get_mock_path( - ext: str, - dtype: str, - sample_rate: int, - num_channels: int, - num_frames: int, -): - return f"{dtype}_{sample_rate}_{num_channels}_{num_frames}.{ext}" - - -def _get_mock_params(path: str): - filename, ext = path.split(".") - parts = filename.split("_") - return { - "ext": ext, - "dtype": parts[0], - "sample_rate": int(parts[1]), - "num_channels": int(parts[2]), - "num_frames": int(parts[3]), - } - - -class SoundFileMock: - def __init__(self, path, mode): - assert mode == "r" - self.path = path - self._params = _get_mock_params(path) - self._start = None - - @property - def samplerate(self): - return self._params["sample_rate"] - - @property - def format(self): - if self._params["ext"] == "wav": - return "WAV" - if self._params["ext"] == "flac": - return "FLAC" - if self._params["ext"] == "ogg": - return "OGG" - if self._params["ext"] in ["sph", "nis", "nist"]: - return "NIST" - - @property - def subtype(self): - if self._params["ext"] == "ogg": - return "VORBIS" - return dtype2subtype(self._params["dtype"]) - - def _prepare_read(self, start, stop, frames): - assert stop is None - self._start = start - return frames - - def read(self, frames, dtype, always_2d): - assert always_2d - data = get_wav_data( - dtype, - self._params["num_channels"], - normalize=False, - num_frames=self._params["num_frames"], - channels_first=False, - ).numpy() - return data[self._start : self._start + frames] - - def __enter__(self): - return self - - def __exit__(self, *args, **kwargs): - pass - - -class MockedLoadTest(unittest.TestCase): - def assert_dtype(self, ext, dtype, sample_rate, num_channels, normalize, channels_first): - """When format is WAV or NIST, normalize=False will return the native dtype Tensor, otherwise float32""" - num_frames = 3 * sample_rate - path = _get_mock_path(ext, dtype, sample_rate, num_channels, num_frames) - expected_dtype = paddle.float32 if normalize or ext not in ["wav", "nist"] else getattr(paddle, dtype) - with patch("soundfile.SoundFile", SoundFileMock): - found, sr = soundfile_backend.load(path, normalize=normalize, channels_first=channels_first) - assert found.dtype == expected_dtype - assert sample_rate == sr - - @parameterize( - ["int32", "float32", "float64"], - [8000, 16000], - [1, 2], - [True, False], - [True, False], - ) - def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first): - """Returns native dtype when normalize=False else float32""" - self.assert_dtype("wav", dtype, sample_rate, num_channels, normalize, channels_first) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - [True, False], - [True, False], - ) - def test_sphere(self, dtype, sample_rate, num_channels, normalize, channels_first): - """Returns float32 always""" - self.assert_dtype("sph", dtype, sample_rate, num_channels, normalize, channels_first) - - @parameterize([8000, 16000], [1, 2], [True, False], [True, False]) - def test_ogg(self, sample_rate, num_channels, normalize, channels_first): - """Returns float32 always""" - self.assert_dtype("ogg", "int16", sample_rate, num_channels, normalize, channels_first) - - @parameterize([8000, 16000], [1, 2], [True, False], [True, False]) - 
def test_flac(self, sample_rate, num_channels, normalize, channels_first): - """`soundfile_backend.load` can load ogg format.""" - self.assert_dtype("flac", "int16", sample_rate, num_channels, normalize, channels_first) - - -class LoadTestBase(TempDirMixin, unittest.TestCase): - def assert_wav( - self, - dtype, - sample_rate, - num_channels, - normalize, - channels_first=True, - duration=1, - ): - """`soundfile_backend.load` can load wav format correctly. - - Wav data loaded with soundfile backend should match those with scipy - """ - path = self.get_temp_path("reference.wav") - num_frames = duration * sample_rate - data = get_wav_data( - dtype, - num_channels, - normalize=normalize, - num_frames=num_frames, - channels_first=channels_first, - ) - save_wav(path, data, sample_rate, channels_first=channels_first) - expected = load_wav(path, normalize=normalize, channels_first=channels_first)[0] - data, sr = soundfile_backend.load(path, normalize=normalize, channels_first=channels_first) - assert sr == sample_rate - np.testing.assert_array_almost_equal(data.numpy(), expected.numpy()) - - def assert_sphere( - self, - dtype, - sample_rate, - num_channels, - channels_first=True, - duration=1, - ): - """`soundfile_backend.load` can load SPHERE format correctly.""" - path = self.get_temp_path("reference.sph") - num_frames = duration * sample_rate - raw = get_wav_data( - dtype, - num_channels, - num_frames=num_frames, - normalize=False, - channels_first=False, - ) - soundfile.write(path, raw, sample_rate, subtype=dtype2subtype(dtype), format="NIST") - expected = normalize_wav(raw.t() if channels_first else raw) - data, sr = soundfile_backend.load(path, channels_first=channels_first) - assert sr == sample_rate - #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(data.numpy(), expected.numpy()) - - def assert_flac( - self, - dtype, - sample_rate, - num_channels, - channels_first=True, - duration=1, - ): - """`soundfile_backend.load` can load FLAC format correctly.""" - path = self.get_temp_path("reference.flac") - num_frames = duration * sample_rate - raw = get_wav_data( - dtype, - num_channels, - num_frames=num_frames, - normalize=False, - channels_first=False, - ) - soundfile.write(path, raw, sample_rate) - expected = normalize_wav(raw.t() if channels_first else raw) - data, sr = soundfile_backend.load(path, channels_first=channels_first) - assert sr == sample_rate - #self.assertEqual(data, expected, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(data.numpy(), expected.numpy()) - - - -class TestLoad(LoadTestBase): - """Test the correctness of `soundfile_backend.load` for various formats""" - - @parameterize( - ["float32", "int32"], - [8000, 16000], - [1, 2], - [False, True], - [False, True], - ) - def test_wav(self, dtype, sample_rate, num_channels, normalize, channels_first): - """`soundfile_backend.load` can load wav format correctly.""" - self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first) - - @parameterize( - ["int32"], - [16000], - [2], - [False], - ) - def test_wav_large(self, dtype, sample_rate, num_channels, normalize): - """`soundfile_backend.load` can load large wav file correctly.""" - two_hours = 2 * 60 * 60 - self.assert_wav(dtype, sample_rate, num_channels, normalize, duration=two_hours) - - @parameterize(["float32", "int32"], [4, 8, 16, 32], [False, True]) - def test_multiple_channels(self, dtype, num_channels, channels_first): - """`soundfile_backend.load` can load wav file with more than 2 
channels.""" - sample_rate = 8000 - normalize = False - self.assert_wav(dtype, sample_rate, num_channels, normalize, channels_first) - - #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True]) - #@skipIfFormatNotSupported("NIST") - #def test_sphere(self, dtype, sample_rate, num_channels, channels_first): - #"""`soundfile_backend.load` can load sphere format correctly.""" - #self.assert_sphere(dtype, sample_rate, num_channels, channels_first) - - #@parameterize(["int32"], [8000, 16000], [1, 2], [False, True]) - #@skipIfFormatNotSupported("FLAC") - #def test_flac(self, dtype, sample_rate, num_channels, channels_first): - #"""`soundfile_backend.load` can load flac format correctly.""" - #self.assert_flac(dtype, sample_rate, num_channels, channels_first) - - -class TestLoadFormat(TempDirMixin, unittest.TestCase): - """Given `format` parameter, `so.load` can load files without extension""" - - original = None - path = None - - def _make_file(self, format_): - sample_rate = 8000 - path_with_ext = self.get_temp_path(f"test.{format_}") - data = get_wav_data("float32", num_channels=2).numpy().T - soundfile.write(path_with_ext, data, sample_rate) - expected = soundfile.read(path_with_ext, dtype="float32")[0].T - path = os.path.splitext(path_with_ext)[0] - os.rename(path_with_ext, path) - return path, expected - - def _test_format(self, format_): - """Providing format allows to read file without extension""" - path, expected = self._make_file(format_) - found, _ = soundfile_backend.load(path) - #self.assertEqual(found, expected) - np.testing.assert_array_almost_equal(found, expected) - - @parameterized.expand( - [ - ("WAV",), - ("wav",), - ] - ) - def test_wav(self, format_): - self._test_format(format_) - - @parameterized.expand( - [ - ("FLAC",), - ("flac",), - ] - ) - @skipIfFormatNotSupported("FLAC") - def test_flac(self, format_): - self._test_format(format_) - - -class TestFileObject(TempDirMixin, unittest.TestCase): - def _test_fileobj(self, ext): - """Loading audio via file-like object works""" - sample_rate = 16000 - path = self.get_temp_path(f"test.{ext}") - - data = get_wav_data("float32", num_channels=2).numpy().T - soundfile.write(path, data, sample_rate) - expected = soundfile.read(path, dtype="float32")[0].T - - with open(path, "rb") as fileobj: - found, sr = soundfile_backend.load(fileobj) - assert sr == sample_rate - #self.assertEqual(expected, found) - np.testing.assert_array_almost_equal(found, expected) - - def test_fileobj_wav(self): - """Loading audio via file-like object works""" - self._test_fileobj("wav") - - def test_fileobj_flac(self): - """Loading audio via file-like object works""" - self._test_fileobj("flac") - - def _test_tarfile(self, ext): - """Loading audio via file-like object works""" - sample_rate = 16000 - audio_file = f"test.{ext}" - audio_path = self.get_temp_path(audio_file) - archive_path = self.get_temp_path("archive.tar.gz") - - data = get_wav_data("float32", num_channels=2).numpy().T - soundfile.write(audio_path, data, sample_rate) - expected = soundfile.read(audio_path, dtype="float32")[0].T - - with tarfile.TarFile(archive_path, "w") as tarobj: - tarobj.add(audio_path, arcname=audio_file) - with tarfile.TarFile(archive_path, "r") as tarobj: - fileobj = tarobj.extractfile(audio_file) - found, sr = soundfile_backend.load(fileobj) - - assert sr == sample_rate - #self.assertEqual(expected, found) - np.testing.assert_array_almost_equal(found.numpy(), expected) - - - def test_tarfile_wav(self): - """Loading audio via file-like object works""" - 
self._test_tarfile("wav") - - def test_tarfile_flac(self): - """Loading audio via file-like object works""" - self._test_tarfile("flac") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/soundfile/save_test.py b/tests/unit/audio/backends/soundfile/save_test.py deleted file mode 100644 index 9139d84cd..000000000 --- a/tests/unit/audio/backends/soundfile/save_test.py +++ /dev/null @@ -1,322 +0,0 @@ -import io -import unittest -from unittest.mock import patch - -from paddlespeech.audio._internal import module_utils as _mod_utils -from paddlespeech.audio.backends import soundfile_backend -from tests.unit.common_utils import ( - get_wav_data, - load_wav, - nested_params, - normalize_wav, - save_wav, - TempDirMixin, -) - -from common import fetch_wav_subtype, parameterize, skipIfFormatNotSupported - -import paddle -import numpy as np - -import soundfile - - -class MockedSaveTest(unittest.TestCase): - @nested_params( - ["float32", "int32"], - [8000, 16000], - [1, 2], - [False, True], - [ - (None, None), - ("PCM_U", None), - ("PCM_U", 8), - ("PCM_S", None), - ("PCM_S", 16), - ("PCM_S", 32), - ("PCM_F", None), - ("PCM_F", 32), - ("PCM_F", 64), - ("ULAW", None), - ("ULAW", 8), - ("ALAW", None), - ("ALAW", 8), - ], - ) - @patch("soundfile.write") - def test_wav(self, dtype, sample_rate, num_channels, channels_first, enc_params, mocked_write): - """soundfile_backend.save passes correct subtype to soundfile.write when WAV""" - filepath = "foo.wav" - input_tensor = get_wav_data( - dtype, - num_channels, - num_frames=3 * sample_rate, - normalize=dtype == "float32", - channels_first=channels_first, - ) - input_tensor = paddle.transpose(input_tensor, [1, 0]) - - encoding, bits_per_sample = enc_params - soundfile_backend.save( - filepath, - input_tensor, - sample_rate, - channels_first=channels_first, - encoding=encoding, - bits_per_sample=bits_per_sample, - ) - - # on +Py3.8 call_args.kwargs is more descreptive - args = mocked_write.call_args[1] - assert args["file"] == filepath - assert args["samplerate"] == sample_rate - assert args["subtype"] == fetch_wav_subtype(dtype, encoding, bits_per_sample) - assert args["format"] is None - tensor_result = paddle.transpose(input_tensor, [1, 0]) if channels_first else input_tensor - #self.assertEqual(args["data"], tensor_result.numpy()) - np.testing.assert_array_almost_equal(args["data"].numpy(), tensor_result.numpy()) - - - - @patch("soundfile.write") - def assert_non_wav( - self, - fmt, - dtype, - sample_rate, - num_channels, - channels_first, - mocked_write, - encoding=None, - bits_per_sample=None, - ): - """soundfile_backend.save passes correct subtype and format to soundfile.write when SPHERE""" - filepath = f"foo.{fmt}" - input_tensor = get_wav_data( - dtype, - num_channels, - num_frames=3 * sample_rate, - normalize=False, - channels_first=channels_first, - ) - input_tensor = paddle.transpose(input_tensor, [1, 0]) - - expected_data = paddle.transpose(input_tensor, [1, 0]) if channels_first else input_tensor - - soundfile_backend.save( - filepath, - input_tensor, - sample_rate, - channels_first, - encoding=encoding, - bits_per_sample=bits_per_sample, - ) - - # on +Py3.8 call_args.kwargs is more descreptive - args = mocked_write.call_args[1] - assert args["file"] == filepath - assert args["samplerate"] == sample_rate - if fmt in ["sph", "nist", "nis"]: - assert args["format"] == "NIST" - else: - assert args["format"] is None - np.testing.assert_array_almost_equal(args["data"].numpy(), 
expected_data.numpy()) - #self.assertEqual(args["data"], expected_data) - - @nested_params( - ["sph", "nist", "nis"], - ["int32"], - [8000, 16000], - [1, 2], - [False, True], - [ - ("PCM_S", 8), - ("PCM_S", 16), - ("PCM_S", 24), - ("PCM_S", 32), - ("ULAW", 8), - ("ALAW", 8), - ("ALAW", 16), - ("ALAW", 24), - ("ALAW", 32), - ], - ) - def test_sph(self, fmt, dtype, sample_rate, num_channels, channels_first, enc_params): - """soundfile_backend.save passes default format and subtype (None-s) to - soundfile.write when not WAV""" - encoding, bits_per_sample = enc_params - self.assert_non_wav( - fmt, dtype, sample_rate, num_channels, channels_first, encoding=encoding, bits_per_sample=bits_per_sample - ) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - [False, True], - [8, 16, 24], - ) - def test_flac(self, dtype, sample_rate, num_channels, channels_first, bits_per_sample): - """soundfile_backend.save passes default format and subtype (None-s) to - soundfile.write when not WAV""" - self.assert_non_wav("flac", dtype, sample_rate, num_channels, channels_first, bits_per_sample=bits_per_sample) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - [False, True], - ) - def test_ogg(self, dtype, sample_rate, num_channels, channels_first): - """soundfile_backend.save passes default format and subtype (None-s) to - soundfile.write when not WAV""" - self.assert_non_wav("ogg", dtype, sample_rate, num_channels, channels_first) - - -class SaveTestBase(TempDirMixin, unittest.TestCase): - def assert_wav(self, dtype, sample_rate, num_channels, num_frames): - """`soundfile_backend.save` can save wav format.""" - path = self.get_temp_path("data.wav") - expected = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False) - soundfile_backend.save(path, expected, sample_rate) - found, sr = load_wav(path, normalize=False) - assert sample_rate == sr - #self.assertEqual(found, expected) - np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) - - def _assert_non_wav(self, fmt, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save non-wav format. - - Due to precision missmatch, and the lack of alternative way to decode the - resulting files without using soundfile, only meta data are validated. - """ - num_frames = sample_rate * 3 - path = self.get_temp_path(f"data.{fmt}") - expected = get_wav_data(dtype, num_channels, num_frames=num_frames, normalize=False) - soundfile_backend.save(path, expected, sample_rate) - sinfo = soundfile.info(path) - assert sinfo.format == fmt.upper() - #assert sinfo.frames == num_frames this go wrong - assert sinfo.channels == num_channels - assert sinfo.samplerate == sample_rate - - def assert_flac(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save flac format.""" - self._assert_non_wav("flac", dtype, sample_rate, num_channels) - - def assert_sphere(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save sph format.""" - self._assert_non_wav("nist", dtype, sample_rate, num_channels) - - def assert_ogg(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save ogg format. - - As we cannot inspect the OGG format (it's lossy), we only check the metadata. 
- """ - self._assert_non_wav("ogg", dtype, sample_rate, num_channels) - - -class TestSave(SaveTestBase): - @parameterize( - ["float32", "int32"], - [8000, 16000], - [1, 2], - ) - def test_wav(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save wav format.""" - self.assert_wav(dtype, sample_rate, num_channels, num_frames=None) - - @parameterize( - ["float32", "int32"], - [4, 8, 16, 32], - ) - def test_multiple_channels(self, dtype, num_channels): - """`soundfile_backend.save` can save wav with more than 2 channels.""" - sample_rate = 8000 - self.assert_wav(dtype, sample_rate, num_channels, num_frames=None) - - @parameterize( - ["int32"], - [8000, 16000], - [1, 2], - ) - @skipIfFormatNotSupported("NIST") - def test_sphere(self, dtype, sample_rate, num_channels): - """`soundfile_backend.save` can save sph format.""" - self.assert_sphere(dtype, sample_rate, num_channels) - - @parameterize( - [8000, 16000], - [1, 2], - ) - @skipIfFormatNotSupported("FLAC") - def test_flac(self, sample_rate, num_channels): - """`soundfile_backend.save` can save flac format.""" - self.assert_flac("float32", sample_rate, num_channels) - - @parameterize( - [8000, 16000], - [1, 2], - ) - @skipIfFormatNotSupported("OGG") - def test_ogg(self, sample_rate, num_channels): - """`soundfile_backend.save` can save ogg/vorbis format.""" - self.assert_ogg("float32", sample_rate, num_channels) - - -class TestSaveParams(TempDirMixin, unittest.TestCase): - """Test the correctness of optional parameters of `soundfile_backend.save`""" - - @parameterize([True, False]) - def test_channels_first(self, channels_first): - """channels_first swaps axes""" - path = self.get_temp_path("data.wav") - data = get_wav_data("int32", 2, channels_first=channels_first) - soundfile_backend.save(path, data, 8000, channels_first=channels_first) - found = load_wav(path)[0] - expected = data if channels_first else data.transpose([1, 0]) - #self.assertEqual(found, expected, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(found.numpy(), expected.numpy()) - - -class TestFileObject(TempDirMixin, unittest.TestCase): - def _test_fileobj(self, ext): - """Saving audio to file-like object works""" - sample_rate = 16000 - path = self.get_temp_path(f"test.{ext}") - - subtype = "FLOAT" if ext == "wav" else None - data = get_wav_data("float32", num_channels=2) - soundfile.write(path, data.numpy().T, sample_rate, subtype=subtype) - expected = soundfile.read(path, dtype="float32")[0] - - fileobj = io.BytesIO() - soundfile_backend.save(fileobj, data, sample_rate, format=ext) - fileobj.seek(0) - found, sr = soundfile.read(fileobj, dtype="float32") - - assert sr == sample_rate - #self.assertEqual(expected, found, atol=1e-4, rtol=1e-8) - np.testing.assert_array_almost_equal(found, expected) - - def test_fileobj_wav(self): - """Saving audio via file-like object works""" - self._test_fileobj("wav") - - @skipIfFormatNotSupported("FLAC") - def test_fileobj_flac(self): - """Saving audio via file-like object works""" - self._test_fileobj("flac") - - @skipIfFormatNotSupported("NIST") - def test_fileobj_nist(self): - """Saving audio via file-like object works""" - self._test_fileobj("NIST") - - @skipIfFormatNotSupported("OGG") - def test_fileobj_ogg(self): - """Saving audio via file-like object works""" - self._test_fileobj("OGG") - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/tests/unit/audio/backends/soundfile/test_io.py b/tests/unit/audio/backends/soundfile/test_io.py deleted file mode 100644 
index 26276751f..000000000 --- a/tests/unit/audio/backends/soundfile/test_io.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import filecmp -import os -import unittest - -import numpy as np -import soundfile as sf - -import paddlespeech.audio -from ..base import BackendTest - - -class TestIO(BackendTest): - def test_load_mono_channel(self): - sf_data, sf_sr = sf.read(self.files[0]) - pa_data, pa_sr = paddlespeech.audio.load( - self.files[0], normal=False, dtype='float64') - - self.assertEqual(sf_data.dtype, pa_data.dtype) - self.assertEqual(sf_sr, pa_sr) - np.testing.assert_array_almost_equal(sf_data, pa_data) - - def test_load_multi_channels(self): - sf_data, sf_sr = sf.read(self.files[1]) - sf_data = sf_data.T # Channel dim first - pa_data, pa_sr = paddlespeech.audio.load( - self.files[1], mono=False, normal=False, dtype='float64') - - self.assertEqual(sf_data.dtype, pa_data.dtype) - self.assertEqual(sf_sr, pa_sr) - np.testing.assert_array_almost_equal(sf_data, pa_data) - - def test_save_mono_channel(self): - waveform, sr = np.random.randint( - low=-32768, high=32768, size=(48000), dtype=np.int16), 16000 - sf_tmp_file = 'sf_tmp.wav' - pa_tmp_file = 'pa_tmp.wav' - - sf.write(sf_tmp_file, waveform, sr) - paddlespeech.audio.save(waveform, sr, pa_tmp_file) - - self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) - for file in [sf_tmp_file, pa_tmp_file]: - os.remove(file) - - def test_save_multi_channels(self): - waveform, sr = np.random.randint( - low=-32768, high=32768, size=(2, 48000), dtype=np.int16), 16000 - sf_tmp_file = 'sf_tmp.wav' - pa_tmp_file = 'pa_tmp.wav' - - sf.write(sf_tmp_file, waveform.T, sr) - paddlespeech.audio.save(waveform.T, sr, pa_tmp_file) - - self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) - for file in [sf_tmp_file, pa_tmp_file]: - os.remove(file) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/audio/features/base.py b/tests/unit/audio/features/base.py index 4049b6102..614fce28c 100644 --- a/tests/unit/audio/features/base.py +++ b/tests/unit/audio/features/base.py @@ -18,7 +18,7 @@ import urllib.request import numpy as np import paddle -from paddlespeech.audio.soundfile_backend import soundfile_load as load +from paddleaudio.backends import soundfile_load as load wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' diff --git a/tests/unit/audio/features/test_istft.py b/tests/unit/audio/features/test_istft.py index f1e6e4e33..23371200b 100644 --- a/tests/unit/audio/features/test_istft.py +++ b/tests/unit/audio/features/test_istft.py @@ -17,7 +17,7 @@ import numpy as np import paddle from .base import FeatTest -from paddlespeech.audio.functional.window import get_window +from paddleaudio.functional.window import get_window from paddlespeech.s2t.transform.spectrogram import IStft from paddlespeech.s2t.transform.spectrogram import Stft diff --git a/tests/unit/audio/features/test_kaldi.py b/tests/unit/audio/features/test_kaldi.py 
deleted file mode 100644 index 2b0ece890..000000000 --- a/tests/unit/audio/features/test_kaldi.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import unittest - -import numpy as np -import paddle -import torch -import torchaudio - -import paddlespeech.audio -from .base import FeatTest - - -class TestKaldi(FeatTest): - def initParmas(self): - self.window_size = 1024 - self.dtype = 'float32' - - def test_window(self): - t_hann_window = torch.hann_window( - self.window_size, periodic=False, dtype=eval(f'torch.{self.dtype}')) - t_hamm_window = torch.hamming_window( - self.window_size, - periodic=False, - alpha=0.54, - beta=0.46, - dtype=eval(f'torch.{self.dtype}')) - t_povey_window = torch.hann_window( - self.window_size, periodic=False, - dtype=eval(f'torch.{self.dtype}')).pow(0.85) - - p_hann_window = paddlespeech.audio.functional.window.get_window( - 'hann', - self.window_size, - fftbins=False, - dtype=eval(f'paddle.{self.dtype}')) - p_hamm_window = paddlespeech.audio.functional.window.get_window( - 'hamming', - self.window_size, - fftbins=False, - dtype=eval(f'paddle.{self.dtype}')) - p_povey_window = paddlespeech.audio.functional.window.get_window( - 'hann', - self.window_size, - fftbins=False, - dtype=eval(f'paddle.{self.dtype}')).pow(0.85) - - np.testing.assert_array_almost_equal(t_hann_window, p_hann_window) - np.testing.assert_array_almost_equal(t_hamm_window, p_hamm_window) - np.testing.assert_array_almost_equal(t_povey_window, p_povey_window) - - def test_fbank(self): - ta_features = torchaudio.compliance.kaldi.fbank( - torch.from_numpy(self.waveform.astype(self.dtype))) - pa_features = paddlespeech.audio.compliance.kaldi.fbank( - paddle.to_tensor(self.waveform.astype(self.dtype))) - np.testing.assert_array_almost_equal( - ta_features, pa_features, decimal=4) - - def test_mfcc(self): - ta_features = torchaudio.compliance.kaldi.mfcc( - torch.from_numpy(self.waveform.astype(self.dtype))) - pa_features = paddlespeech.audio.compliance.kaldi.mfcc( - paddle.to_tensor(self.waveform.astype(self.dtype))) - np.testing.assert_array_almost_equal( - ta_features, pa_features, decimal=4) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/audio/features/test_librosa.py b/tests/unit/audio/features/test_librosa.py deleted file mode 100644 index ffdec3e78..000000000 --- a/tests/unit/audio/features/test_librosa.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import unittest - -import librosa -import numpy as np -import paddle - -import paddlespeech.audio -from .base import FeatTest -from paddlespeech.audio.functional.window import get_window - - -class TestLibrosa(FeatTest): - def initParmas(self): - self.n_fft = 512 - self.hop_length = 128 - self.n_mels = 40 - self.n_mfcc = 20 - self.fmin = 0.0 - self.window_str = 'hann' - self.pad_mode = 'reflect' - self.top_db = 80.0 - - def test_stft(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - feature_librosa = librosa.core.stft( - y=self.waveform, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=self.window_str, - center=True, - dtype=None, - pad_mode=self.pad_mode, ) - x = paddle.to_tensor(self.waveform).unsqueeze(0) - window = get_window(self.window_str, self.n_fft, dtype=x.dtype) - feature_paddle = paddle.signal.stft( - x=x, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=window, - center=True, - pad_mode=self.pad_mode, - normalized=False, - onesided=True, ).squeeze(0) - - np.testing.assert_array_almost_equal( - feature_librosa, feature_paddle, decimal=5) - - def test_istft(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # Get stft result from librosa. - stft_matrix = librosa.core.stft( - y=self.waveform, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=self.window_str, - center=True, - pad_mode=self.pad_mode, ) - - feature_librosa = librosa.core.istft( - stft_matrix=stft_matrix, - hop_length=self.hop_length, - win_length=None, - window=self.window_str, - center=True, - dtype=None, - length=None, ) - - x = paddle.to_tensor(stft_matrix).unsqueeze(0) - window = get_window( - self.window_str, - self.n_fft, - dtype=paddle.to_tensor(self.waveform).dtype) - feature_paddle = paddle.signal.istft( - x=x, - n_fft=self.n_fft, - hop_length=self.hop_length, - win_length=None, - window=window, - center=True, - normalized=False, - onesided=True, - length=None, - return_complex=False, ).squeeze(0) - - np.testing.assert_array_almost_equal( - feature_librosa, feature_paddle, decimal=5) - - def test_mel(self): - feature_librosa = librosa.filters.mel( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=None, - htk=False, - norm='slaney', - dtype=self.waveform.dtype, ) - feature_compliance = paddlespeech.audio.compliance.librosa.compute_fbank_matrix( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - fmin=self.fmin, - fmax=None, - htk=False, - norm='slaney', - dtype=self.waveform.dtype, ) - x = paddle.to_tensor(self.waveform) - feature_functional = paddlespeech.audio.functional.compute_fbank_matrix( - sr=self.sr, - n_fft=self.n_fft, - n_mels=self.n_mels, - f_min=self.fmin, - f_max=None, - htk=False, - norm='slaney', - dtype=x.dtype, ) - - np.testing.assert_array_almost_equal(feature_librosa, - feature_compliance) - np.testing.assert_array_almost_equal(feature_librosa, - feature_functional) - - def test_melspect(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # librosa: - feature_librosa = librosa.feature.melspectrogram( - y=self.waveform, - sr=self.sr, - n_fft=self.n_fft, - 
hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddlespeech.audio.compliance.librosa: - feature_compliance = paddlespeech.audio.compliance.librosa.melspectrogram( - x=self.waveform, - sr=self.sr, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin, - to_db=False) - - # paddlespeech.audio.features.layer - x = paddle.to_tensor( - self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. - feature_extractor = paddlespeech.audio.features.MelSpectrogram( - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - dtype=x.dtype) - feature_layer = feature_extractor(x).squeeze(0).numpy() - - np.testing.assert_array_almost_equal( - feature_librosa, feature_compliance, decimal=5) - np.testing.assert_array_almost_equal( - feature_librosa, feature_layer, decimal=5) - - def test_log_melspect(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # librosa: - feature_librosa = librosa.feature.melspectrogram( - y=self.waveform, - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) - - # paddlespeech.audio.compliance.librosa: - feature_compliance = paddlespeech.audio.compliance.librosa.melspectrogram( - x=self.waveform, - sr=self.sr, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddlespeech.audio.features.layer - x = paddle.to_tensor( - self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. - feature_extractor = paddlespeech.audio.features.LogMelSpectrogram( - sr=self.sr, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - dtype=x.dtype) - feature_layer = feature_extractor(x).squeeze(0).numpy() - - np.testing.assert_array_almost_equal( - feature_librosa, feature_compliance, decimal=5) - np.testing.assert_array_almost_equal( - feature_librosa, feature_layer, decimal=4) - - def test_mfcc(self): - if len(self.waveform.shape) == 2: # (C, T) - self.waveform = self.waveform.squeeze( - 0) # 1D input for librosa.feature.melspectrogram - - # librosa: - feature_librosa = librosa.feature.mfcc( - y=self.waveform, - sr=self.sr, - S=None, - n_mfcc=self.n_mfcc, - dct_type=2, - norm='ortho', - lifter=0, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin) - - # paddlespeech.audio.compliance.librosa: - feature_compliance = paddlespeech.audio.compliance.librosa.mfcc( - x=self.waveform, - sr=self.sr, - n_mfcc=self.n_mfcc, - dct_type=2, - norm='ortho', - lifter=0, - window_size=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - fmin=self.fmin, - top_db=self.top_db) - - # paddlespeech.audio.features.layer - x = paddle.to_tensor( - self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. 
- feature_extractor = paddlespeech.audio.features.MFCC( - sr=self.sr, - n_mfcc=self.n_mfcc, - n_fft=self.n_fft, - hop_length=self.hop_length, - n_mels=self.n_mels, - f_min=self.fmin, - top_db=self.top_db, - dtype=x.dtype) - feature_layer = feature_extractor(x).squeeze(0).numpy() - - np.testing.assert_array_almost_equal( - feature_librosa, feature_compliance, decimal=4) - np.testing.assert_array_almost_equal( - feature_librosa, feature_layer, decimal=4) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/audio/features/test_log_melspectrogram.py b/tests/unit/audio/features/test_log_melspectrogram.py index 59eb73e8c..0c38de22c 100644 --- a/tests/unit/audio/features/test_log_melspectrogram.py +++ b/tests/unit/audio/features/test_log_melspectrogram.py @@ -16,7 +16,7 @@ import unittest import numpy as np import paddle -import paddlespeech.audio +import paddleaudio from .base import FeatTest from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram @@ -33,7 +33,7 @@ class TestLogMelSpectrogram(FeatTest): ps_res = ps_melspect(self.waveform.T).squeeze(1).T x = paddle.to_tensor(self.waveform) - ps_melspect = paddlespeech.audio.features.LogMelSpectrogram( + ps_melspect = paddleaudio.features.LogMelSpectrogram( self.sr, self.n_fft, self.hop_length, diff --git a/tests/unit/audio/features/test_spectrogram.py b/tests/unit/audio/features/test_spectrogram.py index 7d908a7ef..50b21403b 100644 --- a/tests/unit/audio/features/test_spectrogram.py +++ b/tests/unit/audio/features/test_spectrogram.py @@ -16,7 +16,7 @@ import unittest import numpy as np import paddle -import paddlespeech.audio +import paddleaudio from .base import FeatTest from paddlespeech.s2t.transform.spectrogram import Spectrogram @@ -31,7 +31,7 @@ class TestSpectrogram(FeatTest): ps_res = ps_spect(self.waveform.T).squeeze(1).T # Magnitude x = paddle.to_tensor(self.waveform) - pa_spect = paddlespeech.audio.features.Spectrogram( + pa_spect = paddleaudio.features.Spectrogram( self.n_fft, self.hop_length, power=1.0) pa_res = pa_spect(x).squeeze(0).numpy() diff --git a/tests/unit/audio/features/test_stft.py b/tests/unit/audio/features/test_stft.py index 03448ca80..c64b5ebe6 100644 --- a/tests/unit/audio/features/test_stft.py +++ b/tests/unit/audio/features/test_stft.py @@ -17,7 +17,7 @@ import numpy as np import paddle from .base import FeatTest -from paddlespeech.audio.functional.window import get_window +from paddleaudio.functional.window import get_window from paddlespeech.s2t.transform.spectrogram import Stft
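
The retained feature tests above now import from `paddleaudio` instead of `paddlespeech.audio`. A minimal sketch of the post-patch usage, assuming the `paddleaudio` entry points shown in the hunks (`backends.soundfile_load`, `functional.window.get_window`, `features.Spectrogram`, `features.LogMelSpectrogram`) keep the signatures of their `paddlespeech.audio` counterparts:

```python
# Sketch only: paddleaudio signatures are assumed to match the old
# paddlespeech.audio ones exercised by the tests in this patch.
import paddle
import paddleaudio
from paddleaudio.backends import soundfile_load as load
from paddleaudio.functional.window import get_window

# was: paddlespeech.audio.load(...)
waveform, sr = load('zh.wav')
x = paddle.to_tensor(waveform).unsqueeze(0)  # add batch dim, as in the tests

# was: paddlespeech.audio.features.Spectrogram / LogMelSpectrogram
spect = paddleaudio.features.Spectrogram(512, 128, power=1.0)
log_melspect = paddleaudio.features.LogMelSpectrogram(sr, 512, 128)

# was: paddlespeech.audio.functional.window.get_window
window = get_window('hann', 512, dtype=x.dtype)

spec = spect(x).squeeze(0)
log_mel = log_melspect(x).squeeze(0)
```

The `n_fft=512`, `hop_length=128` values are the ones used throughout the deleted benchmark scripts.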
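
Each deleted benchmark file follows the same pytest-benchmark shape: a module-level extractor, a zero-argument wrapper, and one test per device. A condensed sketch under the new package; `paddleaudio.features.MelSpectrogram` is an assumption here, mirroring the removed `paddlespeech.audio.features.MelSpectrogram`:

```python
# Condensed from the deleted melspectrogram.py; MelSpectrogram under
# paddleaudio.features is assumed, not shown in the hunks.
import librosa
import numpy as np
import paddle
import paddleaudio
from paddleaudio.backends import soundfile_load as load

waveform, sr = load('zh.wav')
x = paddle.to_tensor(waveform).unsqueeze(0)
mel_conf = {'sr': sr, 'n_fft': 512, 'hop_length': 128, 'n_mels': 40}
extractor = paddleaudio.features.MelSpectrogram(**mel_conf, f_min=0.0, dtype=x.dtype)


def melspectrogram():
    return extractor(x).squeeze(0)


def test_melspect_cpu(benchmark):
    paddle.set_device('cpu')
    # benchmark() calls the wrapper repeatedly and times it; the OPS column
    # in the report above is computed as 1 / Mean.
    feature = benchmark(melspectrogram)
    expected = librosa.feature.melspectrogram(waveform, **mel_conf)
    np.testing.assert_array_almost_equal(expected, feature, decimal=3)
```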
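
The removed soundfile backend tests reduce to save/info/load round-trips against the `soundfile` package. A minimal self-contained round-trip in their style, using the pre-patch `paddlespeech.audio.backends.soundfile_backend` path recorded in the deletions:

```python
# Minimal round-trip in the style of the deleted info_test.py; the module
# path is the pre-patch one recorded in the removed files.
import os
import tempfile
import unittest

import paddle
import soundfile

from paddlespeech.audio.backends import soundfile_backend


class TestInfoRoundTrip(unittest.TestCase):
    def test_flac_info(self):
        sample_rate, num_channels = 8000, 2
        with tempfile.TemporaryDirectory() as tmpdir:
            path = os.path.join(tmpdir, 'data.flac')
            # One second of random audio, written as FLAC (default PCM_16).
            data = paddle.randn(shape=[sample_rate, num_channels]).numpy()
            soundfile.write(path, data, sample_rate)

            info = soundfile_backend.info(path)
            assert info.sample_rate == sample_rate
            assert info.num_channels == num_channels
            assert info.bits_per_sample == 16
            assert info.encoding == 'FLAC'


if __name__ == '__main__':
    unittest.main()
```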