Merge remote-tracking branch 'upstream/develop' into develop

pull/969/head
TianYuan 3 years ago
commit 4d6f1646d4

.gitignore vendored

@ -11,6 +11,10 @@
*.npz
*.done
*.whl
*.egg-info
build
docs/build/
tools/venv
tools/kenlm
@ -20,5 +24,7 @@ tools/montreal-forced-aligner/
tools/Montreal-Forced-Aligner/
tools/sctk
tools/sctk-20159b5/
tools/kaldi
tools/OpenBLAS/
*output/

@ -39,12 +39,30 @@ pull_request_rules:
actions:
label:
remove: ["conflicts"]
- name: "auto add label=enhancement"
- name: "auto add label=S2T"
conditions:
- files~=^deepspeech/
- files~=^paddlespeech/s2t/
actions:
label:
add: ["enhancement"]
add: ["S2T"]
- name: "auto add label=T2S"
conditions:
- files~=^paddlespeech/t2s/
actions:
label:
add: ["T2S"]
- name: "auto add label=Audio"
conditions:
- files~=^paddleaudio/
actions:
label:
add: ["Audio"]
- name: "auto add label=TextProcess"
conditions:
- files~=^paddlespeech/text/
actions:
label:
add: ["TextProcess"]
- name: "auto add label=Example"
conditions:
- files~=^examples/

@ -7,7 +7,7 @@ version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/src/conf.py
configuration: docs/source/conf.py
# Build documentation with MkDocs
#mkdocs:
@ -20,11 +20,6 @@ formats: []
python:
version: 3.7
install:
- method: pip
path: .
extra_requirements:
- doc
- requirements: docs/requirements.txt

@ -1,37 +0,0 @@
#!/bin/bash
setup_env(){
cd tools && make && cd -
}
install(){
if [ -f "setup.sh" ]; then
bash setup.sh
#export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
fi
if [ $? != 0 ]; then
exit 1
fi
}
print_env(){
cat /etc/lsb-release
gcc -v
g++ -v
}
abort(){
echo "Run install failed" 1>&2
echo "Please check your code" 1>&2
exit 1
}
trap 'abort' 0
set -e
print_env
setup_env
source tools/venv/bin/activate
install
trap : 0

@ -1,23 +0,0 @@
#!/bin/bash
function abort(){
echo "Your commit not fit PaddlePaddle code style" 1>&2
echo "Please use pre-commit scripts to auto-format your code" 1>&2
exit 1
}
trap 'abort' 0
set -e
source tools/venv/bin/activate
python3 --version
if ! pre-commit run -a ; then
ls -lh
git diff --exit-code
exit 1
fi
trap : 0

@ -1,54 +0,0 @@
#!/bin/bash
abort(){
echo "Run unittest failed" 1>&2
echo "Please check your code" 1>&2
exit 1
}
unittest(){
cd $1 > /dev/null
if [ -f "setup.sh" ]; then
bash setup.sh
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
fi
if [ $? != 0 ]; then
exit 1
fi
find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
xargs -0 -I{} -n1 bash -c \
'python3 -m unittest discover -v -s {}'
cd - > /dev/null
}
coverage(){
cd $1 > /dev/null
if [ -f "setup.sh" ]; then
bash setup.sh
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
fi
if [ $? != 0 ]; then
exit 1
fi
find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
xargs -0 -I{} -n1 bash -c \
'python3 -m coverage run --branch {}'
python3 -m coverage report -m
python3 -m coverage html
cd - > /dev/null
}
trap 'abort' 0
set -e
source tools/venv/bin/activate
#pip3 install pytest
#unittest .
coverage .
trap : 0

@ -10,7 +10,7 @@ English | [简体中文](README_ch.md)
<h3>
<a href="#quick-start"> Quick Start </a>
| <a href="#tutorials"> Tutorials </a>
| <a href="#model-list"> Models List </a>
| <a href="#models-list"> Models List </a>
</div>
------------------------------------------------------------------------------------
@ -57,7 +57,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
</table>
</div>
##### Text-To-Speech
<div align = "center">
<table style="width:100%">
@ -119,7 +119,7 @@ If you want to set up PaddleSpeech in other environment, please see the [install
Developers can have a try of our model with only a few lines of code.
A tiny DeepSpeech2 *Speech-To-Text* model training on toy set of LibriSpeech:
A tiny DeepSpeech2 **Speech-To-Text** model training on a toy set of LibriSpeech:
```shell
cd examples/tiny/s0/
@ -131,10 +131,7 @@ bash local/data.sh
bash local/test.sh conf/deepspeech2.yaml ckptfile offline
```
For *Text-To-Speech*, try FastSpeech2 on LJSpeech:
- Download LJSpeech-1.1 from the [ljspeech official website](https://keithito.com/LJ-Speech-Dataset/), our prepared durations for fastspeech2 [ljspeech_alignment](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz).
- The pretrained models are seperated into two parts: [fastspeech2_nosil_ljspeech_ckpt](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) and [pwg_ljspeech_ckpt](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip). Please download then unzip to `./model/fastspeech2` and `./model/pwg` respectively.
- Assume your path to the dataset is `~/datasets/LJSpeech-1.1` and `./ljspeech_alignment` accordingly, preprocess your data and then use our pretrained model to synthesize:
For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
```shell
cd examples/csmsc/tts3
# download the pretrained models and unzip them
@ -161,7 +158,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
```
If you want to try more functions like training and tuning, please see [Speech-To-Text getting started](./docs/source/asr/getting_started.md) and [Text-To-Speech Basic Use](./docs/source/tts/basic_usage.md).
If you want to try more functions like training and tuning, please see [Speech-To-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-To-Speech Quick Start](./docs/source/tts/quick_start.md).
# Models List
@ -240,7 +237,7 @@ PaddleSpeech Text-To-Speech mainly contains three modules: *Text Frontend*, *Aco
<td> Text Frontend</td>
<td colspan="2"> &emsp; </td>
<td>
<a href = "./examples/other/text_frontend">chinese-fronted</a>
<a href = "./examples/other/tn">tn</a> / <a href = "./examples/other/g2p">g2p</a>
</td>
</tr>
<tr>
@ -283,7 +280,7 @@ PaddleSpeech Text-To-Speech mainly contains three modules: *Text Frontend*, *Aco
<td >Parallel WaveGAN</td>
<td >LJSpeech / VCTK / CSMSC</td>
<td>
<a href = "./examples/ljspeech/voc1">PWGAN-ljspeech</a> / <a href = "./examples/vctk/voc1">PWGAN-vctk</a> / <a href = "./examples/csmsc/voc1">PWGAN-csmsc</a>
</td>
</tr>
<tr>
@ -305,11 +302,10 @@ PaddleSpeech Text-To-Speech mainly contains three modules: *Text Frontend*, *Aco
</tbody>
</table>
# Tutorials
Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. To focus on the tasks in PaddleSpeech, you will find the following guidelines helpful for grasping the core ideas.
- [Overview](./docs/source/introduction.md)
- Quick Start
- [Dependencies](./docs/source/dependencies.md) and [Installation](./docs/source/install.md)
- [Quick Start of Speech-To-Text](./docs/source/asr/quick_start.md)
@ -327,8 +323,12 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an ove
- [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) and [PaddleSpeech VS. Espnet](https://paddlespeech.readthedocs.io/en/latest/tts/demo_2.html)
- [Released Models](./docs/source/released_model.md)
# License and Acknowledgement
The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components.
## FAQ and Contributing
You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/DeepSpeech/issues)! Also, we highly appreciate it if you would like to contribute to this project!
# License and Acknowledgement
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.

audio/.gitignore vendored

@ -0,0 +1,7 @@
.ipynb_checkpoints/**
*.ipynb
nohup.out
__pycache__/
*.wav
*.m4a
obsolete/**

@ -0,0 +1,45 @@
repos:
- repo: local
hooks:
- id: yapf
name: yapf
entry: yapf
language: system
args: [-i, --style .style.yapf]
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.8.0
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]
- id: isort
name: isort (pyi)
types: [pyi]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
args:
- --count
- --select=E9,F63,F7,F82
- --show-source
- --statistics
files: \.py$

@ -1,4 +1,3 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

@ -0,0 +1,37 @@
# PaddleAudio: The audio library for PaddlePaddle
## Introduction
PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomaly sound detection. More models and features are on the roadmap.
## Features
- Spectrogram and related features are compatible with librosa.
- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
- Ready-to-use audio embeddings with a single line of code; more embedding types are on the roadmap.
- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
## Install
```
git clone https://github.com/PaddlePaddle/models
cd models/PaddleAudio
pip install .
```
## Quick start
### Audio loading and feature extraction
```python
import paddleaudio as pa

s, r = pa.load(f)                      # f: path to a local audio file
mel_spect = pa.melspectrogram(s, sr=r)
```
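The returned objects are plain NumPy arrays, so standard NumPy post-processing applies directly. Below is a minimal follow-up sketch (it assumes `mel_spect` holds power values, as produced by the snippet above, and uses NumPy only):
```python
import numpy as np

print(s.dtype, r)            # waveform dtype and sample rate
print(mel_spect.shape)       # feature layout (mel bins x frames, or transposed, depending on settings)
log_mel = 10.0 * np.log10(np.maximum(mel_spect, 1e-10))  # convert the power spectrogram to dB
```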
### Examples
We provide a set of examples to help you get started with PaddleAudio quickly.
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
Please refer to [example directory](./examples) for more details.

@ -0,0 +1,128 @@
# Audio Tagging
Sound classification is a single-label classification task, but a given audio clip can carry multiple labels. For example, a recording made in an ordinary indoor office may contain people speaking, keyboard typing, mouse clicking, and various other background sounds. For general-purpose sound recognition and sound detection, predicting multiple labels for a single audio clip is therefore highly practical.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips taken from YouTube videos. It now contains about 2.1 million annotated videos, 5,800 hours of audio, and 527 labeled sound classes.
`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Because their pretraining task is multi-label sound recognition, they can be used for real-time audio tagging.
This example uses a `PANNs` pretrained model to tag the input audio in real time against the Audioset label set, and outputs, as text, the top-k classes and their scores at each time step.
## Model Introduction
PaddleAudio provides the PANNs pretrained models CNN14, CNN10 and CNN6 for users to choose from:
- CNN14: 12 convolutional layers and 2 fully connected layers; 79.6M parameters; embedding size 2048.
- CNN10: 8 convolutional layers and 2 fully connected layers; 4.9M parameters; embedding size 512.
- CNN6: 4 convolutional layers and 2 fully connected layers; 4.5M parameters; embedding size 512.
## Quick Start
### Model Inference
```shell
export CUDA_VISIBLE_DEVICES=0
python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
```
Configurable arguments:
- `device`: Device used for inference, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the `gpus` argument selects the card id.
- `wav`: Audio file to run inference on.
- `sample_duration`: Length (in seconds) of each audio window the model predicts on; defaults to 2 s (see the sketch below).
- `hop_duration`: Time interval (in seconds) between two consecutive prediction windows; defaults to 0.3 s (see the sketch below).
- `output_dir`: Directory where the prediction results are stored; defaults to `./output_dir`.
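To make the two durations concrete, the sketch below shows how they translate into sample counts and window start positions; it mirrors the `split()` helper in `audio_tag.py` shown later in this diff. The sample rate and clip length are assumptions for illustration only.
```python
sr = 32000                  # assumed sample rate of the input clip
win_size = int(2.0 * sr)    # --sample_duration: 2 s window, in samples
hop_size = int(0.3 * sr)    # --hop_duration: 0.3 s hop between windows, in samples

num_samples = 10 * sr       # e.g. a 10-second clip
# Window start positions as a fraction of the clip length; this is exactly the
# "time" value written to the .npz and .txt results.
starts = [i / num_samples for i in range(0, num_samples, hop_size)]
```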
The example code uses the `CNN14` pretrained model. To switch to another pretrained model, use one of the following:
```python
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
model = cnn14(pretrained=True, extract_embedding=False)
# CNN10
model = cnn10(pretrained=True, extract_embedding=False)
# CNN6
model = cnn6(pretrained=True, extract_embedding=False)
```
Sample output:
```
[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
```
The resulting scores are saved in an `.npz` file under `output_dir`.
### Generating the tagging label text
```shell
python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
```
Configurable arguments:
- `tagging_file`: The tagging result file (`.npz`) produced by the previous step.
- `top_k`: Keep the k highest-scoring labels of each prediction; defaults to 10.
- `smooth`: Whether to apply posterior smoothing to the predictions; defaults to `True`.
- `smooth_size`: Number of samples (window size) used for the smoothing; defaults to 5.
- `label_file`: Text file with the Audioset class names corresponding to the predictions.
- `output_dir`: Directory where the label text is stored; defaults to `./output_dir`.
Sample output:
```
[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
```
The resulting text is saved in a `.txt` file under `output_dir`.
## Tagging label text
The final text output looks as follows.
The top-k results for each time window of the sample are separated by blank lines. Within each block, the first line carries the time information: the number is the ratio of the window start time `t` to the total audio length `T` (converted back to seconds in the sketch after the sample output); the following k lines are the corresponding labels and scores.
```
0.0
Cat: 0.9144676923751831
Animal: 0.8855036497116089
Domestic animals, pets: 0.804577112197876
Meow: 0.7422927021980286
Music: 0.19959309697151184
Inside, small room: 0.12550437450408936
Caterwaul: 0.021584441885352135
Purr: 0.020247288048267365
Speech: 0.018197158351540565
Vehicle: 0.007446660194545984
0.059197544398158296
Cat: 0.9250872135162354
Animal: 0.8957151174545288
Domestic animals, pets: 0.8228275775909424
Meow: 0.7650775909423828
Music: 0.20210561156272888
Inside, small room: 0.12290887534618378
Caterwaul: 0.029371455311775208
Purr: 0.018731823191046715
Speech: 0.017130598425865173
Vehicle: 0.007748497650027275
0.11839508879631659
Cat: 0.9336574673652649
Animal: 0.9111202359199524
Domestic animals, pets: 0.8349071145057678
Meow: 0.7761964797973633
Music: 0.20467285811901093
Inside, small room: 0.10709915310144424
Caterwaul: 0.05370649695396423
Purr: 0.018830426037311554
Speech: 0.017361722886562347
Vehicle: 0.006929398979991674
...
...
```
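Since the leading number of each block is the ratio `t`/`T`, converting it back to seconds only requires the clip duration. A minimal sketch (the file name and the ratio value are taken from the example above):
```python
import soundfile as sf

info = sf.info('cat.wav')                              # any means of getting the clip duration works
total_seconds = info.frames / info.samplerate
start_seconds = 0.11839508879631659 * total_seconds    # start time of the third block above
```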
The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label predictions made on the audio in real time.
![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)

@ -0,0 +1,527 @@
Speech
Male speech, man speaking
Female speech, woman speaking
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Battle cry
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
Baby cry, infant cry
Whimper
Wail, moan
Sigh
Singing
Choir
Yodeling
Chant
Mantra
Male singing
Female singing
Child singing
Synthetic singing
Rapping
Humming
Groan
Grunt
Whistling
Breathing
Wheeze
Snoring
Gasp
Pant
Snort
Cough
Throat clearing
Sneeze
Sniff
Run
Shuffle
Walk, footsteps
Chewing, mastication
Biting
Gargling
Stomach rumble
Burping, eructation
Hiccup
Fart
Hands
Finger snapping
Clapping
Heart sounds, heartbeat
Heart murmur
Cheering
Applause
Chatter
Crowd
Hubbub, speech noise, speech babble
Children playing
Animal
Domestic animals, pets
Dog
Bark
Yip
Howl
Bow-wow
Growling
Whimper (dog)
Cat
Purr
Meow
Hiss
Caterwaul
Livestock, farm animals, working animals
Horse
Clip-clop
Neigh, whinny
Cattle, bovinae
Moo
Cowbell
Pig
Oink
Goat
Bleat
Sheep
Fowl
Chicken, rooster
Cluck
Crowing, cock-a-doodle-doo
Turkey
Gobble
Duck
Quack
Goose
Honk
Wild animals
Roaring cats (lions, tigers)
Roar
Bird
Bird vocalization, bird call, bird song
Chirp, tweet
Squawk
Pigeon, dove
Coo
Crow
Caw
Owl
Hoot
Bird flight, flapping wings
Canidae, dogs, wolves
Rodents, rats, mice
Mouse
Patter
Insect
Cricket
Mosquito
Fly, housefly
Buzz
Bee, wasp, etc.
Frog
Croak
Snake
Rattle
Whale vocalization
Music
Musical instrument
Plucked string instrument
Guitar
Electric guitar
Bass guitar
Acoustic guitar
Steel guitar, slide guitar
Tapping (guitar technique)
Strum
Banjo
Sitar
Mandolin
Zither
Ukulele
Keyboard (musical)
Piano
Electric piano
Organ
Electronic organ
Hammond organ
Synthesizer
Sampler
Harpsichord
Percussion
Drum kit
Drum machine
Drum
Snare drum
Rimshot
Drum roll
Bass drum
Timpani
Tabla
Cymbal
Hi-hat
Wood block
Tambourine
Rattle (instrument)
Maraca
Gong
Tubular bells
Mallet percussion
Marimba, xylophone
Glockenspiel
Vibraphone
Steelpan
Orchestra
Brass instrument
French horn
Trumpet
Trombone
Bowed string instrument
String section
Violin, fiddle
Pizzicato
Cello
Double bass
Wind instrument, woodwind instrument
Flute
Saxophone
Clarinet
Harp
Bell
Church bell
Jingle bell
Bicycle bell
Tuning fork
Chime
Wind chime
Change ringing (campanology)
Harmonica
Accordion
Bagpipes
Didgeridoo
Shofar
Theremin
Singing bowl
Scratching (performance technique)
Pop music
Hip hop music
Beatboxing
Rock music
Heavy metal
Punk rock
Grunge
Progressive rock
Rock and roll
Psychedelic rock
Rhythm and blues
Soul music
Reggae
Country
Swing music
Bluegrass
Funk
Folk music
Middle Eastern music
Jazz
Disco
Classical music
Opera
Electronic music
House music
Techno
Dubstep
Drum and bass
Electronica
Electronic dance music
Ambient music
Trance music
Music of Latin America
Salsa music
Flamenco
Blues
Music for children
New-age music
Vocal music
A capella
Music of Africa
Afrobeat
Christian music
Gospel music
Music of Asia
Carnatic music
Music of Bollywood
Ska
Traditional music
Independent music
Song
Background music
Theme music
Jingle (music)
Soundtrack music
Lullaby
Video game music
Christmas music
Dance music
Wedding music
Happy music
Funny music
Sad music
Tender music
Exciting music
Angry music
Scary music
Wind
Rustling leaves
Wind noise (microphone)
Thunderstorm
Thunder
Water
Rain
Raindrop
Rain on surface
Stream
Waterfall
Ocean
Waves, surf
Steam
Gurgling
Fire
Crackle
Vehicle
Boat, Water vehicle
Sailboat, sailing ship
Rowboat, canoe, kayak
Motorboat, speedboat
Ship
Motor vehicle (road)
Car
Vehicle horn, car horn, honking
Toot
Car alarm
Power windows, electric windows
Skidding
Tire squeal
Car passing by
Race car, auto racing
Truck
Air brake
Air horn, truck horn
Reversing beeps
Ice cream truck, ice cream van
Bus
Emergency vehicle
Police car (siren)
Ambulance (siren)
Fire engine, fire truck (siren)
Motorcycle
Traffic noise, roadway noise
Rail transport
Train
Train whistle
Train horn
Railroad car, train wagon
Train wheels squealing
Subway, metro, underground
Aircraft
Aircraft engine
Jet engine
Propeller, airscrew
Helicopter
Fixed-wing aircraft, airplane
Bicycle
Skateboard
Engine
Light engine (high frequency)
Dental drill, dentist's drill
Lawn mower
Chainsaw
Medium engine (mid frequency)
Heavy engine (low frequency)
Engine knocking
Engine starting
Idling
Accelerating, revving, vroom
Door
Doorbell
Ding-dong
Sliding door
Slam
Knock
Tap
Squeak
Cupboard open or close
Drawer open or close
Dishes, pots, and pans
Cutlery, silverware
Chopping (food)
Frying (food)
Microwave oven
Blender
Water tap, faucet
Sink (filling or washing)
Bathtub (filling or washing)
Hair dryer
Toilet flush
Toothbrush
Electric toothbrush
Vacuum cleaner
Zipper (clothing)
Keys jangling
Coin (dropping)
Scissors
Electric shaver, electric razor
Shuffling cards
Typing
Typewriter
Computer keyboard
Writing
Alarm
Telephone
Telephone bell ringing
Ringtone
Telephone dialing, DTMF
Dial tone
Busy signal
Alarm clock
Siren
Civil defense siren
Buzzer
Smoke detector, smoke alarm
Fire alarm
Foghorn
Whistle
Steam whistle
Mechanisms
Ratchet, pawl
Clock
Tick
Tick-tock
Gears
Pulleys
Sewing machine
Mechanical fan
Air conditioning
Cash register
Printer
Camera
Single-lens reflex camera
Tools
Hammer
Jackhammer
Sawing
Filing (rasp)
Sanding
Power tool
Drill
Explosion
Gunshot, gunfire
Machine gun
Fusillade
Artillery fire
Cap gun
Fireworks
Firecracker
Burst, pop
Eruption
Boom
Wood
Chop
Splinter
Crack
Glass
Chink, clink
Shatter
Liquid
Splash, splatter
Slosh
Squish
Drip
Pour
Trickle, dribble
Gush
Fill (with liquid)
Spray
Pump (liquid)
Stir
Boiling
Sonar
Arrow
Whoosh, swoosh, swish
Thump, thud
Thunk
Electronic tuner
Effects unit
Chorus effect
Basketball bounce
Bang
Slap, smack
Whack, thwack
Smash, crash
Breaking
Bouncing
Whip
Flap
Scratch
Scrape
Rub
Roll
Crushing
Crumpling, crinkling
Tearing
Beep, bleep
Ping
Ding
Clang
Squeal
Creak
Rustle
Whir
Clatter
Sizzle
Clicking
Clickety-clack
Rumble
Plop
Jingle, tinkle
Hum
Zing
Boing
Crunch
Silence
Sine wave
Harmonic
Chirp tone
Sound effect
Pulse
Inside, small room
Inside, large room or hall
Inside, public space
Outside, urban or manmade
Outside, rural or natural
Reverberation
Echo
Noise
Environmental noise
Static
Mains hum
Distortion
Sidetone
Cacophony
White noise
Pink noise
Throbbing
Vibration
Television
Radio
Field recording

@ -0,0 +1,111 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
def split(waveform: np.ndarray, win_size: int, hop_size: int):
"""
Split into N waveforms.
N is decided by win_size and hop_size.
"""
assert isinstance(waveform, np.ndarray)
time = []
data = []
for i in range(0, len(waveform), hop_size):
segment = waveform[i:i + win_size]
if len(segment) < win_size:
segment = np.pad(segment, (0, win_size - len(segment)))
data.append(segment)
time.append(i / len(waveform))
return time, data
def batchify(data: List[List[float]],
sample_rate: int,
batch_size: int,
**kwargs):
"""
Extract features from waveforms and create batches.
"""
examples = []
for waveform in data:
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
# Separates data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield one_batch
one_batch = []
if one_batch:
yield one_batch
def predict(model, data: List[List[float]], sample_rate: int,
batch_size: int=1):
"""
Use pretrained model to make predictions.
"""
batches = batchify(data, sample_rate, batch_size)
results = None
model.eval()
for batch in batches:
# (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
feats = paddle.to_tensor(batch).unsqueeze(1)
audioset_scores = model(feats)
if results is None:
results = audioset_scores.numpy()
else:
results = np.concatenate((results, audioset_scores.numpy()))
return results
if __name__ == '__main__':
paddle.set_device(args.device)
model = cnn14(pretrained=True, extract_embedding=False)
waveform, sr = load_audio(args.wav, sr=None)
time, data = split(waveform,
int(args.sample_duration * sr),
int(args.hop_duration * sr))
results = predict(model, data, sr, batch_size=8)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
np.savez(output_file, time=time, scores=results)
logger.info(f'Saved tagging results to {output_file}')

@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import Dict
import numpy as np
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--tagging_file', type=str, required=True, help='Tagging result file (*.npz) produced by audio_tag.py.')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
def smooth(results: np.ndarray, win_size: int):
"""
Execute posterior smoothing in-place.
"""
for i in range(len(results) - 1, -1, -1):
if i < win_size - 1:
left = 0
else:
left = i + 1 - win_size
results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
"""
Return top k result.
"""
result = np.asarray(result)
topk_idx = (-result).argsort()[:k]
ret = ''
for idx in topk_idx:
label, score = label_map[idx], result[idx]
ret += f'{label}: {score}\n'
return ret
if __name__ == "__main__":
label_map = {}
with open(args.label_file, 'r') as f:
for i, l in enumerate(f.readlines()):
label_map[i] = l.strip()
results = np.load(args.tagging_file, allow_pickle=True)
times, scores = results['time'], results['scores']
if args.smooth:
logger.info('Posterior smoothing...')
smooth(scores, win_size=args.smooth_size)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(
args.output_dir,
os.path.basename(args.tagging_file).split('.')[0] + '.txt')
with open(output_file, 'w') as f:
for time, score in zip(times, scores):
f.write(f'{time}\n')
f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
logger.info(f'Saved tagging labels to {output_file}')

@ -0,0 +1,116 @@
# Sound Classification
Sound classification and detection is an active research direction in audio algorithms.
For sound classification, a common traditional machine-learning approach is to hand-craft a variety of time-domain and frequency-domain audio features, apply feature selection, combination and transformation, and then classify with an SVM or a decision tree. End-to-end deep learning instead uses deep networks such as RNNs and CNNs to perform representation learning and classification directly on the waveform or on time-frequency features.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips taken from YouTube videos. It now contains about 2.1 million annotated videos, 5,800 hours of audio, and 527 labeled sound classes.
`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. After pretraining, the models can be used to extract audio embeddings. This example fine-tunes a `PANNs` pretrained model for sound classification (a minimal sketch of the idea follows the model list below).
## Model Introduction
PaddleAudio provides the PANNs pretrained models CNN14, CNN10 and CNN6 for users to choose from:
- CNN14: 12 convolutional layers and 2 fully connected layers; 79.6M parameters; embedding size 2048.
- CNN10: 8 convolutional layers and 2 fully connected layers; 4.9M parameters; embedding size 512.
- CNN6: 4 convolutional layers and 2 fully connected layers; 4.5M parameters; embedding size 512.
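Conceptually, fine-tuning just attaches a small classification head to the PANNs embedding; this is what the `SoundClassifier` wrapper shown later in this diff does. A minimal sketch (the class count is ESC-50's 50 classes):
```python
import paddle.nn as nn
from paddleaudio.models.panns import cnn14

backbone = cnn14(pretrained=True, extract_embedding=True)  # CNN14 emits a 2048-dim embedding
head = nn.Linear(backbone.emb_size, 50)                     # ESC-50 has 50 classes
```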
## Quick Start
### Model Training
Using the environmental sound classification dataset `ESC50` as an example, the commands below fine-tune the model on its training set; single-GPU and multi-GPU training on a single machine are both supported. For how to launch multi-GPU training with `paddle.distributed.launch`, see [single-machine multi-GPU training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html).
Single-GPU training:
```shell
$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10
```
Multi-GPU training:
```shell
$ unset CUDA_VISIBLE_DEVICES
$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
```
Configurable arguments:
- `device`: Device used for training, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the `gpus` argument selects the card ids.
- `epochs`: Number of training epochs; defaults to 50.
- `learning_rate`: Learning rate for fine-tuning; defaults to 5e-5.
- `batch_size`: Batch size; adjust it to your GPU memory and lower it if you run out of memory; defaults to 16.
- `num_workers`: Number of subprocesses the DataLoader uses to fetch data; defaults to 0, i.e. data loading runs in the main process.
- `checkpoint_dir`: Directory where model and optimizer checkpoints are saved; defaults to `./checkpoint`.
- `save_freq`: How often (in epochs) checkpoints are saved during training; defaults to 10.
- `log_freq`: How often (in steps) training information is printed; defaults to 10.
The example code uses the `CNN14` pretrained model. To switch to another pretrained model, use one of the following:
```python
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
# CNN10
backbone = cnn10(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
# CNN6
backbone = cnn6(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
```
### Model Inference
```shell
python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams
```
Configurable arguments:
- `device`: Device used for inference, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the `gpus` argument selects the card id.
- `wav`: Audio file to run inference on.
- `top_k`: Show the scores of the top-k predicted labels; defaults to 1.
- `checkpoint`: Model parameter checkpoint file.
The prediction output looks as follows:
```
[/audio/dog.wav]
Dog: 0.9999538660049438
Clock tick: 1.3341237718123011e-05
Cat: 6.579841738130199e-06
```
### Model Deployment
#### 1. Dynamic-to-static export
After training, the saved dynamic-graph parameters can be exported as a static-graph model plus parameters, which can then be deployed in static-graph mode.
```shell
python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
```
Configurable arguments:
- `checkpoint`: Model parameter checkpoint file.
- `output_dir`: Directory where the exported static-graph model and parameter files are saved.
The exported static-graph model and parameter files are:
```sh
$ tree export
export
├── inference.pdiparams
├── inference.pdiparams.info
└── inference.pdmodel
```
#### 2. Deployment and inference
The `deploy/python/predict.py` script uses APIs from the `paddle.inference` module and provides an example of Python-side deployment:
```sh
python deploy/python/predict.py --model_dir ./export --device gpu
```

@ -0,0 +1,146 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
from paddle import inference
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
args = parser.parse_args()
# yapf: enable
def extract_features(files: str, **kwargs):
waveforms = []
srs = []
max_length = float('-inf')
for file in files:
waveform, sr = load_audio(file, sr=None)
max_length = max(max_length, len(waveform))
waveforms.append(waveform)
srs.append(sr)
feats = []
for i in range(len(waveforms)):
# padding
if len(waveforms[i]) < max_length:
pad_width = max_length - len(waveforms[i])
waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))
feat = melspectrogram(waveforms[i], sr, **kwargs).transpose()
feats.append(feat)
return np.stack(feats, axis=0)
class Predictor(object):
def __init__(self,
model_dir,
device="gpu",
batch_size=1,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False):
self.batch_size = batch_size
model_file = os.path.join(model_dir, "inference.pdmodel")
params_file = os.path.join(model_dir, "inference.pdiparams")
assert os.path.isfile(model_file) and os.path.isfile(
params_file), 'Please check model and parameter files.'
config = inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initializing the GPU memory and enabling TensorRT
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
}
precision_mode = precision_map[precision]
if use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size,
min_subgraph_size=30,
precision_mode=precision_mode)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = inference.create_predictor(config)
self.input_handles = [
self.predictor.get_input_handle(name)
for name in self.predictor.get_input_names()
]
self.output_handle = self.predictor.get_output_handle(
self.predictor.get_output_names()[0])
def predict(self, wavs):
feats = extract_features(wavs)
self.input_handles[0].copy_from_cpu(feats)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
probs = softmax(logits, axis=1)
indices = np.argmax(probs, axis=1)
return indices
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(args.model_dir, args.device, args.batch_size,
args.use_tensorrt, args.precision, args.cpu_threads,
args.enable_mkldnn)
wavs = [
'~/audio_demo_resource/cat.wav',
'~/audio_demo_resource/dog.wav',
]
for i in range(len(wavs)):
wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
assert os.path.isfile(
wavs[i]), f'Please check input wave file: {wavs[i]}'
results = predictor.predict(wavs)
for idx, wav in enumerate(wavs):
print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')

@ -0,0 +1,44 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.")
args = parser.parse_args()
# yapf: enable
if __name__ == '__main__':
model = SoundClassifier(
backbone=cnn14(pretrained=False, extract_embedding=True),
num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(
shape=[None, None, 64], dtype=paddle.float32)
])
# Save in static graph model.
paddle.jit.save(model, os.path.join(args.output_dir, "inference"))

@ -0,0 +1,36 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
class SoundClassifier(nn.Layer):
"""
Model for sound classification which uses panns pretrained models to extract
embeddings from audio files.
"""
def __init__(self, backbone, num_class, dropout=0.1):
super(SoundClassifier, self).__init__()
self.backbone = backbone
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(self.backbone.emb_size, num_class)
def forward(self, x):
# x: (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
x = x.unsqueeze(1)
x = self.backbone(x)
x = self.dropout(x)
logits = self.fc(x)
return logits
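# Example (assumed usage; mirrors train.py and predict.py elsewhere in this diff):
#   backbone = cnn14(pretrained=True, extract_embedding=True)
#   model = SoundClassifier(backbone, num_class=50)   # e.g. ESC-50
#   logits = model(feats)                             # feats: (batch_size, num_frames, num_melbins)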

@ -0,0 +1,60 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import paddle
import paddle.nn.functional as F
from model import SoundClassifier
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable
def extract_features(file: str, **kwargs):
waveform, sr = load_audio(file, sr=None)
feat = melspectrogram(waveform, sr, **kwargs).transpose()
return feat
if __name__ == '__main__':
paddle.set_device(args.device)
model = SoundClassifier(
backbone=cnn14(pretrained=False, extract_embedding=True),
num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
feat = np.expand_dims(extract_features(args.wav), 0)
feat = paddle.to_tensor(feat)
logits = model(feat)
probs = F.softmax(logits, axis=1).numpy()
sorted_indices = (-probs[0]).argsort()
msg = f'[{args.wav}]\n'
for idx in sorted_indices[:args.top_k]:
msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
print(msg)

@ -0,0 +1,148 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
from paddleaudio.utils import Timer
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.")
parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
nranks = paddle.distributed.get_world_size()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
local_rank = paddle.distributed.get_rank()
backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.Adam(
learning_rate=args.learning_rate, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
train_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
train_ds,
batch_sampler=train_sampler,
num_workers=args.num_workers,
return_list=True,
use_buffer_reader=True, )
steps_per_epoch = len(train_sampler)
timer = Timer(steps_per_epoch * args.epochs)
timer.start()
for epoch in range(1, args.epochs + 1):
model.train()
avg_loss = 0
num_corrects = 0
num_samples = 0
for batch_idx, batch in enumerate(train_loader):
feats, labels = batch
logits = model(feats)
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
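# Step the learning-rate scheduler manually if one is configured; with the
# default constant learning_rate used here, this branch is skipped.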
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
optimizer._learning_rate.step()
optimizer.clear_grad()
# Calculate loss
avg_loss += loss.numpy()[0]
# Calculate metrics
preds = paddle.argmax(logits, axis=1)
num_corrects += (preds == labels).numpy().sum()
num_samples += feats.shape[0]
timer.count()
if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0:
lr = optimizer.get_lr()
avg_loss /= args.log_freq
avg_acc = num_corrects / num_samples
print_msg = 'Epoch={}/{}, Step={}/{}'.format(
epoch, args.epochs, batch_idx + 1, steps_per_epoch)
print_msg += ' loss={:.4f}'.format(avg_loss)
print_msg += ' acc={:.4f}'.format(avg_acc)
print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
lr, timer.timing, timer.eta)
logger.train(print_msg)
avg_loss = 0
num_corrects = 0
num_samples = 0
if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
dev_sampler = paddle.io.BatchSampler(
dev_ds,
batch_size=args.batch_size,
shuffle=False,
drop_last=False)
dev_loader = paddle.io.DataLoader(
dev_ds,
batch_sampler=dev_sampler,
num_workers=args.num_workers,
return_list=True, )
model.eval()
num_corrects = 0
num_samples = 0
with logger.processing('Evaluation on validation dataset'):
for batch_idx, batch in enumerate(dev_loader):
feats, labels = batch
logits = model(feats)
preds = paddle.argmax(logits, axis=1)
num_corrects += (preds == labels).numpy().sum()
num_samples += feats.shape[0]
print_msg = '[Evaluation result]'
print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
logger.eval(print_msg)
# Save model
save_dir = os.path.join(args.checkpoint_dir,
'epoch_{}'.format(epoch))
logger.info('Saving model checkpoint to {}'.format(save_dir))
paddle.save(model.state_dict(),
os.path.join(save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(),
os.path.join(save_dir, 'model.pdopt'))

@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .backends import *
from .features import *

@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import *

@ -0,0 +1,303 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Optional
from typing import Tuple
from typing import Union
import numpy as np
import resampy
import soundfile as sf
from numpy import ndarray as array
from scipy.io import wavfile
from ..utils import ParameterError
__all__ = [
'resample',
'to_mono',
'depth_convert',
'normalize',
'save_wav',
'load',
]
NORMALMIZE_TYPES = ['linear', 'gaussian']
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
EPS = 1e-8
def resample(y: array, src_sr: int, target_sr: int,
mode: str='kaiser_fast') -> array:
""" Audio resampling
This function is the same as using resampy.resample().
Notes:
The default mode is kaiser_fast. For better audio quality, use mode='kaiser_best' (slower).
"""
if mode == 'kaiser_best':
warnings.warn(
f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
we recommend the mode kaiser_fast in large-scale audio training')
if not isinstance(y, np.ndarray):
raise ParameterError(
f'Only support numpy array, but received y of type {type(y)}')
if mode not in RESAMPLE_MODES:
raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: array, merge_type: str='average') -> array:
""" convert sterior audio to mono
"""
if merge_type not in MERGE_TYPES:
raise ParameterError(
f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
)
if y.ndim > 2:
raise ParameterError(
f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
if y.ndim == 1: # nothing to merge
return y
if merge_type == 'ch0':
return y[0]
if merge_type == 'ch1':
return y[1]
if merge_type == 'random':
return y[np.random.randint(0, 2)]
# need to do averaging according to dtype
if y.dtype == 'float32':
y_out = (y[0] + y[1]) * 0.5
elif y.dtype == 'int16':
y_out = y.astype('int32')
y_out = (y_out[0] + y_out[1]) // 2
y_out = np.clip(y_out, np.iinfo(y.dtype).min,
np.iinfo(y.dtype).max).astype(y.dtype)
elif y.dtype == 'int8':
y_out = y.astype('int16')
y_out = (y_out[0] + y_out[1]) // 2
y_out = np.clip(y_out, np.iinfo(y.dtype).min,
np.iinfo(y.dtype).max).astype(y.dtype)
else:
raise ParameterError(f'Unsupported dtype: {y.dtype}')
return y_out
def _safe_cast(y: array, dtype: Union[type, str]) -> array:
""" data type casting in a safe way, i.e., prevent overflow or underflow
This function is used internally.
"""
return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
def depth_convert(y: array, dtype: Union[type, str],
dithering: bool=True) -> array:
"""Convert audio array to target dtype safely
This function convert audio waveform to a target dtype, with addition steps of
preventing overflow/underflow and preserving audio range.
"""
SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
if y.dtype not in SUPPORT_DTYPE:
raise ParameterError(
'Unsupported audio dtype, '
f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
if dtype not in SUPPORT_DTYPE:
raise ParameterError(
'Unsupported audio dtype, '
f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
if dtype == y.dtype:
return y
if dtype == 'float64' and y.dtype == 'float32':
return _safe_cast(y, dtype)
if dtype == 'float32' and y.dtype == 'float64':
return _safe_cast(y, dtype)
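# Integer targets: scale floats by the target integer max; for int<->int,
# rescale through a wider intermediate dtype to avoid overflow.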
if dtype == 'int16' or dtype == 'int8':
if y.dtype in ['float64', 'float32']:
factor = np.iinfo(dtype).max
y = np.clip(y * factor, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
y = y.astype(dtype)
else:
if dtype == 'int16' and y.dtype == 'int8':
factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
y = y.astype('float32') * factor
y = y.astype('int16')
else: # dtype == 'int8' and y.dtype=='int16':
y = y.astype('int32') * np.iinfo('int8').max / \
np.iinfo('int16').max
y = y.astype('int8')
if dtype in ['float32', 'float64']:
org_dtype = y.dtype
y = y.astype(dtype) / np.iinfo(org_dtype).max
return y
def sound_file_load(file: str,
offset: Optional[float]=None,
dtype: str='int16',
duration: Optional[int]=None) -> Tuple[array, int]:
"""Load audio using soundfile library
This function load audio file using libsndfile.
Reference:
http://www.mega-nerd.com/libsndfile/#Features
"""
with sf.SoundFile(file) as sf_desc:
sr_native = sf_desc.samplerate
if offset:
sf_desc.seek(int(offset * sr_native))
if duration is not None:
frame_duration = int(duration * sr_native)
else:
frame_duration = -1
y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
return y, sf_desc.samplerate
def audio_file_load():
"""Load audio using audiofile library
This function load audio file using audiofile.
Reference:
https://audiofile.68k.org/
"""
raise NotImplementedError()
def sox_file_load():
"""Load audio using sox library
This function load audio file using sox.
Reference:
http://sox.sourceforge.net/
"""
raise NotImplementedError()
def normalize(y: array, norm_type: str='linear',
mul_factor: float=1.0) -> array:
""" normalize an input audio with additional multiplier.
"""
if norm_type == 'linear':
amax = np.max(np.abs(y))
factor = 1.0 / (amax + EPS)
y = y * factor * mul_factor
elif norm_type == 'gaussian':
amean = np.mean(y)
astd = np.std(y)
astd = max(astd, EPS)
y = mul_factor * (y - amean) / astd
else:
raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
return y
def save_wav(y: array, sr: int, file: str) -> None:
"""Save audio file to disk.
This function saves audio to disk using scipy.io.wavfile, with additional step
to convert input waveform to int16 unless it already is int16
Notes:
It only supports the raw wav format.
"""
if not file.endswith('.wav'):
raise ParameterError(
f'only .wav file supported, but dst file name is: {file}')
if sr <= 0:
raise ParameterError(
f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
f'input data type is {y.dtype}, will convert data to int16 format before saving'
)
y_out = depth_convert(y, 'int16')
else:
y_out = y
wavfile.write(file, sr, y_out)
def load(
file: str,
sr: Optional[int]=None,
mono: bool=True,
merge_type: str='average', # ch0,ch1,random,average
normal: bool=True,
norm_type: str='linear',
norm_mul_factor: float=1.0,
offset: float=0.0,
duration: Optional[int]=None,
dtype: str='float32',
resample_mode: str='kaiser_fast') -> Tuple[array, int]:
"""Load audio file from disk.
This function loads an audio file from disk using the soundfile backend. It can
optionally merge multi-channel audio to mono, resample to `sr`, normalize the
waveform, and cast it to `dtype` (float32 by default).
"""
y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
raise ParameterError(f'audio file {file} looks empty')
if mono:
y = to_mono(y, merge_type)
if sr is not None and sr != r:
y = resample(y, r, sr, mode=resample_mode)
r = sr
if normal:
y = normalize(y, norm_type, norm_mul_factor)
elif dtype in ['int8', 'int16']:
# still need to do normalization before depth conversion
y = normalize(y, 'linear', 1.0)
y = depth_convert(y, dtype)
return y, r
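# Example (assumed local file):
#   y, sr = load('cat.wav', sr=16000, mono=True)  # mono float32 waveform at 16 kHz, peak-normalized by default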

@ -0,0 +1,34 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aishell import AISHELL1
from .dcase import UrbanAcousticScenes
from .dcase import UrbanAudioVisualScenes
from .esc50 import ESC50
from .gtzan import GTZAN
from .librispeech import LIBRISPEECH
from .ravdess import RAVDESS
from .tess import TESS
from .urban_sound import UrbanSound8K
__all__ = [
'AISHELL1',
'LIBRISPEECH',
'ESC50',
'UrbanSound8K',
'GTZAN',
'UrbanAcousticScenes',
'UrbanAudioVisualScenes',
'RAVDESS',
'TESS',
]

@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
smart home, autonomous driving, and industrial production. The recordings were
made in a quiet indoor environment using 3 different devices at the same time: a high-
fidelity microphone (44.1kHz, 16-bit), an Android mobile phone (16kHz, 16-bit), and
an iOS mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled to 16kHz
to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas
in China were invited to participate in the recording. The manual transcription
accuracy rate is above 95%, through professional speech annotation and strict
quality inspection. The corpus is divided into training, development and testing
sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript',
'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO',
('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
if utt_id not in text_info:  # skip utt_ids that have no label
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
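
A minimal sketch of using the class above to build a manifest, assuming it is importable as `paddleaudio.datasets.AISHELL1` (as the datasets `__init__` earlier suggests). Note that instantiation triggers a large corpus download into DATA_HOME on first use.

# Sketch: iterate the dev subset and write its manifest file.
from paddleaudio.datasets import AISHELL1  # assumed import path

dev_set = AISHELL1(subset='dev', feat_type='raw')
file_path, utt_id, text, feat, duration = dev_set[0]  # field order follows _convert_to_record
dev_set.create_manifest(prefix='manifest')            # writes manifest.dev under DATA_HOME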

@ -0,0 +1,82 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from ..backends import load as load_audio
from ..features import melspectrogram
from ..features import mfcc
feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
"""
Base class of audio classification dataset.
"""
def __init__(self,
files: List[str],
labels: List[int],
feat_type: str='raw',
**kwargs):
"""
Args:
files (:obj:`List[str]`): A list of absolute paths of audio files.
labels (:obj:`List[int]`): Labels of audio files.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
super(AudioClassificationDataset, self).__init__()
if feat_type not in feat_funcs.keys():
raise RuntimeError(
f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
)
self.files = files
self.labels = labels
self.feat_type = feat_type
self.feat_config = kwargs # Pass keyword arguments to customize feature config
def _get_data(self, input_file: str):
raise NotImplementedError
def _convert_to_record(self, idx):
file, label = self.files[idx], self.labels[idx]
waveform, sample_rate = load_audio(file)
feat_func = feat_funcs[self.feat_type]
record = {}
record['feat'] = feat_func(
waveform, sample_rate,
**self.feat_config) if feat_func else waveform
record['label'] = label
return record
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return np.array(record['feat']).transpose(), np.array(
record['label'], dtype=np.int64)
def __len__(self):
return len(self.files)
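
The base class above only needs a list of files and integer labels plus an overridden `_get_data`. Below is a hypothetical subclass sketch (the FolderDataset name and the ./my_wavs layout are made up, and the import path is assumed) showing the intended extension point; it is not part of paddleaudio.

import glob
import os

from paddleaudio.datasets.dataset import AudioClassificationDataset  # assumed module path

class FolderDataset(AudioClassificationDataset):
    """Toy dataset: label each wav file by the name of its parent folder."""

    def __init__(self, root: str, label_names: list, feat_type: str='melspectrogram', **kwargs):
        files, labels = self._get_data(root, label_names)
        super(FolderDataset, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_data(self, root, label_names):
        files, labels = [], []
        for path in glob.glob(os.path.join(root, '*', '*.wav')):
            files.append(path)
            labels.append(label_names.index(os.path.basename(os.path.dirname(path))))
        return files, labels

# __getitem__ then yields (feat.T, label), so clips of equal length can be
# batched directly with paddle.io.DataLoader.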

@ -0,0 +1,298 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
"""
TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
12 European cities in 10 different acoustic scenes using 4 different devices.
Additionally, synthetic data for 11 mobile devices was created based on the original
recordings. Of the 12 cities, two are present only in the evaluation set.
Reference:
A multi-device dataset for urban acoustic scene classification
https://arxiv.org/abs/1807.09840
"""
source_url = 'https://zenodo.org/record/3819968/files/'
base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': 'ed38956c4246abb56190c1e9b602b7b8',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '97ab8560056b6816808dedc044dcc023',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'fbf856a3a86fff7520549c899dc94372',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '0dbffe7b6e45564da649378723284062',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
},
{
'url': source_url + base_name + '.audio.9.zip',
'md5': 'a65596a5372eab10c78e08a0de797c9e',
},
{
'url': source_url + base_name + '.audio.10.zip',
'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
},
{
'url': source_url + base_name + '.audio.11.zip',
'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
},
{
'url': source_url + base_name + '.audio.12.zip',
'md5': 'e5f4400c6b9697295fab4cf507155a2f',
},
{
'url': source_url + base_name + '.audio.13.zip',
'md5': '8855ab9f9896422746ab4c5d89d8da2f',
},
{
'url': source_url + base_name + '.audio.14.zip',
'md5': '092ad744452cd3e7de78f988a3d13020',
},
{
'url': source_url + base_name + '.audio.15.zip',
'md5': '4b5eb85f6592aebf846088d9df76b420',
},
{
'url': source_url + base_name + '.audio.16.zip',
'md5': '2e0a89723e58a3836be019e6996ae460',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta = os.path.join(base_name, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename', 'scene_label', 'identifier', 'source_label'))
subset_meta = {
'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-second audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
'filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves,
os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
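
A sketch of feeding the scenes data to a paddle DataLoader, assuming the class is importable as `paddleaudio.datasets.UrbanAcousticScenes`. Since every clip in the development set has the same duration, the default collate function can stack the per-clip features.

# Sketch: batch the dev split with mel-spectrogram features.
import paddle
from paddleaudio.datasets import UrbanAcousticScenes  # assumed import path

dev_set = UrbanAcousticScenes(mode='dev', feat_type='melspectrogram')
loader = paddle.io.DataLoader(dev_set, batch_size=4, shuffle=False)
for feats, labels in loader:
    print(feats.shape, labels.shape)  # [4, n_frames, n_mels], [4]
    break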

@ -0,0 +1,152 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
class ESC50(AudioClassificationDataset):
"""
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
suitable for benchmarking methods of environmental sound classification. The dataset
consists of 5-second-long recordings organized into 50 semantic classes (with
40 examples per class).
Reference:
ESC: Dataset for Environmental Sound Classification
http://dx.doi.org/10.1145/2733373.2806390
"""
archieves = [
{
'url':
'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
'md5': '7771e4b9d86d0945acce719c7a59305a',
},
]
label_list = [
# Animals
'Dog',
'Rooster',
'Pig',
'Cow',
'Frog',
'Cat',
'Hen',
'Insects (flying)',
'Sheep',
'Crow',
# Natural soundscapes & water sounds
'Rain',
'Sea waves',
'Crackling fire',
'Crickets',
'Chirping birds',
'Water drops',
'Wind',
'Pouring water',
'Toilet flush',
'Thunderstorm',
# Human, non-speech sounds
'Crying baby',
'Sneezing',
'Clapping',
'Breathing',
'Coughing',
'Footsteps',
'Laughing',
'Brushing teeth',
'Snoring',
'Drinking, sipping',
# Interior/domestic sounds
'Door knock',
'Mouse click',
'Keyboard typing',
'Door, wood creaks',
'Can opening',
'Washing machine',
'Vacuum cleaner',
'Clock alarm',
'Clock tick',
'Glass breaking',
# Exterior/urban noises
'Helicopter',
'Chainsaw',
'Siren',
'Car horn',
'Engine',
'Train',
'Church bells',
'Airplane',
'Fireworks',
'Hand saw',
]
meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
meta_info = collections.namedtuple(
'META_INFO',
('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
audio_path = os.path.join('ESC-50-master', 'audio')
def __init__(self,
mode: str='train',
split: int=1,
feat_type: str='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(ESC50, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, fold, target, _, _, _, _ = sample
if mode == 'train' and int(fold) != split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
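
ESC-50 ships with five predefined folds, and the `split` argument above selects which fold is held out. A sketch of the usual 5-fold protocol (import path assumed, as before):

# Sketch: hold out fold 1 for evaluation, train on folds 2-5.
from paddleaudio.datasets import ESC50  # assumed import path

train_ds = ESC50(mode='train', split=1, feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', split=1, feat_type='melspectrogram')
print(len(train_ds), len(dev_ds))  # 1600 and 400 clips, from 2000 recordings in 5 folds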

@ -0,0 +1,115 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
class GTZAN(AudioClassificationDataset):
"""
The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
in machine listening research for music genre recognition (MGR).
Reference:
Musical genre classification of audio signals
https://ieeexplore.ieee.org/document/1021072/
"""
archieves = [
{
'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
},
]
label_list = [
'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
'pop', 'reggae', 'rock'
]
meta = os.path.join('genres', 'input.mf')
meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
audio_path = 'genres'
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Random seed used to shuffle the samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(GTZAN, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines():
ret.append(self.meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
random.seed(seed) # shuffle samples to split data
random.shuffle(
meta_info
)  # use the same seed so the train and dev splits stay consistent
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
file_path, label = sample
filename = os.path.basename(file_path)
target = self.label_list.index(label)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(
os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(
os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
return files, labels
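
GTZAN has no official folds, so `_get_data` above derives them from a seeded shuffle followed by `fold = idx // n_samples_per_fold + 1`. A self-contained illustration of that fold arithmetic on dummy indices:

# Illustration of the fold assignment used above (1000 tracks, 5 folds, split=1).
n_samples, n_folds, split = 1000, 5, 1
n_per_fold = n_samples // n_folds                            # 200 tracks per fold
folds = [idx // n_per_fold + 1 for idx in range(n_samples)]
train_idx = [i for i, f in enumerate(folds) if f != split]
dev_idx = [i for i, f in enumerate(folds) if f == split]
print(len(train_idx), len(dev_idx))                          # 800 / 200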

@ -0,0 +1,199 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', (
'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = [
'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
'dev-other', 'test-clean', 'test-other'
]
def __init__(self,
subset: str='train-clean-100',
feat_type: str='raw',
**kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip,
line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME,
len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(
self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(
self.utt_info(file_path, utt_id, text, spk_id,
spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
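
Each line written by `create_manifest` above is a standalone JSON object. A sketch of reading one back, assuming DATA_HOME is importable from `paddleaudio.utils.env` (as the imports in this file suggest) and that the dev-clean manifest has already been created:

# Sketch: read the first record of a JSON-lines manifest.
import json
import os

from paddleaudio.utils.env import DATA_HOME  # assumed import path

manifest = os.path.join(DATA_HOME, 'LibriSpeech', 'manifest', 'manifest.dev-clean')
with open(manifest, 'r', encoding='utf-8') as f:
    record = json.loads(f.readline())
print(record['utt'], record['feat_shape'], record['spk'], record['gender'])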

@ -0,0 +1,136 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['RAVDESS']
class RAVDESS(AudioClassificationDataset):
"""
The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
lexically-matched statements in a neutral North American accent. Speech emotions
include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
Each expression is produced at two levels of emotional intensity (normal, strong),
with an additional neutral expression.
Reference:
The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
A dynamic, multimodal set of facial and vocal expressions in North American English
https://doi.org/10.1371/journal.pone.0196391
"""
archieves = [
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5':
'5411230427d67a21e18aa4d466e6d1b9',
},
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5':
'bc696df654c87fed845eb13823edef8a',
},
]
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion',
'emotion_intensity', 'statement', 'repitition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Random seed used to shuffle the samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(self.speech_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
for root, _, files in os.walk(self.song_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
)  # use the same seed so the train and dev splits stay consistent
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion, _, _, _, _ = sample
target = int(emotion) - 1
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels
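
RAVDESS encodes its metadata in the file name as seven dash-separated numeric fields, which is what `_get_meta_info` above unpacks into META_INFO. A small sketch of that parsing on an illustrative file name (the name is made up but format-conforming):

# Sketch: parse one RAVDESS-style file name into the META_INFO fields above.
import collections
import os

meta_info = collections.namedtuple(
    'META_INFO', ('modality', 'vocal_channel', 'emotion',
                  'emotion_intensity', 'statement', 'repitition', 'actor'))

fname = '03-01-06-01-02-01-12.wav'                        # illustrative file name
sample = meta_info(*os.path.basename(fname)[:-4].split('-'))
print(sample.emotion, int(sample.emotion) - 1)            # label index used above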

@ -0,0 +1,126 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['TESS']
class TESS(AudioClassificationDataset):
"""
TESS is a set of recordings in which 200 target words were spoken in the carrier phrase
"Say the word _____" by two actresses (aged 26 and 64 years), with
recordings made of the set portraying each of seven emotions (anger,
disgust, fear, happiness, pleasant surprise, sadness, and neutral).
There are 2800 stimuli in total.
Reference:
Toronto emotional speech set (TESS)
https://doi.org/10.5683/SP2/E8H2MF
"""
archieves = [
{
'url':
'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
'md5':
'1465311b24d1de704c4c63e4ccc470c7',
},
]
label_list = [
'angry',
'disgust',
'fear',
'happy',
'neutral',
'ps', # pleasant surprise
'sad',
]
meta_info = collections.namedtuple('META_INFO',
('speaker', 'word', 'emotion'))
audio_path = 'TESS_Toronto_emotional_speech_set'
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Random seed used to shuffle the samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(TESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('_')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
)  # use the same seed so the train and dev splits stay consistent
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion = sample
target = self.label_list.index(emotion)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels

@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
class UrbanSound8K(AudioClassificationDataset):
"""
UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
classes are drawn from the urban sound taxonomy.
Reference:
A Dataset and Taxonomy for Urban Sound Research
https://dl.acm.org/doi/10.1145/2647868.2655045
"""
archieves = [
{
'url':
'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
'md5': '9aa69802bbf37fb986f71ec1483a196e',
},
]
label_list = [
"air_conditioner", "car_horn", "children_playing", "dog_bark",
"drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
"street_music"
]
meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
meta_info = collections.namedtuple(
'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
'class_id', 'label'))
audio_path = os.path.join('UrbanSound8K', 'audio')
def __init__(self,
mode: str='train',
split: int=1,
feat_type: str='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(UrbanSound8K, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self):
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, _, _, _, _, fold, target, _ = sample
if mode == 'train' and int(fold) != split:
files.append(
os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(
os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
filename))
labels.append(int(target))
return files, labels

@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .augment import *
from .core import *

@ -0,0 +1,169 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
from numpy import ndarray as array
from paddleaudio.backends import depth_convert
from paddleaudio.utils import ParameterError
__all__ = [
'depth_augment',
'spect_augment',
'random_crop1d',
'random_crop2d',
'adaptive_spect_augment',
]
def randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
"""
return int(np.random.randint(0, high=high))
def rand() -> float:
"""Generate one floating-point number in range [0 1)
This is a helper function for random data augmentaiton
"""
return float(np.random.rand(1))
def depth_augment(y: array,
choices: List=['int8', 'int16'],
probs: List[float]=[0.5, 0.5]) -> array:
""" Audio depth augmentation
Do audio depth augmentation to simulate the distortion brought by quantization.
"""
assert len(probs) == len(
choices
), 'number of choices {} must be equal to size of probs {}'.format(
len(choices), len(probs))
depth = np.random.choice(choices, p=probs)
src_depth = y.dtype
y1 = depth_convert(y, depth)
y2 = depth_convert(y1, src_depth)
return y2
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
level: float=0.1) -> array:
"""Do adpative spectrogram augmentation
The level of the augmentation is gowern by the paramter level,
ranging from 0 to 1, with 0 represents no augmentation
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
time_mask_width = int(nt * level * 0.5)
freq_mask_width = int(nf * level * 0.5)
num_time_mask = int(10 * level)
num_freq_mask = int(10 * level)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def spect_augment(spect: array,
tempo_axis: int=0,
max_time_mask: int=3,
max_freq_mask: int=3,
max_time_mask_width: int=30,
max_freq_mask_width: int=20) -> array:
"""Do spectrogram augmentation in both time and freq axis
Reference:
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
num_time_mask = randint(max_time_mask)
num_freq_mask = randint(max_freq_mask)
time_mask_width = randint(max_time_mask_width)
freq_mask_width = randint(max_freq_mask_width)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def random_crop1d(y: array, crop_len: int) -> array:
""" Do random cropping on 1d input signal
The input is a 1d signal, typically a sound waveform
"""
if y.ndim != 1:
'only accept 1d tensor or numpy array'
n = len(y)
idx = randint(n - crop_len)
return y[idx:idx + crop_len]
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
""" Do random cropping for 2D array, typically a spectrogram.
The cropping is done in temporal direction on the time-freq input signal.
"""
if tempo_axis >= s.ndim:
raise ParameterError('axis out of range')
n = s.shape[tempo_axis]
idx = randint(high=n - crop_len)
sli = [slice(None) for i in range(s.ndim)]
sli[tempo_axis] = slice(idx, idx + crop_len)
out = s[tuple(sli)]
return out
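
A sketch of applying the augmentations above to synthetic data. The import path assumes the package `__init__` re-exports them via `from .augment import *`; the waveform and spectrogram are random placeholders.

# Sketch: exercise the augmentation helpers on dummy inputs.
import numpy as np

from paddleaudio.features import depth_augment, random_crop1d, spect_augment  # assumed path

rng = np.random.RandomState(0)
wav = rng.uniform(-1.0, 1.0, size=16000).astype('float32')  # 1 s of noise at 16 kHz

crop = random_crop1d(wav, crop_len=8000)       # random 0.5 s segment
noisy = depth_augment(wav)                     # simulate int8/int16 quantization and back
spect = rng.rand(64, 100).astype('float32')    # dummy (freq, time) spectrogram
masked = spect_augment(spect, tempo_axis=1)    # random time/freq masks set to zero
print(crop.shape, noisy.dtype, int((masked == 0).sum()))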

@ -0,0 +1,576 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import warnings
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import scipy
from numpy import ndarray as array
from numpy.lib.stride_tricks import as_strided
from paddleaudio.utils import ParameterError
from scipy.signal import get_window
__all__ = [
'stft',
'mfcc',
'hz_to_mel',
'mel_to_hz',
'split_frames',
'mel_frequencies',
'power_to_db',
'compute_fbank_matrix',
'melspectrogram',
'spectrogram',
'mu_encode',
'mu_decode',
]
def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
"""Pad an array to a target length along a target axis.
This differs from `np.pad` by centering the data prior to padding,
analogous to `str.center`
"""
kwargs.setdefault("mode", "constant")
n = data.shape[axis]
lpad = int((size - n) // 2)
lengths = [(0, 0)] * data.ndim
lengths[axis] = (lpad, int(size - n - lpad))
if lpad < 0:
raise ParameterError(("Target size ({size:d}) must be "
"at least input size ({n:d})"))
return np.pad(data, lengths, **kwargs)
def split_frames(x: array, frame_length: int, hop_length: int,
axis: int=-1) -> array:
"""Slice a data array into (overlapping) frames.
This function is aligned with librosa.frame
"""
if not isinstance(x, np.ndarray):
raise ParameterError(
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
if x.shape[axis] < frame_length:
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
f" for frame_length={frame_length:d}")
if hop_length < 1:
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.asfortranarray(x)
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.ascontiguousarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * new_stride]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * new_stride] + list(strides)
else:
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
return as_strided(x, shape=shape, strides=strides)
def _check_audio(y, mono=True) -> bool:
"""Determine whether a variable contains valid audio data.
The audio y must be a np.ndarray, either 1-channel or 2-channel.
"""
if not isinstance(y, np.ndarray):
raise ParameterError("Audio data must be of type numpy.ndarray")
if y.ndim > 2:
raise ParameterError(
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
if mono and y.ndim == 2:
raise ParameterError(
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
if (mono and len(y) == 0) or (not mono and y.shape[1] == 0):
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
if not np.issubdtype(y.dtype, np.floating):
raise ParameterError("Audio data must be floating-point")
if not np.isfinite(y).all():
raise ParameterError("Audio buffer is not finite everywhere")
return True
def hz_to_mel(frequencies: Union[float, List[float], array],
htk: bool=False) -> array:
"""Convert Hz to Mels
This function is aligned with librosa.
"""
freq = np.asanyarray(frequencies)
if htk:
return 2595.0 * np.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
if freq.ndim:
# If we have array data, vectorize
log_t = freq >= min_log_hz
mels[log_t] = min_log_mel + \
np.log(freq[log_t] / min_log_hz) / logstep
elif freq >= min_log_hz:
# If we have scalar data, check directly
mels = min_log_mel + np.log(freq / min_log_hz) / logstep
return mels
def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
"""Convert mel bin numbers to frequencies.
This function is aligned with librosa.
"""
mel_array = np.asanyarray(mels)
if htk:
return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel_array
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
if mel_array.ndim:
# If we have vector data, vectorize
log_t = mel_array >= min_log_mel
freqs[log_t] = min_log_hz * \
np.exp(logstep * (mel_array[log_t] - min_log_mel))
elif mel_array >= min_log_mel:
# If we have scalar data, check directly
freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
return freqs
def mel_frequencies(n_mels: int=128,
fmin: float=0.0,
fmax: float=11025.0,
htk: bool=False) -> array:
"""Compute mel frequencies
This function is aligned with librosa.
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(fmin, htk=htk)
max_mel = hz_to_mel(fmax, htk=htk)
mels = np.linspace(min_mel, max_mel, n_mels)
return mel_to_hz(mels, htk=htk)
def fft_frequencies(sr: int, n_fft: int) -> array:
"""Compute fourier frequencies.
This function is aligned with librosa.
"""
return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=128,
fmin: float=0.0,
fmax: Optional[float]=None,
htk: bool=False,
norm: str="slaney",
dtype: type=np.float32):
"""Compute fbank matrix.
This function is aligned with librosa.
"""
if norm != "slaney":
raise ParameterError('norm must be set to slaney')
if fmax is None:
fmax = float(sr) / 2
# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
if norm == "slaney":
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]
# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
warnings.warn("Empty filters detected in mel frequency basis. "
"Some channels will produce empty responses. "
"Try increasing your sampling rate (and fmax) or "
"reducing n_mels.")
return weights
def stft(x: array,
n_fft: int=2048,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str="hann",
center: bool=True,
dtype: type=np.complex64,
pad_mode: str="reflect") -> array:
"""Short-time Fourier transform (STFT).
This function is aligned with librosa.
"""
_check_audio(x)
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
fft_window = get_window(window, win_length, fftbins=True)
# Pad the window out to n_fft size
fft_window = pad_center(fft_window, n_fft)
# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))
# Pad the time series so that frames are centered
if center:
if n_fft > x.shape[-1]:
warnings.warn(
f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
)
x = np.pad(x, int(n_fft // 2), mode=pad_mode)
elif n_fft > x.shape[-1]:
raise ParameterError(
f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
)
# Window the time series.
x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
# Pre-allocate the STFT matrix
stft_matrix = np.empty(
(int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
fft = np.fft # use numpy fft as default
# Constrain STFT block sizes to 256 KB
MAX_MEM_BLOCK = 2**8 * 2**10
# how many columns can we fit within MAX_MEM_BLOCK?
n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
n_columns = max(n_columns, 1)
for bl_s in range(0, stft_matrix.shape[1], n_columns):
bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
stft_matrix[:, bl_s:bl_t] = fft.rfft(
fft_window * x_frames[:, bl_s:bl_t], axis=0)
return stft_matrix
def power_to_db(spect: array,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=80.0) -> array:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units
This computes the scaling ``10 * log10(spect / ref)`` in a numerically
stable way.
This function is aligned with librosa.
"""
spect = np.asarray(spect)
if amin <= 0:
raise ParameterError("amin must be strictly positive")
if np.issubdtype(spect.dtype, np.complexfloating):
warnings.warn(
"power_to_db was called on complex input so phase "
"information will be discarded. To suppress this warning, "
"call power_to_db(np.abs(D)**2) instead.")
magnitude = np.abs(spect)
else:
magnitude = spect
if callable(ref):
# User supplied a function to calculate reference power
ref_value = ref(magnitude)
else:
ref_value = np.abs(ref)
log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
if top_db is not None:
if top_db < 0:
raise ParameterError("top_db must be non-negative")
log_spec = np.maximum(log_spec, log_spec.max() - top_db)
return log_spec
def mfcc(x,
sr: int=16000,
spect: Optional[array]=None,
n_mfcc: int=20,
dct_type: int=2,
norm: str="ortho",
lifter: int=0,
**kwargs) -> array:
"""Mel-frequency cepstral coefficients (MFCCs)
This function is NOT strictly aligned with librosa. The following example shows how to get the
same result with librosa:
# paddleaudio mfcc:
kwargs = {
'window_size':512,
'hop_length':320,
'n_mels':64,
'fmin':50,
'to_db':False}
a = mfcc(x,
spect=None,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0,
**kwargs)
# librosa mfcc:
spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
win_length=512,
hop_length=320,
n_mels=64, fmin=50)
b = librosa.feature.mfcc(x,
sr=16000,
S=spect,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0)
assert np.mean( (a-b)**2) < 1e-8
"""
if spect is None:
spect = melspectrogram(x, sr=sr, **kwargs)
M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
if lifter > 0:
factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
lifter)
return M * factor[:, np.newaxis]
elif lifter == 0:
return M
else:
raise ParameterError(
f"MFCC lifter={lifter} must be a non-negative number")
def melspectrogram(x: array,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
n_mels: int=64,
fmin: int=50,
fmax: Optional[float]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0,
to_db: bool=True,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> array:
"""Compute mel-spectrogram.
Parameters:
x: numpy.ndarray
The input wavform is a numpy array [shape=(n,)]
window_size: int, typically 512, 1024, 2048, etc.
The window size for framing, also used as n_fft for stft
Returns:
The mel-spectrogram in power scale or db scale(default)
Notes:
1. sr defaults to 16000, which is commonly used in speech/speaker processing.
2. when fmax is None, it is set to sr//2.
3. this function converts the mel spectrogram to dB scale by default, which differs
from librosa.
"""
_check_audio(x, mono=True)
if len(x) <= 0:
raise ParameterError('The input waveform is empty')
if fmax is None:
fmax = sr // 2
if fmin < 0 or fmin >= fmax:
raise ParameterError('fmin and fmax must satisfy 0<fmin<fmax')
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
spect_power = np.abs(s)**power
fb_matrix = compute_fbank_matrix(
sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
mel_spect = np.matmul(fb_matrix, spect_power)
if to_db:
return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
else:
return mel_spect
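# Illustrative usage (sketch only): a 64-bin log-mel spectrogram for 16 kHz
# speech with the defaults documented above, e.g.
#   feat = melspectrogram(x, sr=16000, window_size=512, hop_length=320,
#                         n_mels=64, fmin=50, to_db=True)  # shape: (64, n_frames)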
def spectrogram(x: array,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0) -> array:
"""Compute spectrogram from an input waveform.
This function wraps stft() (which is aligned with librosa.stft) and adds a step
that computes the magnitude (or power) of the complex spectrogram.
"""
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
return np.abs(s)**power
def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
"""Mu-law encoding.
Compute the mu-law encoding of the input waveform.
When quantized is True, the result will be converted to
integer in range [0,mu-1]. Otherwise, the resulting signal
is in range [-1,1]
Reference:
https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
"""
y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
if quantized:
y = np.floor((y + 1) / 2 * mu + 0.5) # convert to [0 , mu-1]
return y
def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
"""Mu-law decoding.
Compute the mu-law decoding given an input code.
It assumes that the input y is in range [0,mu-1] when quantized is True and in
range [-1,1] otherwise.
Reference:
https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
"""
if mu < 1:
raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
mu = mu - 1
if quantized: # undo the quantization
y = y * 2 / mu - 1
x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
return x
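# A minimal mu-law round-trip sketch (illustrative only; the synthetic tone
# below is an assumption for demonstration, not part of the original module).
if __name__ == '__main__':
    t = np.linspace(0, 1, 16000, dtype='float32')
    sig = 0.5 * np.sin(2 * np.pi * 440 * t)            # a 440 Hz tone in [-1, 1]
    code = mu_encode(sig, mu=255, quantized=True)      # integer codes
    recon = mu_decode(code, mu=255, quantized=True)    # back to roughly [-1, 1]
    print('max round-trip deviation:', np.max(np.abs(sig - recon)))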

@ -0,0 +1,309 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.nn as nn
import paddle.nn.functional as F
from ..utils.download import load_state_dict_from_url
from ..utils.env import MODEL_HOME
__all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6']
pretrained_model_urls = {
'cnn14': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams',
'cnn10': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams',
'cnn6': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams',
}
class ConvBlock(nn.Layer):
def __init__(self, in_channels, out_channels):
super(ConvBlock, self).__init__()
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias_attr=False)
self.conv2 = nn.Conv2D(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channels)
self.bn2 = nn.BatchNorm2D(out_channels)
def forward(self, x, pool_size=(2, 2), pool_type='avg'):
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = F.relu(x)
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x = F.avg_pool2d(
x, kernel_size=pool_size) + F.max_pool2d(
x, kernel_size=pool_size)
else:
raise Exception(
f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".'
)
return x
class ConvBlock5x5(nn.Layer):
def __init__(self, in_channels, out_channels):
super(ConvBlock5x5, self).__init__()
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(5, 5),
stride=(1, 1),
padding=(2, 2),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channels)
def forward(self, x, pool_size=(2, 2), pool_type='avg'):
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x = F.avg_pool2d(
x, kernel_size=pool_size) + F.max_pool2d(
x, kernel_size=pool_size)
else:
raise Exception(
f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".'
)
return x
class CNN14(nn.Layer):
"""
The CNN14 (14-layer CNN) mainly consists of 6 convolutional blocks, each of which
consists of 2 convolutional layers with a kernel size of 3 × 3.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 2048
def __init__(self, extract_embedding: bool=True):
super(CNN14, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
self.fc1 = nn.Linear(2048, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
class CNN10(nn.Layer):
"""
The CNN10 (10-layer CNN) mainly consists of 4 convolutional blocks, each of which
consists of 2 convolutional layers with a kernel size of 3 × 3.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 512
def __init__(self, extract_embedding: bool=True):
super(CNN10, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
self.fc1 = nn.Linear(512, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
class CNN6(nn.Layer):
"""
The CNN6 (6-layer CNN) mainly consists of 4 convolutional blocks, each of which
consists of 1 convolutional layer with a kernel size of 5 × 5.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 512
def __init__(self, extract_embedding: bool=True):
super(CNN6, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512)
self.fc1 = nn.Linear(512, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
def cnn14(pretrained: bool=False, extract_embedding: bool=True) -> CNN14:
model = CNN14(extract_embedding=extract_embedding)
if pretrained:
state_dict = load_state_dict_from_url(
url=pretrained_model_urls['cnn14'],
path=os.path.join(MODEL_HOME, 'panns'))
model.set_state_dict(state_dict)
return model
def cnn10(pretrained: bool=False, extract_embedding: bool=True) -> CNN10:
model = CNN10(extract_embedding=extract_embedding)
if pretrained:
state_dict = load_state_dict_from_url(
url=pretrained_model_urls['cnn10'],
path=os.path.join(MODEL_HOME, 'panns'))
model.set_state_dict(state_dict)
return model
def cnn6(pretrained: bool=False, extract_embedding: bool=True) -> CNN6:
model = CNN6(extract_embedding=extract_embedding)
if pretrained:
state_dict = load_state_dict_from_url(
url=pretrained_model_urls['cnn6'],
path=os.path.join(MODEL_HOME, 'panns'))
model.set_state_dict(state_dict)
return model
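# Illustrative usage sketch (not part of the original file): build CNN14 and
# extract a 2048-dim embedding. The input layout [batch, 1, time, 64] is
# inferred from the transposes and BatchNorm2D(64) in CNN14.forward; the
# random tensor below is a placeholder, not real audio features.
if __name__ == '__main__':
    import paddle
    model = cnn14(pretrained=False, extract_embedding=True)
    model.eval()
    feats = paddle.randn([2, 1, 100, 64])  # (batch, channel, frames, mel bins)
    emb = model(feats)
    print(emb.shape)  # expected: [2, 2048]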

@ -0,0 +1,18 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .download import *
from .env import *
from .error import *
from .log import *
from .time import *

@ -0,0 +1,66 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Dict
from typing import List
from paddle.framework import load as load_state_dict
from paddle.utils import download
from pathos.multiprocessing import ProcessPool
from .log import logger
download.logger = logger
def decompress(file: str):
"""
Extracts all files from a compressed file.
"""
assert os.path.isfile(file), "File: {} does not exist.".format(file)
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]],
path: str,
n_workers: int=0):
"""
Download archives and decompress them to a specific path.
"""
if not os.path.isdir(path):
os.makedirs(path)
if n_workers <= 0:
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
download.get_path_from_url(archive['url'], path, archive['md5'])
else:
pool = ProcessPool(nodes=n_workers)
pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
[path] * len(archives), [_['md5'] for _ in archives])
pool.close()
pool.join()
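# Illustrative usage (sketch only; the URL and md5 below are placeholders, not
# real artifacts of this project):
#   archives = [{'url': 'https://example.com/data.tar.gz', 'md5': '<md5sum>'}]
#   download_and_decompress(archives, path='./data', n_workers=0)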
def load_state_dict_from_url(url: str, path: str, md5: str=None):
"""
Download and load a state dict from url
"""
if not os.path.isdir(path):
os.makedirs(path)
download.get_path_from_url(url, path, md5)
return load_state_dict(os.path.join(path, os.path.basename(url)))

@ -0,0 +1,53 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module is used to store environmental variables in PaddleAudio.
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Defaults to ~/.paddleaudio. Users can change the
default value through the PPAUDIO_HOME environment variable.
MODEL_HOME --> Store model files.
DATA_HOME --> Store automatically downloaded datasets.
'''
import os
def _get_user_home():
return os.path.expanduser('~')
def _get_ppaudio_home():
if 'PPAUDIO_HOME' in os.environ:
home_path = os.environ['PPAUDIO_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError(
'The environment variable PPAUDIO_HOME {} is not a directory.'.
format(home_path))
else:
return home_path
return os.path.join(_get_user_home(), '.paddleaudio')
def _get_sub_home(directory):
home = os.path.join(_get_ppaudio_home(), directory)
if not os.path.exists(home):
os.makedirs(home)
return home
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')

@ -0,0 +1,20 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['ParameterError']
class ParameterError(Exception):
"""Exception class for Parameter checking"""
pass

@ -0,0 +1,136 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import functools
import logging
import threading
import time
import colorlog
loggers = {}
log_config = {
'DEBUG': {
'level': 10,
'color': 'purple'
},
'INFO': {
'level': 20,
'color': 'green'
},
'TRAIN': {
'level': 21,
'color': 'cyan'
},
'EVAL': {
'level': 22,
'color': 'blue'
},
'WARNING': {
'level': 30,
'color': 'yellow'
},
'ERROR': {
'level': 40,
'color': 'red'
},
'CRITICAL': {
'level': 50,
'color': 'bold_red'
}
}
class Logger(object):
'''
Default logger in PaddleAudio
Args:
name(str) : Logger name, default is 'PaddleAudio'
'''
def __init__(self, name: str=None):
name = 'PaddleAudio' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(self.__call__,
conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float=0.1):
'''
Continuously print a rotating progress indicator (spinner) alongside the message.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()
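# Illustrative usage sketch (appended for documentation only):
if __name__ == '__main__':
    logger.info('a plain message')
    logger.train('a message logged at the custom TRAIN level')
    with logger.processing('pretending to work', interval=0.1):
        time.sleep(0.5)  # the spinner rotates until the block exits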

@ -0,0 +1,67 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
class Timer(object):
'''Calculate running speed and estimated time of arrival (ETA)'''
def __init__(self, total_step: int):
self.total_step = total_step
self.last_start_step = 0
self.current_step = 0
self._is_running = True
def start(self):
self.last_time = time.time()
self.start_time = time.time()
def stop(self):
self._is_running = False
self.end_time = time.time()
def count(self) -> int:
if not self.current_step >= self.total_step:
self.current_step += 1
return self.current_step
@property
def timing(self) -> float:
run_steps = self.current_step - self.last_start_step
self.last_start_step = self.current_step
time_used = time.time() - self.last_time
self.last_time = time.time()
return run_steps / time_used
@property
def is_running(self) -> bool:
return self._is_running
@property
def eta(self) -> str:
if not self.is_running:
return '00:00:00'
scale = self.total_step / self.current_step
remaining_time = (time.time() - self.start_time) * scale
return seconds_to_hms(remaining_time)
def seconds_to_hms(seconds: int) -> str:
'''Convert the number of seconds to hh:mm:ss'''
h = math.floor(seconds / 3600)
m = math.floor((seconds - h * 3600) / 60)
s = int(seconds - h * 3600 - m * 60)
hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
return hms_str
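# Illustrative usage sketch (appended for documentation only):
if __name__ == '__main__':
    timer = Timer(total_step=100)
    timer.start()
    for _ in range(100):
        time.sleep(0.01)  # stand-in for one training step
        timer.count()
    print('steps/sec: {:.1f}, eta: {}'.format(timer.timing, timer.eta))
    timer.stop()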

@ -0,0 +1,48 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import setuptools
# set the version here
version = '0.1.0a'
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="paddleaudio",
version=version,
author="",
author_email="",
description="PaddleAudio, in development",
long_description=long_description,
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
install_requires=[
'numpy >= 1.15.0',
'scipy >= 1.0.0',
'resampy >= 0.2.2',
'soundfile >= 0.9.0',
'colorlog',
'pathos',
],
extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
} # for dev only, install: pip install -e .[dev]
)

@ -0,0 +1,41 @@
# PaddleAudio Testing Guide
# Testing
First clone a version of the project by
```
git clone https://github.com/PaddlePaddle/models.git
```
Then install the project in your virtual environment.
```
cd models/PaddleAudio
python setup.py bdist_wheel
pip install -e .[dev]
```
The requirements for testing will be installed along with PaddleAudio.
Now run
```
pytest test
```
If it goes well, you will see outputs like these:
```
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: ./models/PaddleAudio
plugins: hydra-core-1.0.6
collected 16 items
test/unit_test/test_backend.py ........... [ 68%]
test/unit_test/test_features.py ..... [100%]
==================================================== warnings summary ====================================================
.
.
.
-- Docs: https://docs.pytest.org/en/stable/warnings.html
============================================ 16 passed, 11 warnings in 6.76s =============================================
```

@ -0,0 +1,113 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio
import pytest
TEST_FILE = './test/data/test_audio.wav'
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / \
(EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / \
(EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load(TEST_FILE, sr=16000)
print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
return x, r
# start testing
x, r = load_audio()
EPS = 1e-8
def test_load():
s, r = paddleaudio.load(TEST_FILE, sr=16000)
assert r == 16000
assert s.dtype == 'float32'
s, r = paddleaudio.load(
TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
assert len(s) / r == 2.0
assert r == 16000
assert s.dtype == 'int16'
def test_depth_convert():
y = paddleaudio.depth_convert(x, 'int16')
assert len(y) == len(x)
assert y.dtype == 'int16'
assert np.max(y) <= 32767
assert np.min(y) >= -32768
assert np.std(y) > EPS
y = paddleaudio.depth_convert(x, 'int8')
assert len(y) == len(x)
assert y.dtype == 'int8'
assert np.max(y) <= 127
assert np.min(y) >= -128
assert np.std(y) > EPS
# test case for resample
rs_test_data = [
(32000, 'kaiser_fast'),
(16000, 'kaiser_fast'),
(8000, 'kaiser_fast'),
(32000, 'kaiser_best'),
(16000, 'kaiser_best'),
(8000, 'kaiser_best'),
(22050, 'kaiser_best'),
(44100, 'kaiser_best'),
]
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
y = paddleaudio.resample(x, 16000, sr, mode=mode)
factor = sr / 16000
err = relative_err(len(y), len(x) * factor)
print('err:', err)
assert err < EPS
def test_normalize():
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
assert np.max(y) < 0.5 + EPS
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
assert np.max(y) <= 2.0 + EPS
y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
print('np.std(y):', np.std(y))
assert np.abs(np.std(y) - 1.0) < EPS
if __name__ == '__main__':
test_load()
test_depth_convert()
test_resample(22050, 'kaiser_fast')
test_normalize()

@ -0,0 +1,143 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio as pa
import pytest
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load('./test/data/test_audio.wav')
#x,r = librosa.load('../data/test_audio.wav',sr=16000)
return x, r
## start testing
x, r = load_audio()
EPS = 1e-8
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / (
EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / (
EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=False, )
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=True,
ref=1.0,
amin=1e-10,
top_db=None)
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
assert a.shape == b.shape
assert relative_err(a, b, real=False) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
a = librosa.util.frame(x, frame_length=512, hop_length=320)
b = pa.split_frames(x, frame_length=512, hop_length=320)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
kwargs = {
'window_size': 512,
'hop_length': 320,
'n_mels': 64,
'fmin': 50,
'to_db': False
}
a = pa.mfcc(
x,
#sample_rate=16000,
spect=None,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0,
**kwargs)
S = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = librosa.feature.mfcc(
x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
assert relative_err(a, b) < EPS
if __name__ == '__main__':
test_melspectrogram()
test_melspectrogram_db()
test_stft()
test_split_frames()
test_mfcc()

@ -1,49 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["end_detect"]
def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
"""End detection.
described in Eq. (50) of S. Watanabe et al
"Hybrid CTC/Attention Architecture for End-to-End Speech Recognition"
:param ended_hyps: dict
:param i: int
:param M: int
:param D_end: float
:return: bool
"""
if len(ended_hyps) == 0:
return False
count = 0
best_hyp = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[0]
for m in range(M):
# get ended_hyps with their length is i - m
hyp_length = i - m
hyps_same_length = [
x for x in ended_hyps if len(x["yseq"]) == hyp_length
]
if len(hyps_same_length) > 0:
best_hyp_same_length = sorted(
hyps_same_length, key=lambda x: x["score"], reverse=True)[0]
if best_hyp_same_length["score"] - best_hyp["score"] < D_end:
count += 1
if count == M:
return True
else:
return False

@ -1,54 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module provides functions to calculate bleu score in different level.
e.g. wer for word-level, cer for char-level.
"""
import sacrebleu
__all__ = ['bleu', 'char_bleu']
def bleu(hypothesis, reference):
"""Calculate BLEU. BLEU compares reference text and
hypothesis text in word-level using scarebleu.
:param reference: The reference sentences.
:type reference: list[list[str]]
:param hypothesis: The hypothesis sentence.
:type hypothesis: list[str]
:raises ValueError: If the reference length is zero.
"""
return sacrebleu.corpus_bleu(hypothesis, reference)
def char_bleu(hypothesis, reference):
"""Calculate BLEU. BLEU compares reference text and
hypothesis text in char-level using scarebleu.
:param reference: The reference sentences.
:type reference: list[list[str]]
:param hypothesis: The hypothesis sentence.
:type hypothesis: list[str]
:raises ValueError: If the reference number is zero.
"""
hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
for ref in reference]
return sacrebleu.corpus_bleu(hypothesis, reference)

@ -0,0 +1,3 @@
# echo system
ASR + TTS

@ -0,0 +1,17 @@
#!/bin/bash
mkdir -p data
wav_en=data/en.wav
wav_zh=data/zh.wav
test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data
pip install paddlehub
asr_en_cmd="import paddlehub as hub; model = hub.Module(name='u2_conformer_librispeech'); print(model.speech_recognize('${wav_en}', device='gpu'))"
asr_zh_cmd="import paddlehub as hub; model = hub.Module(name='u2_conformer_aishell'); print(model.speech_recognize('${wav_zh}', device='gpu'))"
python -c "${asr_en_cmd}"
python -c "${asr_zh_cmd}"

Binary file not shown. (new image, 4.9 KiB)

Binary file not shown. (new image, 108 KiB)

@ -1,35 +0,0 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

@ -1,4 +1,4 @@
myst_parser
myst-parser
numpydoc
recommonmark>=0.5.0
sphinx

@ -0,0 +1,5 @@
.wy-nav-content {
max-width: 80%;
}
.table table{ background:#b9b9b9}
.table table td{ background:#FFF; }

@ -1,80 +0,0 @@
# Getting Started
Several shell scripts provided in `./examples/tiny/local` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead. Besides, if out-of-memory problem occurs, just reduce `batch_size` to fit.
Let's take a tiny sampled subset of [LibriSpeech dataset](http://www.openslr.org/12/) for instance.
- Go to directory
```bash
cd examples/tiny
```
Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (would take several days for training), please go to `examples/librispeech` instead.
- Source env
```bash
source path.sh
```
**Must do this before starting do anything.**
Set `MAIN_ROOT` as project dir. Using defualt `deepspeech2` model as default, you can change this in the script.
- Main entrypoint
```bash
bash run.sh
```
This just a demo, please make sure every `step` is work fine when do next `step`.
More detailed information are provided in the following sections. Wish you a happy journey with the *DeepSpeech on PaddlePaddle* ASR engine!
## Training a model
The key steps of training for Mandarin language are same to that of English language and we have also provided an example for Mandarin training with Aishell in ```examples/aishell/local```. As mentioned above, please execute ```sh data.sh```, ```sh train.sh```, ```sh test.sh``` and ```sh infer.sh``` to do data preparation, training, testing and inference correspondingly. We have also prepared a pre-trained model (downloaded by local/download_model.sh) for users to try with ```sh infer_golden.sh``` and ```sh test_golden.sh```. Notice that, different from English LM, the Mandarin LM is character-based and please run ```local/tune.sh``` to find an optimal setting.
## Speech-to-text Inference
An inference module caller `infer.py` is provided to infer, decode and visualize speech-to-text results for several given audio clips. It might help to have an intuitive and qualitative evaluation of the ASR model's performance.
```bash
CUDA_VISIBLE_DEVICES=0 bash local/infer.sh
```
We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) otherwise utilizes a heuristic breadth-first graph search for reaching a near global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with argument `decoding_method`.
## Evaluate a Model
To evaluate a model's performance quantitatively, please run:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/test.sh
```
The error rate (default: word error rate; can be set with `error_rate_type`) will be printed.
For more help on arguments:
## Hyper-parameters Tuning
The hyper-parameters $\alpha$ (language model weight) and $\beta$ (word insertion weight) for the [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) often have a significant impact on the decoder's performance. It would be better to re-tune them on the validation set when the acoustic model is renewed.
`tune.py` performs a 2-D grid search over the hyper-parameter $\alpha$ and $\beta$. You must provide the range of $\alpha$ and $\beta$, as well as the number of their attempts.
```bash
CUDA_VISIBLE_DEVICES=0 bash local/tune.sh
```
The grid search will print the WER (word error rate) or CER (character error rate) at each point in the hyper-parameters space, and draw the error surface optionally. A proper hyper-parameters range should include the global minima of the error surface for WER/CER, as illustrated in the following figure.
<p align="center">
<img src="images/tuning_error_surface.png" width=550>
<br/>An example error surface for tuning on the dev-clean set of LibriSpeech
</p>
Usually, as the figure shows, the variation of language model weight ($\alpha$) significantly affect the performance of CTC beam search decoder. And a better procedure is to first tune on serveral data batches (the number can be specified) to find out the proper range of hyper-parameters, then change to the whole validation set to carray out an accurate tuning.
After tuning, you can reset $\alpha$ and $\beta$ in the inference and evaluation modules to see if they really help improve the ASR performance. For more help

@ -1,6 +1,5 @@
# Deepspeech2
## Streaming
# Models introduction
## Streaming DeepSpeech2
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction RNN layers.
@ -14,21 +13,22 @@ In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the deepspeech2 online model
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the deepspeech2 online model
</p>
### Data Preparation
#### Vocabulary
For English data, the vocabulary dictionary is composed of the 26 English characters plus " ' ", space, \<blank\>, \<unk\> and \<eos\>. The \<blank\> represents the blank label in CTC, the \<unk\> represents the unknown character and the \<eos\> represents the start and end characters. For Mandarin, the vocabulary dictionary is composed of the Chinese characters collected from the training set, plus the same three additional characters: \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indices are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
```
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt
@ -40,36 +40,36 @@ vi examples/librispeech/s0/data/vocab.txt
#### CMVN
For CMVN, a subset (or all) of the training set is chosen and used to compute the feature mean and std, which are later applied to normalize the features (see the sketch below).
```
# The code to compute the feature mean and std
# The code to compute the feature mean and std
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
```
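The statistics written to `mean_std.json` are then used to standardize every feature dimension. A minimal sketch of this step (illustrative only, not the toolkit's own implementation; it assumes the mean/std vectors have already been loaded from the JSON file):
```python
import numpy as np

def apply_cmvn(feat: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    """feat: [num_frames, feat_dim]; mean/std: [feat_dim] statistics
    produced by compute_mean_std.py above."""
    # Subtract the global mean and divide by the global std per dimension.
    return (feat - mean) / (std + 1e-20)
```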
#### Feature Extraction
For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
Currently, the released deepspeech2 online model uses the linear feature extraction method.
```
The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
Currently, the released deepspeech2 online model uses the linear feature extraction method.
```
The code for feature extraction
vi paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
```
### Encoder
The encoder is composed of two 2D convolution subsampling layers and a number of stacked single-direction RNN layers. The 2D convolution subsampling layers extract feature representations from the raw audio features and reduce their length at the same time. After passing through the convolution subsampling layers, the feature representations are fed into the stacked RNN layers. For the stacked RNN layers, both LSTM and GRU cells are available. Adding one fully connected (fc) layer after the stacked RNN layers is optional; if the number of stacked RNN layers is less than 5, adding one fc layer is recommended.
The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
vi paddlespeech/s2t/models/ds2_online/deepspeech2.py
```
### Decoder
@ -78,16 +78,16 @@ To get the character possibilities of each frame, the feature representation of
The code of the decoder is in:
```
# The code of constructing the decoder in model
vi deepspeech/models/ds2_online/deepspeech2.py
vi paddlespeech/s2t/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
vi paddlespeech/s2t/modules/ctc.py
```
## Training Process
### Training Process
Using the command below, you can train the deepspeech2 online model.
```
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
@ -126,10 +126,10 @@ fi
By using the command above, the training process can be started. There are 5 stages in "run.sh", and the first 3 stages are used for the training process. Stage 0 is used for data preparation, in which the dataset is downloaded and the manifest files of the datasets, the vocabulary dictionary and the CMVN file are generated in "./data/". Stage 1 is used for training the model; the log files and model checkpoints are saved in "exp/deepspeech2_online/". Stage 2 is used to generate the final model for prediction by averaging the top-k model parameters based on validation loss.
## Testing Process
### Testing Process
Using the command below, you can test the deepspeech2 online model.
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
@ -138,7 +138,7 @@ avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
fi
@ -152,37 +152,35 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
fi
```
```
After the training process, we use stages 3, 4 and 5 for the testing process. Stage 3 tests the model generated in stage 2 and provides the CER on the test set. Stage 4 transforms the model from a dynamic graph to a static graph by using the "paddle.jit" library. Stage 5 tests the model in static-graph mode.
## Non-Streaming
## Non-Streaming DeepSpeech2
The deepspeech2 offline model is similar to the deepspeech2 online model. The main difference is that the offline model uses stacked bi-directional RNN layers, while the online model uses single-direction RNN layers and does not use the fc layer. For the stacked bi-directional RNN layers in the offline model, both vanilla RNN and GRU cells are available.
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the deepspeech2 offline model
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the deepspeech2 offline model
</p>
For data preparation and the decoder, the deepspeech2 offline model is the same as the deepspeech2 online model.
The code of encoder and decoder for deepspeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
vi paddlespeech/s2t/models/ds2/deepspeech2.py
```
The training and testing processes of the deepspeech2 offline model are very similar to those of the deepspeech2 online model.
Only a few changes should be noticed.
For training and testing, the "model_type" and the "conf_path" must be set accordingly.
```
```
# Training offline
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
```
```
```
# Testing offline
cd examples/aishell/s0

@ -0,0 +1,40 @@
# Quick Start of Speech-To-Text
Several shell scripts provided in `./examples/tiny/local` will help you quickly try out most major modules, including data preparation, model training, case inference and model evaluation, with a few public datasets (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead. Besides, if an out-of-memory problem occurs, just reduce `batch_size`.
Let's take a tiny sampled subset of [LibriSpeech dataset](http://www.openslr.org/12/) for instance.
- Go to directory
```bash
cd examples/tiny
```
Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (would take several days for training), please go to `examples/librispeech` instead.
- Source env
```bash
source path.sh
```
**You must do this before doing anything else.**
This sets `MAIN_ROOT` to the project dir and uses the default `deepspeech2` model as `MODEL`; you can change this in the script.
- Main entrypoint
```bash
bash run.sh
```
This is just a demo; please make sure every `step` works well before moving on to the next `step`.
More detailed information is provided in the following sections. Wish you a happy journey with the *DeepSpeech on PaddlePaddle* ASR engine!
## Training a model
The key steps of training for Mandarin are the same as those for English, and we have also provided an example for Mandarin training with Aishell in ```examples/aishell/local```. As mentioned above, please execute ```sh data.sh```, ```sh train.sh``` and ```sh test.sh``` to do data preparation, training, and testing respectively.
## Evaluate a Model
To evaluate a model's performance quantitatively, please run:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/test.sh
```
The error rate (default: word error rate; can be set with `error_rate_type`) will be printed.
We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) otherwise utilizes a heuristic breadth-first graph search for reaching a near global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with argument `decoding_method`.
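As an illustration of the greedy decoder (a sketch, not the toolkit's implementation), the best path is simply the arg-max token at each frame, with repeated tokens collapsed and the blank symbol (index 0 in the default vocabularies) removed:
```python
import numpy as np

def ctc_greedy_decode(log_probs: np.ndarray, blank_id: int = 0) -> list:
    """log_probs: [num_frames, vocab_size] frame-level (log-)posteriors."""
    best_path = np.argmax(log_probs, axis=1)     # most likely token per frame
    decoded, prev = [], None
    for token in best_path:
        if token != prev and token != blank_id:  # collapse repeats, drop blanks
            decoded.append(int(token))
        prev = token
    return decoded
```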

@ -1,8 +0,0 @@
# Reference
We refer these repos to build `model` and `engine`:
* [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
* [wenet](https://github.com/mobvoi/wenet)

@ -1,28 +0,0 @@
# Released Models
## Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
## Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
## Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

@ -23,11 +23,13 @@
import recommonmark.parser
import sphinx_rtd_theme
autodoc_mock_imports = ["soundfile", "librosa"]
# -- Project information -----------------------------------------------------
project = 'paddle speech'
copyright = '2021, Deepspeech-developers'
author = 'Deepspeech-developers'
copyright = '2021, paddlespeech-developers'
author = 'paddlespeech-developers'
# The full version, including alpha/beta/rc tags
release = '2.1'
@ -46,10 +48,10 @@ pygments_style = 'sphinx'
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx_rtd_theme',
"sphinx_rtd_theme",
'sphinx.ext.mathjax',
'sphinx.ext.autosummary',
'numpydoc',
'sphinx.ext.autosummary',
'myst_parser',
]
@ -76,6 +78,10 @@ smartquotes = False
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_logo = '../images/paddle.png'
html_css_files = [
'custom.css',
]
# -- Extension configuration -------------------------------------------------
# numpydoc_show_class_members = False

@ -0,0 +1,36 @@
# The Dependencies
## By apt-get
### The base dependencies:
```
bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libvorbis-dev libboost-dev swig python3-dev
```
### The dependencies of kenlm:
```
build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev gcc-5 g++-5
```
### The dependencies of sox:
```
libvorbis-dev libmp3lame-dev libmad-ocaml-dev
```
## By make or setup
```
kenlm
sox
mfa
openblas
kaldi
sctk
AutoLog
swig-decoder
python_kaldi_features
```

@ -1,7 +1,7 @@
Welcome to paddle Deepspeech documentation !
Welcome to PaddleSpeech documentation!
==============================================
**Deepspeech** is a Speech toolkits implemented by paddlepaddle.
**PaddleSpeech** is a speech toolkit implemented on the PaddlePaddle platform.
Contents
@ -10,34 +10,44 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Introduction
asr/deepspeech_architecture
introduction
.. toctree::
:maxdepth: 1
:caption: Getting_started
asr/install
asr/getting_started
:caption: Quick Start
install
asr/quick_start
tts/quick_start
.. toctree::
:maxdepth: 1
:caption: More Information
:caption: Speech-To-Text
asr/models_introduction
asr/data_preparation
asr/augmentation
asr/feature_list
asr/ngram_lm
asr/ngram_lm
.. toctree::
:maxdepth: 1
:caption: Released_model
:caption: Text-To-Speech
asr/released_model
tts/basic_usage
tts/advanced_usage
tts/zh_text_frontend
tts/models_introduction
tts/gan_vocoder
tts/demo
tts/demo_2
.. toctree::
:maxdepth: 1
:caption: Released Models
released_model
.. toctree::
:maxdepth: 1
@ -45,3 +55,8 @@ Contents
asr/reference

@ -8,7 +8,7 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
## Setup (Important)
- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox, and `swig`, e.g. installing them via `apt-get`:
- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`:
```bash
sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
@ -33,9 +33,9 @@ make install
```bash
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pushd tools; make; popd
pushd tools; make virtualenv.done; popd
source tools/venv/bin/activate
bash setup.sh
pip install -e .
```
- Source the venv before doing experiments.
@ -44,6 +44,14 @@ bash setup.sh
source tools/venv/bin/activate
```
## Simple Setup
```bash
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pip install -e .
```
## Running in Docker Container (optional)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure it is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

@ -0,0 +1,33 @@
# PaddleSpeech
## What is PaddleSpeech?
PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models.
## What can PaddleSpeech do?
### Speech-To-Text
(An introduction to ASR in PaddleSpeech is needed here!)
### Text-To-Speech
TTS mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
- Ready-to-run experiments.
PaddleSpeech TTS provides you with a complete TTS pipeline, including:
- Text FrontEnd
- Rule based Chinese frontend.
- Acoustic Models
- FastSpeech2
- SpeedySpeech
- TransformerTTS
- Tacotron2
- Vocoders
- Multi Band MelGAN
- Parallel WaveGAN
- WaveFlow
- Voice Cloning
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Text-To-Speech helps you to train TTS models with simple commands.

@ -0,0 +1,37 @@
# Reference
We borrowed a lot of code from these repos to build `model` and `engine`; thanks for their great work:
* [espnet](https://github.com/espnet/espnet/blob/master/LICENSE)
- Apache-2.0 License
- python/shell `utils`
- kaldi feat preprocessing
- datapipeline and `transform`
- a lot of TTS models, like `fastspeech2` and GAN-based vocoders
* [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE)
- Apache-2.0 License
- U2 model
- Building TLG based Graph
* [kaldi](https://github.com/kaldi-asr/kaldi/blob/master/COPYING)
- Apache-2.0 License
- shell/perl/python utils.
- feature bins.
- WFST based decoding for LM integration.
* [delta](https://github.com/Delta-ML/delta/blob/master/LICENSE)
- Apache-2.0 License
- `engine` arch
* [speechbrain](https://github.com/speechbrain/speechbrain/blob/develop/LICENSE)
- Apache-2.0 License
- ECAPA-TDNN SV model
* [chainer](https://github.com/chainer/chainer/blob/master/LICENSE)
- MIT License
- Updater, Trainer and more utils.
* [librosa](https://github.com/librosa/librosa/blob/main/LICENSE.md)
- ISC License
- Audio feature

@ -0,0 +1,55 @@
# Released Models
## Speech-To-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
### Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
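If you want to quickly sanity-check one of these binaries outside the decoder, here is a minimal sketch using the `kenlm` Python package (an assumption on tooling; the released files are standard KenLM 'trie'/'probing' binaries). The Mandarin models are char-based, so tokens are space-separated characters:
```python
import kenlm  # assumes the kenlm Python package is installed

# load one of the released binaries, e.g. the small Mandarin LM above
lm = kenlm.Model("zh_giga.no_cna_cmn.prune01244.klm")

# char-based LM: tokens are space-separated characters
sentence = "今 天 天 气 不 错"
print(lm.score(sentence, bos=True, eos=True))  # log10 probability of the sentence
print(lm.perplexity(sentence))
```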
## Text-To-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
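The pretrained checkpoints above are plain zip archives; a minimal sketch of fetching and unpacking one of them (using the FastSpeech2 CSMSC archive from the table above) could look like this:
```python
import urllib.request
import zipfile

# URL taken from the table above (FastSpeech2 trained on CSMSC)
url = "https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip"
archive = "fastspeech2_nosil_baker_ckpt_0.4.zip"

urllib.request.urlretrieve(url, archive)  # download the archive
with zipfile.ZipFile(archive) as zf:
    zf.extractall(".")  # yields default.yaml, snapshot_iter_*.pdz, *_stats.npy, ...
```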

@ -2,10 +2,11 @@
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.
<div align="center">
<img src="docs/images/logo.png" width=300 /> <br>
<img src="../../images/logo.png" width=300 /> <br>
</div>
## News <img src="./docs/images/news_icon.png" width="40"/>
## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).

@ -1,6 +1,5 @@
# Advanced Usage
This sections covers how to extend parakeet by implementing your own models and experiments. Guidelines on implementation are also elaborated.
This section covers how to extend PaddleSpeech TTS by implementing your own models and experiments. Guidelines on implementation are also elaborated.
For the general deep learning experiment, there are several parts to deal with:
1. Preprocess the data according to the needs of the model, and iterate the dataset by batch.
@ -8,7 +7,7 @@ For the general deep learning experiment, there are several parts to deal with:
3. Write out the training process (generally including forward / backward calculation, parameter update, log recording, visualization, periodic evaluation, etc.).
5. Configure and run the experiment.
## Parakeet's Model Components
## PaddleSpeech TTS's Model Components
In order to balance the reusability and function of models, we divide models into several types according to their characteristics.
For the commonly used modules that can be used as part of other larger models, we try to implement them as simply and universally as possible, because they will be reused. Modules with trainable parameters are generally implemented as subclasses of `paddle.nn.Layer`. Modules without trainable parameters can be directly implemented as a function whose inputs and outputs are `paddle.Tensor`s.
@ -68,11 +67,11 @@ There are two common ways to define a model which consists of several modules.
```
When a model is complicated and made up of several components, each of which has a separate functionality and can be replaced by other components with the same functionality, we prefer to define it in this way.
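A minimal sketch of this compositional style (hypothetical module names and sizes, not code from `paddlespeech.t2s`):
```python
import paddle
from paddle import nn

class ComposedModel(nn.Layer):
    """A model assembled from replaceable components with fixed roles."""

    def __init__(self, encoder: nn.Layer, decoder: nn.Layer):
        super().__init__()
        self.encoder = encoder  # any component mapping inputs to hidden states
        self.decoder = decoder  # any component mapping hidden states to outputs

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        return self.decoder(self.encoder(x))

# components can be swapped freely as long as the interfaces match
model = ComposedModel(encoder=nn.Linear(80, 256), decoder=nn.Linear(256, 80))
out = model(paddle.randn([4, 80]))
```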
In the directory structure of Parakeet, modules with high reusability are placed in `parakeet.modules`, but models for specific tasks are placed in `parakeet.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
In the directory structure of PaddleSpeech TTS, modules with high reusability are placed in `paddlespeech.t2s.modules`, while models for specific tasks are placed in `paddlespeech.t2s.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
## Parakeet's Data Components
## PaddleSpeech TTS's Data Components
Another critical component for a deep learning project is data.
Parakeet uses the following methods for training data:
PaddleSpeech TTS uses the following methods for training data:
1. Preprocess the data.
2. Load the preprocessed data for training.
@ -94,7 +93,7 @@ Then we need to select a format for saving metadata to the hard disk. There are
Meanwhile, `cache` is added here, and a multi-process Manager is used to share memory between multiple processes. When `num_workers` is used, it is guaranteed that each subprocess will not cache a separate copy.
The implementation of `DataTable` can be found in `parakeet/datasets/data_table.py`.
The implementation of `DataTable` can be found in `paddlespeech/t2s/datasets/data_table.py`.
```python
class DataTable(Dataset):
"""Dataset to load and convert data for general purpose.
@ -154,7 +153,7 @@ def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
return example
```
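As a rough, self-contained illustration of the idea (a simplification for this guide, not the actual `DataTable` implementation): metadata records are kept in memory and converter functions are applied per field when an example is requested.
```python
from typing import Any, Callable, Dict, List, Optional

from paddle.io import Dataset


class MiniDataTable(Dataset):
    """Simplified DataTable-like dataset: stores metadata records and applies
    per-field converter functions when an example is requested."""

    def __init__(self,
                 data: List[Dict[str, Any]],
                 converters: Optional[Dict[str, Callable[[Any], Any]]] = None):
        self.data = data
        self.converters = converters or {}

    def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
        # fields without a converter are passed through unchanged
        return {key: self.converters.get(key, lambda v: v)(value)
                for key, value in meta_datum.items()}

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        return self._convert(self.data[idx])

    def __len__(self) -> int:
        return len(self.data)


# e.g. load the waveform lazily from the path stored in the metadata:
# table = MiniDataTable(metadata, converters={"wave": np.load})
```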
## Parakeet's Training Components
## PaddleSpeech TTS's Training Components
A typical training process includes the following processes:
1. Iterate the dataset.
2. Process batch data.
@ -164,7 +163,7 @@ A typical training process includes the following processes:
6. Write logs, visualize, and in some cases save necessary intermediate results.
7. Save the state of the model and optimizer.
Here, we mainly introduce the training related components of Parakeet and why we designed it like this.
Here, we mainly introduce the training-related components of PaddleSpeech TTS and why we designed them like this.
### Global Reporter
When training and modifying Deep Learning models, logging is often needed, and it has even become key to model debugging and modification. We usually use various visualization tools, such as `visualdl` in `paddle`, `tensorboard` in `tensorflow`, and `visdom`, `wnb`, etc. Besides, `logging` and `print` are usually used for different purposes.
@ -180,9 +179,9 @@ We think this method is a little ugly. We prefer to return the necessary informa
It takes advantage of the globality of Python's module level variables and the effect of context manager.
There is a module level variable in `parakeet/training/reporter.py` `OBSERVATIONS`which is a `Dict` to store key-value.
There is a module-level variable `OBSERVATIONS` in `paddlespeech/t2s/training/reporter.py`, which is a `Dict` that stores key-value pairs.
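The mechanism can be sketched as a small self-contained toy version (a simplification for illustration, not the contents of `reporter.py`; an excerpt of the real file follows):
```python
import contextlib
from typing import Dict, Optional

OBSERVATIONS: Optional[Dict] = None  # module-level storage for reported values


@contextlib.contextmanager
def scope(observations: Dict):
    """Temporarily install `observations` as the active dict."""
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield
    finally:
        OBSERVATIONS = old


def report(name: str, value) -> None:
    """Called inside components; the value lands in whatever dict the caller scoped."""
    if OBSERVATIONS is not None:
        OBSERVATIONS[name] = value


# the caller decides where reported values go
observation = {}
with scope(observation):
    report("loss", 0.25)
print(observation)  # {'loss': 0.25}
```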
```python
# parakeet/training/reporter.py
# paddlespeech/t2s/training/reporter.py
@contextlib.contextmanager
def scope(observations):
@ -245,7 +244,7 @@ def test_reporter_scope():
In this way, when we write modular components, we can directly call `report`. The caller decides where to report: once an observation `Dict` is ready, it opens a `scope` with it and calls the component within this `scope`.
The `Trainer` in Parakeet report the information in this way.
The `Trainer` in PaddleSpeech TTS reports the information in this way.
```python
while True:
self.observation = {}
@ -269,7 +268,7 @@ We made an abstraction for these intermediate processes, that is, `Updater`, whi
### Visualizer
Because we choose observation as the communication mode, we can simply write the values in the observation into the `visualizer`.
## Parakeet's Configuration Components
## PaddleSpeech TTS's Configuration Components
Deep learning experiments often have many options to configure. These configurations can be roughly divided into several categories.
1. Data source and data processing mode configuration.
2. Save path configuration of experimental results.
@ -293,28 +292,26 @@ The following is the basic `ArgumentParser`:
3. `--output-dir` is the directory to save the training results. (If there are checkpoints in `checkpoints/` of `--output-dir`, the newest checkpoint is reloaded by default to resume training.)
4. `--device` and `--nprocs` determine the operation mode. `--device` specifies the type of running device: whether to run on `cpu` or `gpu`. `--nprocs` refers to the number of training processes; if `nprocs` > 1, multi-process parallel training is used. (Note: currently only GPU multi-card multi-process training is supported.)
Developers can refer to the examples in `Parakeet/examples` to write the default configuration file when adding new experiments.
Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments.
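A minimal sketch of such a parser, assuming standard `argparse` and the option names described above (`--config` is included here as an assumed option for the yaml config):
```python
import argparse


def default_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="default training arguments")
    parser.add_argument("--config", type=str,
                        help="path to the default yaml config (assumed option)")
    parser.add_argument("--output-dir", type=str, default="exp/default",
                        help="dir to save training results; the newest checkpoint "
                             "in <output-dir>/checkpoints is reloaded if present")
    parser.add_argument("--device", type=str, default="gpu", choices=["cpu", "gpu"],
                        help="type of running device")
    parser.add_argument("--nprocs", type=int, default=1,
                        help="number of training processes; >1 enables "
                             "multi-process (GPU only) parallel training")
    return parser


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print(args)
```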
## Parakeet's Experiment template
## PaddleSpeech TTS's Experiment template
The experimental codes in Parakeet are generally organized as follows:
The experimental codes in PaddleSpeech TTS are generally organized as follows:
```text
├── conf
│ └── default.yaml (defalut config)
├── README.md (help information)
├── batch_fn.py (organize metadata into batch)
├── config.py (code to read default config)
├── *_updater.py (Updater of a specific model)
├── preprocess.py (data preprocessing code)
├── preprocess.sh (script to call data preprocessing.py)
├── synthesis.py (synthesis from metadata)
├── synthesis.sh (script to call synthesis.py)
├── synthesis_e2e.py (synthesis from raw text)
├── synthesis_e2e.sh (script to call synthesis_e2e.py)
├── train.py (train code)
└── run.sh (script to call train.py)
.
├── README.md (help information)
├── conf
│ └── default.yaml (default config)
├── local
│ ├── preprocess.sh (script to call preprocess.py)
│ ├── synthesize.sh (script to call synthesis.py)
│ ├── synthesize_e2e.sh (script to call synthesis_e2e.py)
│ └── train.sh (script to call train.py)
├── path.sh (script that includes paths to be sourced)
└── run.sh (script to call the scripts in local)
```
The `*.py` files called by the above `*.sh` scripts are located in `${BIN_DIR}/`.
We add a named argument `--output-dir` to each training script to specify the output directory. The directory structure is as follows; it's best for developers to follow this specification:
```text
@ -330,4 +327,4 @@ exp/default/
└── test/ (output dir of synthesis results)
```
You can view the examples we provide in `Parakeet/examples`. These experiments are provided to users as examples which can be run directly. Users are welcome to add new models and experiments and contribute code to Parakeet.
You can view the examples we provide in `examples`. These experiments are provided to users as examples which can be run directly. Users are welcome to add new models and experiments and contribute code to PaddleSpeech.

@ -1,115 +0,0 @@
# Basic Usage
This section shows how to use pretrained models provided by parakeet and make inference with them.
Pretrained models in v0.4 are provided in a archive. Extract it to get a folder like this:
```
checkpoint_name/
├──default.yaml
├──snapshot_iter_76000.pdz
├──speech_stats.npy
└──phone_id_map.txt
```
`default.yaml` stores the config used to train the model.
`snapshot_iter_N.pdz` is the chechpoint file, where `N` is the steps it has been trained.
`*_stats.npy` is the stats file of feature if it has been normalized before training.
`phone_id_map.txt` is the map of phonemes to phoneme_ids.
The example code below shows how to use the models for prediction.
## Acoustic Models (text to spectrogram)
The code below show how to use a `FastSpeech2` model. After loading the pretrained model, use it and normalizer object to construct a prediction objectthen use fastspeech2_inferencet(phone_ids) to generate spectrograms, which can be further used to synthesize raw audio with a vocoder.
```python
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.modules.normalizer import ZScore
# Parakeet/examples/fastspeech2/baker/frontend.py
from frontend import Frontend
# load the pretrained model
checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4")
with open(checkpoint_dir / "phone_id_map.txt", "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
with open(checkpoint_dir / "default.yaml") as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
# load stats file
stat = np.load(checkpoint_dir / "speech_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
# construct a prediction object
fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
# load Chinese Frontend
frontend = Frontend(checkpoint_dir / "phone_id_map.txt")
# text to spectrogram
sentence = "你好吗?"
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
# The output of Chinese text frontend is segmented
for part_phone_ids in phone_ids:
with paddle.no_grad():
temp_mel = fastspeech2_inference(part_phone_ids)
if flags == 0:
mel = temp_mel
flags = 1
else:
mel = paddle.concat([mel, temp_mel])
```
## Vocoder (spectrogram to wave)
The code below show how to use a ` Parallel WaveGAN` model. Like the example above, after loading the pretrained model, use it and normalizer object to construct a prediction objectthen use pwg_inference(mel) to generate raw audio (in wav format).
```python
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
# load the pretrained model
checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")
with open(checkpoint_dir / "pwg_default.yaml") as f:
pwg_config = CfgNode(yaml.safe_load(f))
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
# load stats file
stat = np.load(checkpoint_dir / "pwg_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
# construct a prediction object
pwg_inference = PWGInference(pwg_normalizer, vocoder)
# spectrogram to wave
wav = pwg_inference(mel)
sf.write(
audio_path,
wav.numpy(),
samplerate=fastspeech2_config.fs)
```

File diff suppressed because it is too large

@ -0,0 +1,287 @@
Audio Sample (PaddleSpeech TTS VS ESPnet TTS)
==============================================
This is an audio demo page to contrast PaddleSpeech TTS and ESPnet TTS. We use their respective modules (Text Frontend, Acoustic Model and Vocoder) here.
We use ESPnet's released models here.
FastSpeech2 + Parallel WaveGAN in CSMSC
.. raw:: html
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Text </th>
<th align="center"> Espent TTS </th>
<th align="center"> PaddleSpeech TTS </th>
</tr>
<tr>
<td>早上好今天是2020/10/29最低温度是-3°C。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>你好我的编号是37249很高兴为您服务。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我们公司有37249个人。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我出生于2005年10月8日。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我们习惯在12:30吃中午饭。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>只要有超过3/4的人投票同意你就会成为我们的新班长。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我要买一只价值999.9元的手表。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我的手机号是18544139121欢迎来电。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>明天有62%的概率降雨。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>手表厂有五种好产品。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>跑马场有五百匹很勇敢的千里马。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/011.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/011.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/012.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/012.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>史小姐拿着小雨伞去找她的老保姆了。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/013.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/013.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>不要相信这个老奶奶说的话,她一点儿也不好。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/014.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/014.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
</div>

@ -0,0 +1,9 @@
# GAN Vocoders
This is a brief introduction to GAN vocoders; we mainly introduce the losses of different vocoders here.
Model | Generator Loss |Discriminator Loss
:-------------:| :------------:| :-----
Parallel Wave GAN| adversarial loss <br> Feature Matching | Multi-Scale Discriminator |
Mel GAN |adversarial loss <br> Multi-resolution STFT loss | adversarial loss|
Multi-Band Mel GAN | adversarial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|
HiFi GAN |adversarial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator |
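As a rough illustration of the adversarial terms above, one common (LSGAN-style) formulation is sketched below; the actual vocoders combine this with their own auxiliary losses (multi-resolution STFT, feature matching, mel-spectrogram loss), so this is not the exact code of any of them.
```python
import paddle
import paddle.nn.functional as F


def generator_adversarial_loss(d_fake: paddle.Tensor) -> paddle.Tensor:
    # LSGAN-style: push the discriminator output on generated audio towards 1
    return F.mse_loss(d_fake, paddle.ones_like(d_fake))


def discriminator_adversarial_loss(d_real: paddle.Tensor,
                                   d_fake: paddle.Tensor) -> paddle.Tensor:
    # push D(real) towards 1 and D(fake) towards 0
    real_loss = F.mse_loss(d_real, paddle.ones_like(d_real))
    fake_loss = F.mse_loss(d_fake, paddle.zeros_like(d_fake))
    return real_loss + fake_loss
```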

@ -1,45 +0,0 @@
.. parakeet documentation master file, created by
sphinx-quickstart on Fri Sep 10 14:22:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Parakeet
====================================
``parakeet`` is a deep learning based text-to-speech toolkit built upon ``paddlepaddle`` framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
``parakeet`` mainly consists of components below.
#. Implementation of models and commonly used neural network layers.
#. Dataset abstraction and common data preprocessing pipelines.
#. Ready-to-run experiments.
.. toctree::
:maxdepth: 1
:caption: Introduction
introduction
.. toctree::
:maxdepth: 1
:caption: Getting started
install
basic_usage
advanced_usage
cn_text_frontend
released_models
.. toctree::
:maxdepth: 1
:caption: Demos
demo
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

@ -1,47 +0,0 @@
# Installation
## Install PaddlePaddle
Parakeet requires PaddlePaddle as its backend. Note that 2.1.2 or newer versions of paddle is required.
Since paddlepaddle has multiple packages depending on the device (cpu or gpu) and the dependency libraries, it is recommended to install a proper package of paddlepaddle with respect to the device and dependency library versons via `pip`.
Installing paddlepaddle with conda or build paddlepaddle from source is also supported. Please refer to [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) for more details.
Example instruction to install paddlepaddle via pip is listed below.
### PaddlePaddle with GPU
```python
# PaddlePaddle for CUDA10.1
python -m pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# PaddlePaddle for CUDA10.2
python -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
# PaddlePaddle for CUDA11.0
python -m pip install paddlepaddle-gpu==2.1.2.post110 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# PaddlePaddle for CUDA11.2
python -m pip install paddlepaddle-gpu==2.1.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
```
### PaddlePaddle with CPU
```python
python -m pip install paddlepaddle==2.1.2 -i https://mirror.baidu.com/pypi/simple
```
## Install libsndfile
Experimemts in parakeet often involve audio and spectrum processing, thus `librosa` and `soundfile` are required. `soundfile` requires a extra C library `libsndfile`, which is not always handled by pip.
For Windows and Mac users, `libsndfile` is also installed when installing `soundfile` via pip, but for Linux users, installing `libsndfile` via system package manager is required. Example commands for popular distributions are listed below.
```bash
# ubuntu, debian
sudo apt-get install libsndfile1
# centos, fedora
sudo yum install libsndfile
# openSUSE
sudo zypper in libsndfile
```
For any problem with installtion of soundfile, please refer to [SoundFile](https://pypi.org/project/SoundFile/).
## Install Parakeet
There are two ways to install parakeet according to the purpose of using it.
1. If you want to run experiments provided by parakeet or add new models and experiments, it is recommended to clone the project from github (Parakeet), and install it in editable mode.
```python
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

@ -1,27 +0,0 @@
# Parakeet - PAddle PARAllel text-to-speech toolKIT
## What is Parakeet?
Parakeet is a deep learning based text-to-speech toolkit built upon paddlepaddle framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by Baidu Research and other research groups.
## What can Parakeet do?
Parakeet mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
- Ready-to-run experiments.
Parakeet provides you with a complete TTS pipeline, including:
- Text FrontEnd
- Rule based Chinese frontend.
- Acoustic Models
- FastSpeech2
- SpeedySpeech
- TransformerTTS
- Tacotron2
- Vocoders
- Parallel WaveGAN
- WaveFlow
- Voice Cloning
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Parakeet helps you to train TTS models with simple commands.

@ -1,12 +1,12 @@
# Released Models
TTS system mainly includes three modules: `text frontend`, `Acoustic model` and `Vocoder`. We introduce a rule based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable models.
# Models introduction
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We introduce a rule-based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable models.
The main processes of TTS include:
1. Convert the original text into characters/phonemes, through `text frontend` module.
2. Convert characters/phonemes into acoustic features, such as linear spectrogram, mel spectrogram, LPC features, etc., through `Acoustic models`.
3. Convert acoustic features into waveforms through `Vocoders`.
A simple text frontend module can be implemented by rules. Acoustic models and vocoders need to be trained. The models provided by Parakeet are acoustic models and vocoders.
A simple text frontend module can be implemented by rules. Acoustic models and vocoders need to be trained. The models provided by PaddleSpeech TTS are acoustic models and vocoders.
## Acoustic Models
### Modeling Objectives of Acoustic Models
@ -27,14 +27,14 @@ At present, there are two mainstream acoustic model structures.
- Acoustic decoder (N Frames - > N Frames).
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/frame_level_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
</div>
- Sequence to sequence acoustic model:
- M Tokens - > N Frames.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/seq2seq_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
</div>
### Tacotron2
@ -54,7 +54,7 @@ At present, there are two mainstream acoustic model structures.
- CBHG postprocess.
- Vocoder: Griffin-Lim.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/tacotron.png" width=700 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
</div>
**Advantage of Tacotron:**
@ -89,10 +89,10 @@ At present, there are two mainstream acoustic model structures.
- The alignment matrix of the previous time step is considered at step `t` of the decoder.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/tacotron2.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
</div>
You can find Parakeet's tacotron2 example at `Parakeet/examples/tacotron2`.
You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0).
### TransformerTTS
**Disadvantages of the Tacotrons:**
@ -118,7 +118,7 @@ Transformer TTS is a combination of Tacotron2 and Transformer.
- Positional Encoding.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/transformer.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer.png" width=500 /> <br>
</div>
#### Transformer TTS
@ -138,7 +138,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- Uniform scale position encoding may have a negative impact on input or output sequences.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/transformer_tts.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
</div>
**Disadvantages of Transformer TTS:**
@ -146,7 +146,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- The ability to perceive local information is weak, and local information is more related to pronunciation.
- Stability is worse than Tacotron2.
You can find Parakeet's Transformer TTS example at `Parakeet/examples/transformer_tts`.
You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1).
### FastSpeech2
@ -184,14 +184,14 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
• Can be generated in parallel (decoding time is less affected by sequence length)
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastspeech.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
</div>
#### FastPitch
[FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastpitch.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
</div>
#### FastSpeech2
@ -209,10 +209,10 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
FastSpeech2 is similar to FastPitch but introduces more variation information of speech.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastspeech2.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
</div>
You can find Parakeet's FastSpeech2/FastPitch example at `Parakeet/examples/fastspeech2`, We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.
You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3). We use token-averaged pitch and energy values introduced in FastPitch rather than frame-level ones in FastSpeech2.
### SpeedySpeech
[SpeedySpeech](https://arxiv.org/abs/2008.03802) simplify the teacher-student architecture of FastSpeech and provide a fast and stable training procedure.
@ -223,10 +223,10 @@ You can find Parakeet's FastSpeech2/FastPitch example at `Parakeet/examples/fast
- Describe a simple data augmentation technique that can be used early in the training to make the teacher network robust to sequential error propagation.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/speedyspeech.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
</div>
You can find Parakeet's SpeedySpeech example at `Parakeet/examples/speedyspeech/baker`.
You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2).
## Vocoders
In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform.
@ -276,7 +276,7 @@ Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Paralle
- It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smaller than WaveGlow (87.9M).
- It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development.
You can find Parakeet's WaveFlow example at `Parakeet/examples/waveflow`.
You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
### Parallel WaveGAN
[Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method.
@ -286,10 +286,10 @@ You can find Parakeet's WaveFlow example at `Parakeet/examples/waveflow`.
- Use non-causal convolution instead of causal convolution.
- The input is random Gaussian white noise.
- The model is non-autoregressive both in training and prediction, which is fast
- Multi-resolution STFT loss.
- Multi-resolution STFT loss.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/pwg.png" width=600 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/pwg.png" width=600 /> <br>
</div>
You can find Parakeet's Parallel WaveGAN example at `Parakeet/examples/parallelwave_gan/baker`.
You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1).

@ -0,0 +1,193 @@
# Quick Start of Text-To-Speech
The examples in PaddleSpeech are mainly classified by datasets; the TTS datasets we mainly use are:
* CSMSC (Mandarin single speaker)
* AISHELL3 (Mandarin multiple speakers)
* LJSpeech (English single speaker)
* VCTK (English multiple speakers)
The models in PaddleSpeech TTS have the following mapping relationship:
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* vc0 - Tacotron2 Voice Clone with GE2E
## Quick Start
Let's take FastSpeech2 + Parallel WaveGAN with the CSMSC dataset as an example ([examples/csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc)).
### Train Parallel WaveGAN with CSMSC
- Go to directory
```bash
cd examples/csmsc/voc1
```
- Source env
```bash
source path.sh
```
**Must do this before you start to do anything.**
This sets `MAIN_ROOT` as the project dir and uses the `parallelwave_gan` model as `MODEL`.
- Main entrypoint
```bash
bash run.sh
```
This is just a demo; please make sure the source data has been prepared well and every `step` works well before moving to the next `step`.
### Train FastSpeech2 with CSMSC
- Go to directory
```bash
cd examples/csmsc/tts3
```
- Source env
```bash
source path.sh
```
**Must do this before you start to do anything.**
This sets `MAIN_ROOT` as the project dir and uses the `fastspeech2` model as `MODEL`.
- Main entrypoint
```bash
bash run.sh
```
This is just a demo; please make sure the source data has been prepared well and every `step` works well before moving to the next `step`.
The steps in `run.sh` mainly include:
- source path.
- preprocess the dataset.
- train the model.
- synthesize waveform from metadata.jsonl.
- synthesize waveform from a text file (for acoustic models).
- inference using static model. (optional)
For more details, you can see the `README.md` in each example.
## Pipeline of TTS
This section shows how to use pretrained models provided by TTS and make inference with them.
Pretrained models in TTS are provided in an archive. Extract it to get a folder like this:
**Acoustic Models:**
```text
checkpoint_name
├── default.yaml
├── snapshot_iter_*.pdz
├── speech_stats.npy
├── phone_id_map.txt
├── spk_id_map.txt (optional)
└── tone_id_map.txt (optional)
```
**Vocoders:**
```text
checkpoint_name
├── default.yaml
├── snapshot_iter_*.pdz
└── stats.npy
```
- `default.yaml` stores the config used to train the model.
- `snapshot_iter_*.pdz` is the checkpoint file, where `*` is the number of steps the model has been trained for.
- `*_stats.npy` is the stats file of the feature if it has been normalized before training.
- `phone_id_map.txt` is the map of phonemes to phoneme_ids.
- `tone_id_map.txt` is the map of tones to tone_ids, used when you split tones and phones before training acoustic models (for example, in our csmsc/speedyspeech example).
- `spk_id_map.txt` is the map of speakers to spk_ids in multi-speaker acoustic models (for example, in our aishell3/fastspeech2 example).
The example code below shows how to use the models for prediction.
### Acoustic Models (text to spectrogram)
The code below shows how to use a `FastSpeech2` model. After loading the pretrained model, use it and the normalizer object to construct a prediction object, then use `fastspeech2_inference(phone_ids)` to generate spectrograms, which can be further used to synthesize raw audio with a vocoder.
```python
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore
# examples/fastspeech2/baker/frontend.py
from frontend import Frontend
# load the pretrained model
checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4")
with open(checkpoint_dir / "phone_id_map.txt", "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
with open(checkpoint_dir / "default.yaml") as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
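# `args.fastspeech2_checkpoint` is assumed to be the path to the
# `snapshot_iter_*.pdz` file from the checkpoint archive described above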
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
# load stats file
stat = np.load(checkpoint_dir / "speech_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
# construct a prediction object
fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
# load Chinese Frontend
frontend = Frontend(checkpoint_dir / "phone_id_map.txt")
# text to spectrogram
sentence = "你好吗?"
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
# The output of Chinese text frontend is segmented
for part_phone_ids in phone_ids:
with paddle.no_grad():
temp_mel = fastspeech2_inference(part_phone_ids)
if flags == 0:
mel = temp_mel
flags = 1
else:
mel = paddle.concat([mel, temp_mel])
```
### Vocoder (spectrogram to wave)
The code below shows how to use a `Parallel WaveGAN` model. Like the example above, after loading the pretrained model, use it and the normalizer object to construct a prediction object, then use `pwg_inference(mel)` to generate raw audio (in wav format).
```python
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
# load the pretrained model
checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")
with open(checkpoint_dir / "pwg_default.yaml") as f:
pwg_config = CfgNode(yaml.safe_load(f))
vocoder = PWGGenerator(**pwg_config["generator_params"])
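# `args.pwg_params` is assumed to be the path to the Parallel WaveGAN
# `snapshot_iter_*.pdz` checkpoint from the archive described above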
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
# load stats file
stat = np.load(checkpoint_dir / "pwg_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
# construct a prediction object
pwg_inference = PWGInference(pwg_normalizer, vocoder)
# spectrogram to wave
wav = pwg_inference(mel)
sf.write(
audio_path,
wav.numpy(),
samplerate=fastspeech2_config.fs)
```

@ -0,0 +1,14 @@
001 早上好今天是2020/10/29最低温度是-3°C。
002 你好我的编号是37249很高兴为您服务。
003 我们公司有37249个人。
004 我出生于2005年10月8日。
005 我们习惯在12:30吃中午饭。
006 只要有超过3/4的人投票同意你就会成为我们的新班长。
007 我要买一只价值999.9元的手表。
008 我的手机号是18544139121欢迎来电。
009 明天有62%的概率降雨。
010 手表厂有五种好产品。
011 跑马场有五百匹很勇敢的千里马。
012 有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。
013 史小姐拿着小雨伞去找她的老保姆了。
014 不要相信这个老奶奶说的话,她一点儿也不好。

@ -1,5 +1,5 @@
# Chinese Rule Based Text Frontend
TTS system mainly includes three modules: `text frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in Parakeet, see exapmle in `Parakeet/examples/text_frontend/`.
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS; see the example in [examples/other/text_frontend/](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/text_frontend).
A text frontend module mainly includes:
- Text Segmentation

@ -5,7 +5,8 @@
## Data
| Data Subset | Duration in Seconds |
| data/manifest.train | 1.23 ~ 14.53125 |
| data/manifest.dev | 1.645 ~ 12.533 |
| data/manifest.test | 1.859125 ~ 14.6999375 |
| Data Subset | Duration in Seconds |
| ------------------- | --------------------- |
| data/manifest.train | 1.23 ~ 14.53125 |
| data/manifest.dev | 1.645 ~ 12.533 |
| data/manifest.test | 1.859125 ~ 14.6999375 |

@ -4,9 +4,7 @@
| Model | Params | Release | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 6.016139030456543 | 0.066549 |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 58.4M | 7181e427 | conf/deepspeech2.yaml + spec aug | test | 5.71956205368042 | 0.064287 |
| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.71956205368042 | 0.064287 |
| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |

@ -41,7 +41,7 @@ model:
use_gru: True
share_rnn_weights: False
blank_id: 0
ctc_grad_norm_type: instance
ctc_grad_norm_type: instance
training:
n_epoch: 80

@ -43,7 +43,7 @@ model:
fc_layers_size_list: -1,
use_gru: False
blank_id: 0
ctc_grad_norm_type: instance
ctc_grad_norm_type: null
training:
n_epoch: 50

@ -13,7 +13,7 @@ ckpt_prefix=$2
model_type=$3
# download language model
bash local/download_lm_ch.sh
bash local/download_lm_ch.sh > /dev/null 2>&1
if [ $? -ne 0 ]; then
exit 1
fi

@ -13,7 +13,7 @@ jit_model_export_path=$2
model_type=$3
# download language model
bash local/download_lm_ch.sh
bash local/download_lm_ch.sh > /dev/null 2>&1
if [ $? -ne 0 ]; then
exit 1
fi

@ -11,4 +11,4 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -77,7 +77,7 @@ model:
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -72,7 +72,7 @@ model:
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -0,0 +1,47 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix audio_file"
exit -1
fi
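# count the comma-separated device ids in CUDA_VISIBLE_DEVICES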
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
audio_file=$3
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0

@ -12,7 +12,7 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# srilm

@ -1,6 +1,6 @@
#!/bin/bash
set -e
source path.sh
set -e
stage=0
stop_stage=100
@ -13,6 +13,8 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
audio_file="data/tmp.wav"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
@ -46,5 +48,10 @@ fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# train lm and build TLG
./local/tlg.sh --corpus aishell --lmtype srilm
./local/tlg.sh --corpus aishell --lmtype srilm
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

@ -1,4 +1,11 @@
# Aishell3
* tts0 - fastspeech2
* vc0 - tactron2 voice clone
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* vc0 - Tacotron2 Voice Clone with GE2E

@ -17,13 +17,24 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo.
### Preprocess the dataset
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
    - synthesize waveform from a text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
@ -47,17 +58,17 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
There is also a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, text_lengths, speech_lengths, durations, paths of the speech, pitch and energy features, the speaker, and the id of each utterance.
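A quick way to inspect that metadata is to iterate over the jsonl file. In the sketch below, the path and the field names are assumptions based on the description above, and it uses the third-party `jsonlines` package.
```python
import jsonlines  # third-party: pip install jsonlines

# Path and field names are assumptions based on the description above.
with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
    for record in reader:
        # one record per utterance: phones, lengths, durations, feature paths, speaker, id
        print(record["utt_id"], record["speaker"], record["speech_lengths"])
        break
```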
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
[--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
[--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT]
Train a FastSpeech2 model.
@ -70,8 +81,7 @@ optional arguments:
dev data.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--nprocs NPROCS number of processes.
--ngpu NGPU if ngpu=0, use cpu.
--verbose VERBOSE verbose.
--phones-dict PHONES_DICT
phone vocabulary file.
@ -81,25 +91,12 @@ optional arguments:
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
4. `--ngpu` is the number of gpus to use; if `ngpu` == 0, the CPU is used.
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
@ -111,9 +108,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@ -123,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
[--ngpu NGPU] [--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
@ -150,25 +147,25 @@ optional arguments:
test metadata.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--verbose VERBOSE verbose.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from a text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--device DEVICE]
[--verbose VERBOSE]
usage: multi_spk_synthesize_e2e.py [-h]
[--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
@ -194,7 +191,7 @@ optional arguments:
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
```
1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model.
@ -202,26 +199,39 @@ optional arguments:
3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
4. `--text` is the text file, which contains sentences to synthesize.
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
6. `--ngpu` is the number of gpus to use; if `ngpu` == 0, the CPU is used.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
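These files can be consumed the same way as the vocoder files in the synthesis snippet earlier on this page. The brief sketch below omits the model construction itself, and unpacking `speech_stats.npy` into mean/std mirrors that snippet rather than a documented API.
```python
from pathlib import Path

import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

ckpt_dir = Path("fastspeech2_nosil_aishell3_ckpt_0.4")
# training config used for the pretrained FastSpeech2 model
with open(ckpt_dir / "default.yaml") as f:
    fastspeech2_config = CfgNode(yaml.safe_load(f))
# statistics used to normalize the spectrogram during training
mu, std = np.load(ckpt_dir / "speech_stats.npy")
mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)
print(fastspeech2_config.fs, mu.shape)
```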
You can use the following script to synthesize sentences from `${BIN_DIR}/../sentences.txt` using the pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```
## Future work
A multi-speaker vocoder is needed.

Some files were not shown because too many files have changed in this diff.
