pull/1037/head
huangyuxin 3 years ago
commit ef27a0e18a

audio/.gitignore

@@ -1,7 +0,0 @@
.ipynb_checkpoints/**
*.ipynb
nohup.out
__pycache__/
*.wav
*.m4a
obsolete/**

@@ -1,45 +0,0 @@
repos:
- repo: local
hooks:
- id: yapf
name: yapf
entry: yapf
language: system
args: [-i, --style .style.yapf]
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.8.0
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]
- id: isort
name: isort (pyi)
types: [pyi]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
args:
- --count
- --select=E9,F63,F7,F82
- --show-source
- --statistics
files: \.py$

@@ -1,3 +0,0 @@
[style]
based_on_style = pep8
column_limit = 80

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@@ -1,37 +0,0 @@
# PaddleAudio: The audio library for PaddlePaddle
## Introduction
PaddleAudio is an audio toolkit that speeds up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
## Features
- Spectrogram and related features are compatible with librosa.
- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
- Ready-to-use audio embeddings with a single line of code; sound embeddings are available now, with more on the roadmap.
- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
## Install
```
git clone https://github.com/PaddlePaddle/models
cd models/PaddleAudio
pip install .
```
## Quick start
### Audio loading and feature extraction
```
import paddleaudio as pa
s,r = pa.load(f)
mel_spect = pa.melspectrogram(s,sr=r)
```
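Because the spectrogram functions follow librosa's conventions (see Features above), their output can be checked directly against librosa. Below is a minimal sketch of such a check, assuming librosa is installed; the file name `test.wav` and the parameter values are placeholders:
```python
import librosa
import numpy as np
import paddleaudio as pa

# Load the same file once, at a fixed sample rate.
s, r = pa.load('test.wav', sr=16000)

# Mel spectrograms computed with matching parameters in both libraries.
a = pa.melspectrogram(s, sr=r, window_size=512, hop_length=320,
                      n_mels=64, fmin=50, to_db=False)
b = librosa.feature.melspectrogram(y=s, sr=r, n_fft=512, win_length=512,
                                   hop_length=320, n_mels=64, fmin=50)

# The two results should agree up to numerical precision.
print(np.allclose(a, b, atol=1e-5))
```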
### Examples
We provide a set of examples to help you get started with PaddleAudio quickly.
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
Please refer to [example directory](./examples) for more details.

@@ -1,128 +0,0 @@
# Audio Tagging
Sound classification is a single-label classification task, but a single audio clip can carry multiple labels. For example, a recording made in an ordinary indoor office may contain people talking, keyboard strokes, mouse clicks, and other background sounds of the room. For general-purpose sound recognition and sound detection, predicting multiple labels for one audio clip is therefore highly practical.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). It contains 632 audio classes and 2,084,320 human-labeled 10-second sound clips sourced from YouTube videos. The dataset now covers 2.1 million annotated videos and 5,800 hours of audio, with 527 label classes for the annotated sound samples.
`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on the Audioset dataset. Their pre-training task is multi-label sound recognition, so they can be used for real-time audio tagging.
This example uses a pre-trained `PANNs` model to tag input audio in real time against the Audioset label set, and outputs, as text, the top-k classes and their scores at each time step.
## Model overview
PaddleAudio provides pre-trained CNN14, CNN10 and CNN6 PANNs models to choose from:
- CNN14: mainly 12 convolutional layers and 2 fully connected layers; 79.6M parameters; embedding dimension 2048.
- CNN10: mainly 8 convolutional layers and 2 fully connected layers; 4.9M parameters; embedding dimension 512.
- CNN6: mainly 4 convolutional layers and 2 fully connected layers; 4.5M parameters; embedding dimension 512.
## Quick start
### Model inference
```shell
export CUDA_VISIBLE_DEVICES=0
python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
```
Configurable arguments:
- `device`: Device to run inference on, either cpu or gpu; defaults to gpu. When gpu is selected, pick the card via `CUDA_VISIBLE_DEVICES`, as in the command above.
- `wav`: Audio file to run prediction on.
- `sample_duration`: Duration (in seconds) of each audio segment the model predicts on; defaults to 2s.
- `hop_duration`: Time interval (in seconds) between two consecutive prediction segments; defaults to 0.3s.
- `output_dir`: Directory where the prediction results are saved; defaults to `./output_dir`.
The example above uses the `CNN14` pre-trained model. To switch to another pre-trained model, use:
```python
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
model = cnn14(pretrained=True, extract_embedding=False)
# CNN10
model = cnn10(pretrained=True, extract_embedding=False)
# CNN6
model = cnn6(pretrained=True, extract_embedding=False)
```
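Each of these backbones can also return the audio embedding instead of the 527 Audioset scores, assuming it is constructed with `extract_embedding=True` (2048-dimensional for CNN14, 512 for CNN10/CNN6, per the model overview above). A minimal sketch that mirrors the feature pipeline in `audio_tag.py`, reusing the `./cat.wav` sample from the command above:
```python
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14

model = cnn14(pretrained=True, extract_embedding=True)
model.eval()

waveform, sr = load_audio('./cat.wav', sr=None)
feats = melspectrogram(waveform, sr).transpose()            # (num_frames, num_melbins)
feats = paddle.to_tensor(feats).unsqueeze(0).unsqueeze(0)   # (1, 1, num_frames, num_melbins)

embedding = model(feats)  # expected shape: [1, 2048] for CNN14
print(embedding.shape)
```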
Running the tagging command above produces output like:
```
[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
```
The scores are saved to an `.npz` file in `output_dir`.
### Generating the tagging label text
```shell
python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
```
Configurable arguments:
- `tagging_file`: File containing the model prediction results.
- `top_k`: Number of highest-scoring labels to keep from the predictions; defaults to 10.
- `smooth`: Whether to apply posterior smoothing to the prediction scores; defaults to True (smoothing applied).
- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
- `label_file`: Text file with the Audioset class names corresponding to the model predictions.
- `output_dir`: Directory where the label text is saved; defaults to `./output_dir`.
Output:
```
[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
```
The text results are saved to a `.txt` file in `output_dir`.
## Tagging label text
The final text output looks like the example below.
The top-k results for each time window are separated by a blank line. Within each block, the first line is the time information: the number marks the starting point of the tagging result, expressed as the ratio of the current time `t` to the total audio length `T`. The following k lines give the corresponding labels and their scores.
```
0.0
Cat: 0.9144676923751831
Animal: 0.8855036497116089
Domestic animals, pets: 0.804577112197876
Meow: 0.7422927021980286
Music: 0.19959309697151184
Inside, small room: 0.12550437450408936
Caterwaul: 0.021584441885352135
Purr: 0.020247288048267365
Speech: 0.018197158351540565
Vehicle: 0.007446660194545984
0.059197544398158296
Cat: 0.9250872135162354
Animal: 0.8957151174545288
Domestic animals, pets: 0.8228275775909424
Meow: 0.7650775909423828
Music: 0.20210561156272888
Inside, small room: 0.12290887534618378
Caterwaul: 0.029371455311775208
Purr: 0.018731823191046715
Speech: 0.017130598425865173
Vehicle: 0.007748497650027275
0.11839508879631659
Cat: 0.9336574673652649
Animal: 0.9111202359199524
Domestic animals, pets: 0.8349071145057678
Meow: 0.7761964797973633
Music: 0.20467285811901093
Inside, small room: 0.10709915310144424
Caterwaul: 0.05370649695396423
Purr: 0.018830426037311554
Speech: 0.017361722886562347
Vehicle: 0.006929398979991674
...
...
```
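If the results are needed programmatically rather than as text, the blocks above can be parsed back into (start time, labels) pairs. A minimal sketch, assuming the `.txt` layout shown above and a known total audio length `T` in seconds:
```python
def parse_tagging_txt(path, total_seconds):
    """Parse blank-line separated blocks: a time-ratio line followed by 'Label: score' lines."""
    results = []
    with open(path) as f:
        for block in f.read().strip().split('\n\n'):
            lines = block.splitlines()
            ratio = float(lines[0])           # start time as a fraction of T
            start = ratio * total_seconds     # convert the ratio back to seconds
            labels = []
            for line in lines[1:]:
                label, score = line.rsplit(': ', 1)
                labels.append((label, float(score)))
            results.append((start, labels))
    return results

# e.g. for a clip known to be 10 seconds long:
# parse_tagging_txt('./output_dir/audioset_tagging_sr_44100.txt', total_seconds=10.0)
```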
The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows the tagging labels rendered onto a video, with multi-label predictions made on the audio in real time.
![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)

@@ -1,527 +0,0 @@
Speech
Male speech, man speaking
Female speech, woman speaking
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Battle cry
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
Baby cry, infant cry
Whimper
Wail, moan
Sigh
Singing
Choir
Yodeling
Chant
Mantra
Male singing
Female singing
Child singing
Synthetic singing
Rapping
Humming
Groan
Grunt
Whistling
Breathing
Wheeze
Snoring
Gasp
Pant
Snort
Cough
Throat clearing
Sneeze
Sniff
Run
Shuffle
Walk, footsteps
Chewing, mastication
Biting
Gargling
Stomach rumble
Burping, eructation
Hiccup
Fart
Hands
Finger snapping
Clapping
Heart sounds, heartbeat
Heart murmur
Cheering
Applause
Chatter
Crowd
Hubbub, speech noise, speech babble
Children playing
Animal
Domestic animals, pets
Dog
Bark
Yip
Howl
Bow-wow
Growling
Whimper (dog)
Cat
Purr
Meow
Hiss
Caterwaul
Livestock, farm animals, working animals
Horse
Clip-clop
Neigh, whinny
Cattle, bovinae
Moo
Cowbell
Pig
Oink
Goat
Bleat
Sheep
Fowl
Chicken, rooster
Cluck
Crowing, cock-a-doodle-doo
Turkey
Gobble
Duck
Quack
Goose
Honk
Wild animals
Roaring cats (lions, tigers)
Roar
Bird
Bird vocalization, bird call, bird song
Chirp, tweet
Squawk
Pigeon, dove
Coo
Crow
Caw
Owl
Hoot
Bird flight, flapping wings
Canidae, dogs, wolves
Rodents, rats, mice
Mouse
Patter
Insect
Cricket
Mosquito
Fly, housefly
Buzz
Bee, wasp, etc.
Frog
Croak
Snake
Rattle
Whale vocalization
Music
Musical instrument
Plucked string instrument
Guitar
Electric guitar
Bass guitar
Acoustic guitar
Steel guitar, slide guitar
Tapping (guitar technique)
Strum
Banjo
Sitar
Mandolin
Zither
Ukulele
Keyboard (musical)
Piano
Electric piano
Organ
Electronic organ
Hammond organ
Synthesizer
Sampler
Harpsichord
Percussion
Drum kit
Drum machine
Drum
Snare drum
Rimshot
Drum roll
Bass drum
Timpani
Tabla
Cymbal
Hi-hat
Wood block
Tambourine
Rattle (instrument)
Maraca
Gong
Tubular bells
Mallet percussion
Marimba, xylophone
Glockenspiel
Vibraphone
Steelpan
Orchestra
Brass instrument
French horn
Trumpet
Trombone
Bowed string instrument
String section
Violin, fiddle
Pizzicato
Cello
Double bass
Wind instrument, woodwind instrument
Flute
Saxophone
Clarinet
Harp
Bell
Church bell
Jingle bell
Bicycle bell
Tuning fork
Chime
Wind chime
Change ringing (campanology)
Harmonica
Accordion
Bagpipes
Didgeridoo
Shofar
Theremin
Singing bowl
Scratching (performance technique)
Pop music
Hip hop music
Beatboxing
Rock music
Heavy metal
Punk rock
Grunge
Progressive rock
Rock and roll
Psychedelic rock
Rhythm and blues
Soul music
Reggae
Country
Swing music
Bluegrass
Funk
Folk music
Middle Eastern music
Jazz
Disco
Classical music
Opera
Electronic music
House music
Techno
Dubstep
Drum and bass
Electronica
Electronic dance music
Ambient music
Trance music
Music of Latin America
Salsa music
Flamenco
Blues
Music for children
New-age music
Vocal music
A capella
Music of Africa
Afrobeat
Christian music
Gospel music
Music of Asia
Carnatic music
Music of Bollywood
Ska
Traditional music
Independent music
Song
Background music
Theme music
Jingle (music)
Soundtrack music
Lullaby
Video game music
Christmas music
Dance music
Wedding music
Happy music
Funny music
Sad music
Tender music
Exciting music
Angry music
Scary music
Wind
Rustling leaves
Wind noise (microphone)
Thunderstorm
Thunder
Water
Rain
Raindrop
Rain on surface
Stream
Waterfall
Ocean
Waves, surf
Steam
Gurgling
Fire
Crackle
Vehicle
Boat, Water vehicle
Sailboat, sailing ship
Rowboat, canoe, kayak
Motorboat, speedboat
Ship
Motor vehicle (road)
Car
Vehicle horn, car horn, honking
Toot
Car alarm
Power windows, electric windows
Skidding
Tire squeal
Car passing by
Race car, auto racing
Truck
Air brake
Air horn, truck horn
Reversing beeps
Ice cream truck, ice cream van
Bus
Emergency vehicle
Police car (siren)
Ambulance (siren)
Fire engine, fire truck (siren)
Motorcycle
Traffic noise, roadway noise
Rail transport
Train
Train whistle
Train horn
Railroad car, train wagon
Train wheels squealing
Subway, metro, underground
Aircraft
Aircraft engine
Jet engine
Propeller, airscrew
Helicopter
Fixed-wing aircraft, airplane
Bicycle
Skateboard
Engine
Light engine (high frequency)
Dental drill, dentist's drill
Lawn mower
Chainsaw
Medium engine (mid frequency)
Heavy engine (low frequency)
Engine knocking
Engine starting
Idling
Accelerating, revving, vroom
Door
Doorbell
Ding-dong
Sliding door
Slam
Knock
Tap
Squeak
Cupboard open or close
Drawer open or close
Dishes, pots, and pans
Cutlery, silverware
Chopping (food)
Frying (food)
Microwave oven
Blender
Water tap, faucet
Sink (filling or washing)
Bathtub (filling or washing)
Hair dryer
Toilet flush
Toothbrush
Electric toothbrush
Vacuum cleaner
Zipper (clothing)
Keys jangling
Coin (dropping)
Scissors
Electric shaver, electric razor
Shuffling cards
Typing
Typewriter
Computer keyboard
Writing
Alarm
Telephone
Telephone bell ringing
Ringtone
Telephone dialing, DTMF
Dial tone
Busy signal
Alarm clock
Siren
Civil defense siren
Buzzer
Smoke detector, smoke alarm
Fire alarm
Foghorn
Whistle
Steam whistle
Mechanisms
Ratchet, pawl
Clock
Tick
Tick-tock
Gears
Pulleys
Sewing machine
Mechanical fan
Air conditioning
Cash register
Printer
Camera
Single-lens reflex camera
Tools
Hammer
Jackhammer
Sawing
Filing (rasp)
Sanding
Power tool
Drill
Explosion
Gunshot, gunfire
Machine gun
Fusillade
Artillery fire
Cap gun
Fireworks
Firecracker
Burst, pop
Eruption
Boom
Wood
Chop
Splinter
Crack
Glass
Chink, clink
Shatter
Liquid
Splash, splatter
Slosh
Squish
Drip
Pour
Trickle, dribble
Gush
Fill (with liquid)
Spray
Pump (liquid)
Stir
Boiling
Sonar
Arrow
Whoosh, swoosh, swish
Thump, thud
Thunk
Electronic tuner
Effects unit
Chorus effect
Basketball bounce
Bang
Slap, smack
Whack, thwack
Smash, crash
Breaking
Bouncing
Whip
Flap
Scratch
Scrape
Rub
Roll
Crushing
Crumpling, crinkling
Tearing
Beep, bleep
Ping
Ding
Clang
Squeal
Creak
Rustle
Whir
Clatter
Sizzle
Clicking
Clickety-clack
Rumble
Plop
Jingle, tinkle
Hum
Zing
Boing
Crunch
Silence
Sine wave
Harmonic
Chirp tone
Sound effect
Pulse
Inside, small room
Inside, large room or hall
Inside, public space
Outside, urban or manmade
Outside, rural or natural
Reverberation
Echo
Noise
Environmental noise
Static
Mains hum
Distortion
Sidetone
Cacophony
White noise
Pink noise
Throbbing
Vibration
Television
Radio
Field recording

@@ -1,111 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
def split(waveform: np.ndarray, win_size: int, hop_size: int):
"""
Split into N waveforms.
N is decided by win_size and hop_size.
"""
assert isinstance(waveform, np.ndarray)
time = []
data = []
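# `time` holds each segment's start position as a fraction of the full waveform length.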
for i in range(0, len(waveform), hop_size):
segment = waveform[i:i + win_size]
if len(segment) < win_size:
segment = np.pad(segment, (0, win_size - len(segment)))
data.append(segment)
time.append(i / len(waveform))
return time, data
def batchify(data: List[List[float]],
sample_rate: int,
batch_size: int,
**kwargs):
"""
Extract features from waveforms and create batches.
"""
examples = []
for waveform in data:
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
# Separate the extracted features into batches of size batch_size.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield one_batch
one_batch = []
if one_batch:
yield one_batch
def predict(model, data: List[List[float]], sample_rate: int,
batch_size: int=1):
"""
Use pretrained model to make predictions.
"""
batches = batchify(data, sample_rate, batch_size)
results = None
model.eval()
for batch in batches:
# (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
feats = paddle.to_tensor(batch).unsqueeze(1)
audioset_scores = model(feats)
if results is None:
results = audioset_scores.numpy()
else:
results = np.concatenate((results, audioset_scores.numpy()))
return results
if __name__ == '__main__':
paddle.set_device(args.device)
model = cnn14(pretrained=True, extract_embedding=False)
waveform, sr = load_audio(args.wav, sr=None)
time, data = split(waveform,
int(args.sample_duration * sr),
int(args.hop_duration * sr))
results = predict(model, data, sr, batch_size=8)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
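# Segment start times as fractions of the total duration, one hop apart (equivalent to the `time` list from split()).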
time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
np.savez(output_file, time=time, scores=results)
logger.info(f'Saved tagging results to {output_file}')

@@ -1,83 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import Dict
import numpy as np
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--tagging_file', type=str, required=True, help='File of model prediction results to parse.')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
def smooth(results: np.ndarray, win_size: int):
"""
Execute posterior smoothing in-place.
"""
for i in range(len(results) - 1, -1, -1):
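# Because the loop runs backwards, results[left:i + 1] still holds the original (unsmoothed) scores here.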
if i < win_size - 1:
left = 0
else:
left = i + 1 - win_size
results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
"""
Return top k result.
"""
result = np.asarray(result)
topk_idx = (-result).argsort()[:k]
ret = ''
for idx in topk_idx:
label, score = label_map[idx], result[idx]
ret += f'{label}: {score}\n'
return ret
if __name__ == "__main__":
label_map = {}
with open(args.label_file, 'r') as f:
for i, l in enumerate(f.readlines()):
label_map[i] = l.strip()
results = np.load(args.tagging_file, allow_pickle=True)
times, scores = results['time'], results['scores']
if args.smooth:
logger.info('Posterior smoothing...')
smooth(scores, win_size=args.smooth_size)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(
args.output_dir,
os.path.basename(args.tagging_file).split('.')[0] + '.txt')
with open(output_file, 'w') as f:
for time, score in zip(times, scores):
f.write(f'{time}\n')
f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
logger.info(f'Saved tagging labels to {output_file}')

@@ -1,154 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
smart home, autonomous driving, and industrial production. The recordings were
made in a quiet indoor environment using 3 different devices simultaneously: a high
fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone (16kHz, 16-bit),
and an iOS-system mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled
to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas
in China were invited to participate in the recording. The manual transcription
accuracy is above 95%, achieved through professional speech annotation and strict
quality inspection. The corpus is divided into training, development and testing
sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript',
'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO',
('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
if utt_id not in text_info:  # Skip utterances that have no transcript
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)

@@ -1,298 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
"""
TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
12 European cities in 10 different acoustic scenes using 4 different devices.
Additionally, synthetic data for 11 mobile devices was created based on the original
recordings. Of the 12 cities, two are present only in the evaluation set.
Reference:
A multi-device dataset for urban acoustic scene classification
https://arxiv.org/abs/1807.09840
"""
source_url = 'https://zenodo.org/record/3819968/files/'
base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': 'ed38956c4246abb56190c1e9b602b7b8',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '97ab8560056b6816808dedc044dcc023',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'fbf856a3a86fff7520549c899dc94372',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '0dbffe7b6e45564da649378723284062',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
},
{
'url': source_url + base_name + '.audio.9.zip',
'md5': 'a65596a5372eab10c78e08a0de797c9e',
},
{
'url': source_url + base_name + '.audio.10.zip',
'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
},
{
'url': source_url + base_name + '.audio.11.zip',
'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
},
{
'url': source_url + base_name + '.audio.12.zip',
'md5': 'e5f4400c6b9697295fab4cf507155a2f',
},
{
'url': source_url + base_name + '.audio.13.zip',
'md5': '8855ab9f9896422746ab4c5d89d8da2f',
},
{
'url': source_url + base_name + '.audio.14.zip',
'md5': '092ad744452cd3e7de78f988a3d13020',
},
{
'url': source_url + base_name + '.audio.15.zip',
'md5': '4b5eb85f6592aebf846088d9df76b420',
},
{
'url': source_url + base_name + '.audio.16.zip',
'md5': '2e0a89723e58a3836be019e6996ae460',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta = os.path.join(base_name, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename', 'scene_label', 'identifier', 'source_label'))
subset_meta = {
'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-seconds audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
'filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves,
os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels

@@ -1,199 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', (
'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = [
'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
'dev-other', 'test-clean', 'test-other'
]
def __init__(self,
subset: str='train-clean-100',
feat_type: str='raw',
**kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip,
line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME,
len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(
self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(
self.utt_info(file_path, utt_id, text, spk_id,
spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)

@@ -1,136 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['RAVDESS']
class RAVDESS(AudioClassificationDataset):
"""
The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
lexically-matched statements in a neutral North American accent. Speech emotions
include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
Each expression is produced at two levels of emotional intensity (normal, strong),
with an additional neutral expression.
Reference:
The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
A dynamic, multimodal set of facial and vocal expressions in North American English
https://doi.org/10.1371/journal.pone.0196391
"""
archieves = [
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5':
'5411230427d67a21e18aa4d466e6d1b9',
},
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5':
'bc696df654c87fed845eb13823edef8a',
},
]
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion',
'emotion_intensity', 'statement', 'repitition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Set the random seed to shuffle samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies which fold is used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(self.speech_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
for root, _, files in os.walk(self.song_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
) # make sure using the same seed to create train and dev dataset
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion, _, _, _, _ = sample
target = int(emotion) - 1
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels

@@ -1,41 +0,0 @@
# PaddleAudio Testing Guide
## Testing
First, clone the project:
```
git clone https://github.com/PaddlePaddle/models.git
```
Then install the project in your virtual environment.
```
cd models/PaddleAudio
python setup.py bdist_wheel
pip install -e .[dev]
```
The requirements for testing will be installed along with PaddleAudio.
Now run
```
pytest test
```
If everything goes well, you will see output like this:
```
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: ./models/PaddleAudio
plugins: hydra-core-1.0.6
collected 16 items
test/unit_test/test_backend.py ........... [ 68%]
test/unit_test/test_features.py ..... [100%]
==================================================== warnings summary ====================================================
.
.
.
-- Docs: https://docs.pytest.org/en/stable/warnings.html
============================================ 16 passed, 11 warnings in 6.76s =============================================
```

@@ -1,113 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio
import pytest
TEST_FILE = './test/data/test_audio.wav'
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / \
(EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / \
(EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load(TEST_FILE, sr=16000)
print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
return x, r
# start testing
x, r = load_audio()
EPS = 1e-8
def test_load():
s, r = paddleaudio.load(TEST_FILE, sr=16000)
assert r == 16000
assert s.dtype == 'float32'
s, r = paddleaudio.load(
TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
assert len(s) / r == 2.0
assert r == 16000
assert s.dtype == 'int16'
def test_depth_convert():
y = paddleaudio.depth_convert(x, 'int16')
assert len(y) == len(x)
assert y.dtype == 'int16'
assert np.max(y) <= 32767
assert np.min(y) >= -32768
assert np.std(y) > EPS
y = paddleaudio.depth_convert(x, 'int8')
assert len(y) == len(x)
assert y.dtype == 'int8'
assert np.max(y) <= 127
assert np.min(y) >= -128
assert np.std(y) > EPS
# test case for resample
rs_test_data = [
(32000, 'kaiser_fast'),
(16000, 'kaiser_fast'),
(8000, 'kaiser_fast'),
(32000, 'kaiser_best'),
(16000, 'kaiser_best'),
(8000, 'kaiser_best'),
(22050, 'kaiser_best'),
(44100, 'kaiser_best'),
]
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
y = paddleaudio.resample(x, 16000, sr, mode=mode)
factor = sr / 16000
err = relative_err(len(y), len(x) * factor)
print('err:', err)
assert err < EPS
def test_normalize():
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
assert np.max(y) < 0.5 + EPS
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
assert np.max(y) <= 2.0 + EPS
y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
print('np.std(y):', np.std(y))
assert np.abs(np.std(y) - 1.0) < EPS
if __name__ == '__main__':
test_load()
test_depth_convert()
test_resample(22050, 'kaiser_fast')
test_normalize()

@ -1,143 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio as pa
import pytest
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load('./test/data/test_audio.wav')
#x,r = librosa.load('../data/test_audio.wav',sr=16000)
return x, r
## start testing
x, r = load_audio()
EPS = 1e-8
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / (
EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / (
EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=False, )
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=True,
ref=1.0,
amin=1e-10,
top_db=None)
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
assert a.shape == b.shape
assert relative_err(a, b, real=False) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
a = librosa.util.frame(x, frame_length=512, hop_length=320)
b = pa.split_frames(x, frame_length=512, hop_length=320)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
kwargs = {
'window_size': 512,
'hop_length': 320,
'n_mels': 64,
'fmin': 50,
'to_db': False
}
a = pa.mfcc(
x,
#sample_rate=16000,
spect=None,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0,
**kwargs)
S = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = librosa.feature.mfcc(
x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
assert relative_err(a, b) < EPS
if __name__ == '__main__':
test_melspectrogram()
test_melspectrogram_db()
test_stft()
test_split_frames()
test_mfcc()

@ -1,10 +1,10 @@
#! /usr/bin/env bash
TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge
TARGET_DIR=${MAIN_ROOT}/dataset/voxforge
mkdir -p ${TARGET_DIR}
# download data, generate manifests
python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \
python ${MAIN_ROOT}/dataset/voxforge/voxforge.py \
--manifest_prefix="${TARGET_DIR}/manifest" \
--target_dir="${TARGET_DIR}" \
--is_merge_dialect=True \

@ -6,7 +6,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks
- Apache-2.0 License
- python/shell `utils`
- kaldi feat preprocessing
- data pipe line and `transform`
- data pipe line and `transformer`
- some tts models, like `fastspeech2` and GAN-based `vocoder`
* [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE)

@ -1,16 +1,18 @@
# Released Models
## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
[Ds2 Online Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [Ds2 Online Aishell S0 Example](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
[Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
@ -20,14 +22,15 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
### Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
## Text-to-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
@ -40,7 +43,6 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa
FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||

@ -5,20 +5,6 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
<img src="../../images/logo.png" width=300 /> <br>
</div>
## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).
## Overview
To make it easy to use existing TTS models directly and to develop new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the process of training and synthesis. The models supported here include the Text Frontend, end-to-end Acoustic models, and Vocoders:
@ -38,50 +24,11 @@ In order to facilitate exploiting the existing TTS models directly and developin
- [Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)
## Setup
Some of this repo's dependent libraries are difficult to install on Windows, so we recommend that you **DO NOT** use Windows; please use `Linux` instead.
Make sure the library `libsndfile1` is installed, e.g., on Ubuntu:
```bash
sudo apt-get install libsndfile1
```
### Install PaddlePaddle
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
### Install Parakeet
```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```
If some Python dependencies cannot be installed successfully, you can run the following script first
(replace `python3.6` with your own Python version):
```bash
sudo apt install -y python3.6-dev
```
See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.
## Examples
Entry points to the introduction, and to launching training and synthesis, for the different example models:
- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)
- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)
## Audio samples
### TTS models (Acoustic Model + Neural Vocoder)
Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio samples.
Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) for audio samples.
## Released Model

@ -290,7 +290,7 @@ The following is the basic `ArgumentParser`:
1. `--config` is used to support configuration file parsing, and the configuration file itself handles the unique options of each experiment.
2. `--train-metadata` is the path to the training data.
3. `--output-dir` is the directory to save the training results. (If there are checkpoints in `checkpoints/` of `--output-dir`, the newest checkpoint is reloaded by default to continue training.)
4. `--device` and `--nprocs` determine the operation mode. `--device` specifies the type of running device, whether to run on `cpu` or `gpu`. `--nprocs` refers to the number of training processes; if `nprocs` > 1, multi-process parallel training is used. (Note: currently only multi-GPU, multi-process training is supported.)
4. `--ngpu` determines the operation mode. `--ngpu` refers to the number of training processes; if `ngpu` > 0, GPU is used, otherwise CPU is used.
Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments.
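For illustration only, a launch that combines these options might look like the sketch below (the script name and file paths are placeholders, not taken from this documentation):
```bash
python3 train.py \
    --config=conf/default.yaml \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --output-dir=exp/default \
    --ngpu=1
```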

@ -343,6 +343,16 @@
" $$"
]
},
{
"cell_type": "markdown",
"id": "41637c03",
"metadata": {},
"source": [
"## Source Code\n",
"本人在 [warp-ctc](https://github.com/zh794390558/warp-ctc) 上加了注释,并调整 index 的索引方式,便于理解代码。\n",
"对比上面的公式推导和lattice图可以快速理解 ctc 实现。"
]
},
{
"cell_type": "markdown",
"id": "coordinated-music",
@ -372,7 +382,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -386,7 +396,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.7.0"
},
"toc": {
"base_numbering": 1,

@ -9,7 +9,7 @@ dict_dir=data/lang_char
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then

@ -14,7 +14,7 @@ jit_model_export_path=$3
model_type=$4
python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} \

@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test_export.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \

@ -20,7 +20,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--model_type ${model_type} \

@ -18,7 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \

@ -8,7 +8,7 @@ dict_dir=data/lang_char
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then

@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path}

@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -29,7 +29,7 @@ for type in attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -9,7 +9,7 @@ lmtype=srilm
source utils/parse_options.sh
data=${MAIN_ROOT}/examples/dataset/${corpus}
data=${MAIN_ROOT}/dataset/${corpus}
lexicon=$data/resource_aishell/lexicon.txt
text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt

@ -29,7 +29,7 @@ mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--profiler-options "${profiler_options}" \

@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -45,7 +45,8 @@ We use Montreal Forced Aligner 1.0. The labels in aishell3 include pinyin, so th
We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
You can download the alignment results from here: [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
You can download the alignment results from here: [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
```bash
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then

@ -18,7 +18,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Pretrained GE2E Model
We use a pretrained GE2E model to generate a speaker embedding for each sentence.

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -15,7 +15,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA Result and Extract
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.

@ -23,7 +23,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \

@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path}

@ -28,7 +28,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
@ -47,7 +47,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -22,7 +22,7 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--seed ${seed}

@ -7,7 +7,7 @@ Download CSMSC from its [Official Website](https://test.data-baker.com/data/ind
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.

@ -7,7 +7,7 @@ Download CSMSC from its [Official Website](https://test.data-baker.com/data/ind
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -209,8 +209,8 @@ Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](htt
Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
default| 2(gpu) x 76000|1.0991|0.59132|0.035815| 0.31915| 0.15287|
conformer| 2(gpu) x 76000||||||
default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
FastSpeech2 checkpoint contains files listed below.
```text

@ -0,0 +1,109 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # sr
n_fft: 2048 # FFT size.
n_shift: 300 # Hop size.
win_length: 1200 # Window length.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
# Only used for feats_type != raw
fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.
# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 4
###########################################################
# MODEL SETTING #
###########################################################
model:
adim: 384 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1536 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1536 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
reduction_factor: 1 # reduction factor
encoder_type: conformer # encoder type
decoder_type: conformer # decoder type
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 0.001 # learning rate
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 1000
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
seed: 10086

@ -45,7 +45,6 @@ model:
postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_masking: True # whether to apply masking for padded part in loss calculation
use_scaled_pos_enc: True # whether to use scaled positional encoding
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input

@ -5,8 +5,8 @@ This example contains code used to train a [parallel wavegan](http://arxiv.org/a
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) in our repo.
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.

@ -5,8 +5,8 @@ This example contains code used to train a [Multi Band MelGAN](https://arxiv.org
Download CSMSC from the [official website](https://www.data-baker.com/data/index/source) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/BZNSYP`.
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edges of the audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.

@ -21,22 +21,17 @@ PaddleAudio provides pretrained PANNs models (CNN14, CNN10 and CNN6) that can be used
### Model Training
Taking the environmental sound classification dataset `ESC50` as an example, run the command below to finetune the model on the training set; both single-GPU and multi-GPU training on a single machine are supported. For how to launch multi-GPU training with `paddle.distributed.launch`, see [single-machine multi-GPU training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html).
Taking the environmental sound classification dataset `ESC50` as an example, run the command below to finetune the model on the training set; both single-GPU and multi-GPU training on a single machine are supported.
Single-GPU training:
Start training:
```shell
$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
```
Multi-GPU training:
```shell
$ unset CUDA_VISIBLE_DEVICES
$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
```
Configurable parameters of the `paddlespeech/cls/exps/panns/train.py` script:
Configurable parameters:
- `device`: the device used for training, either cpu or gpu, gpu by default. When training on gpu, the gpus argument specifies the GPU card ids.
- `device`: the device used when the model runs prediction.
- `feat_backend`: the backend used for feature extraction, either `'numpy'` or `'paddle'`, `'numpy'` by default.
- `epochs`: number of training epochs, 50 by default.
- `learning_rate`: learning rate for fine-tuning, 5e-5 by default.
- `batch_size`: batch size; adjust it according to the available GPU memory and lower it if you run out of memory, 16 by default.
@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_
The pretrained model used in the sample code is `CNN14`. If you want to switch to another pretrained model, you can do so as follows:
```python
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14, cnn10, cnn6
from paddlespeech.cls.models import SoundClassifier
from paddlespeech.cls.models import cnn14, cnn10, cnn6
# CNN14
backbone = cnn14(pretrained=True, extract_embedding=True)
@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
### Model Prediction
```shell
python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
```
Configurable parameters:
- `device`: the device used for training, either cpu or gpu, gpu by default. When training on gpu, the gpus argument specifies the GPU card ids.
Configurable parameters of the `paddlespeech/cls/exps/panns/predict.py` script:
- `device`: the device used when the model runs prediction.
- `wav`: the audio file to run prediction on.
- `feat_backend`: the backend used for feature extraction, either `'numpy'` or `'paddle'`, `'numpy'` by default.
- `top_k`: show the scores of the top k predicted labels, 1 by default.
- `checkpoint`: the checkpoint file of model parameters.
@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06
After training finishes, the saved dynamic-graph parameters can be exported as a static-graph model and parameters, which can then be deployed as a static graph.
```shell
python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3
```
Configurable parameters:
Configurable parameters of the `paddlespeech/cls/exps/panns/export_model.py` script:
- `checkpoint`: the checkpoint file of model parameters.
- `output_dir`: the directory where the exported static-graph model and parameter files are saved.
@ -109,8 +106,13 @@ export
#### 2. Model Deployment and Prediction
The `deploy/python/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment:
The `paddlespeech/cls/exps/panns/deploy/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment:
```sh
python deploy/python/predict.py --model_dir ./export --device gpu
```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4
```
Main configurable parameters of the `paddlespeech/cls/exps/panns/deploy/predict.py` script:
- `device`: the device used when the model runs prediction.
- `model_dir`: the directory where the exported static-graph model and parameter files are saved.
- `wav`: the audio file to run prediction on.
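As an illustration (the paths below are placeholders), a direct invocation of the deployment script with these parameters could look like:
```shell
python3 paddlespeech/cls/exps/panns/deploy/predict.py \
    --device gpu \
    --model_dir ./export \
    --wav ./dog.wav
```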

@ -0,0 +1,8 @@
#!/bin/bash
ckpt_dir=$1
output_dir=$2
python3 ${BIN_DIR}/export_model.py \
--checkpoint ${ckpt_dir}/model.pdparams \
--output_dir ${output_dir}

@ -0,0 +1,11 @@
#!/bin/bash
audio_file=$1
ckpt_dir=$2
feat_backend=$3
python3 ${BIN_DIR}/predict.py \
--wav ${audio_file} \
--feat_backend ${feat_backend} \
--top_k 10 \
--checkpoint ${ckpt_dir}/model.pdparams

@ -0,0 +1,10 @@
#!/bin/bash
device=$1
model_dir=$2
audio_file=$3
python3 ${BIN_DIR}/deploy/predict.py \
--device ${device} \
--model_dir ${model_dir} \
--wav ${audio_file}

@ -0,0 +1,25 @@
#!/bin/bash
ngpu=$1
feat_backend=$2
num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10
if [ ${ngpu} -gt 0 ]; then
python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
--epochs ${num_epochs} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
else
python3 ${BIN_DIR}/train.py \
--epochs ${num_epochs} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
fi
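# Usage sketch (illustrative): run.sh calls this script with the GPU count and the
# feature backend, e.g. `./local/train.sh 1 numpy`.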

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=panns
export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}

@ -0,0 +1,33 @@
#!/bin/bash
set -e
source path.sh
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
stage=$1
stop_stage=100
feat_backend=numpy
audio_file=~/cat.wav
ckpt_dir=./checkpoint/epoch_50
output_dir=./export
infer_device=cpu
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
./local/train.sh ${ngpu} ${feat_backend} || exit -1
exit 0
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1
exit 0
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
./local/export.sh ${ckpt_dir} ${output_dir} || exit -1
exit 0
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1
exit 0
fi

@ -10,7 +10,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then

@ -14,7 +14,7 @@ jit_model_export_path=$3
model_type=$4
python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path} \

@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -20,7 +20,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \

@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--model_type ${model_type} \

@ -21,7 +21,7 @@
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.733129533131917 | 0.047874 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.733129533131917 | 0.053922 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.733129533131917 | 0.053427 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.733129533131917 | 0.041369 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.725063021977743 | 0.047417 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.725063021977743 | 0.053922 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.725063021977743 | 0.053180 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.725063021977743 | 0.041026 |

@ -18,7 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \

@ -19,7 +19,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/examples/dataset
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then

@ -13,7 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
--ngpu ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path}
