merge the develop branch and do the revising

4 years ago · dcc2390323
parent 5047e8786c 85f7f674d2
commit dcc2390323
299 changed files with 3950 additions and 4405 deletions
--- a/audio/.gitignore
+++ b/audio/.gitignore
@ -1,7 +0,0 @@
 .ipynb_checkpoints/**
 *.ipynb
 nohup.out
 __pycache__/
 *.wav
 *.m4a
 obsolete/**
--- a/audio/.pre-commit-config.yaml
+++ b/audio/.pre-commit-config.yaml
@ -1,45 +0,0 @@
 # pre-commit configuration: yapf formatting, generic file hygiene checks,
 # isort import ordering, and a flake8 pass limited to hard errors.
 repos:
 -   repo: local
     hooks:
     -   id: yapf
         name: yapf
         entry: yapf
         language: system
         # Every list item becomes one argv entry, so the option and its value
         # must be separate items (a single "--style .style.yapf" item is not
         # parsed as an option by yapf).
         args: [-i, --style, .style.yapf]
         files: \.py$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: a11d9314b22d8f8c7556443875b731ef05965464
     hooks:
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: end-of-file-fixer
     -   id: trailing-whitespace
     -   id: detect-private-key
     -   id: check-added-large-files
 -   repo: https://github.com/pycqa/isort
     rev: 5.8.0
     hooks:
     -   id: isort
         name: isort (python)
     -   id: isort
         name: isort (cython)
         types: [cython]
     -   id: isort
         name: isort (pyi)
         types: [pyi]
 -   repo: local
     hooks:
     -   id: flake8
         name: flake8
         entry: flake8
         language: system
         # Only syntax errors and undefined-name style failures are selected.
         args:
         -   --count
         -   --select=E9,F63,F7,F82
         -   --show-source
         -   --statistics
         files: \.py$
--- a/audio/.style.yapf
+++ b/audio/.style.yapf
@ -1,3 +0,0 @@
 [style]
 based_on_style = pep8
 column_limit = 80
--- a/audio/LICENSE
+++ b/audio/LICENSE
@ -1,201 +0,0 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright [yyyy] [name of copyright owner]
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--- a/audio/README.md
+++ b/audio/README.md
@ -1,37 +0,0 @@
 # PaddleAudio:  The audio library for PaddlePaddle
 ## Introduction
 PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomaly sound detection. More models and features are on the roadmap.
 ## Features
 - Spectrogram and related features are compatible with librosa.
 - State-of-the-art models in sound tagging on Audioset, sound classification on esc50, and more to come.
 - Ready-to-use audio embedding with a line of code, includes sound embedding and more on the roadmap.
 - Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin, among others.
 ## Install
 ```
 git clone https://github.com/PaddlePaddle/models
 cd models/PaddleAudio
 pip install .
 ```
 ## Quick start
 ### Audio loading and feature extraction
 ```
 import paddleaudio as pa
 s,r = pa.load(f)
 mel_spect = pa.melspectrogram(s,sr=r)
 ```
 ###  Examples
 We provide a set of examples to help you get started in using PaddleAudio quickly.
 - [PANNs:  acoustic scene and events analysis using pre-trained models](./examples/panns)
 - [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
 - [Training an audio-tagging network on Audioset](./examples/audioset_training)
 Please refer to [example directory](./examples) for more details.
--- a/audio/examples/panns/README.md
+++ b/audio/examples/panns/README.md
@ -1,128 +0,0 @@
 # Audio Tagging
 声音分类的任务是单标签的分类任务，但是对于一段音频来说，它可以是多标签的。譬如在一般的室内办公环境进行录音，这段音频里可能包含人们说话的声音、键盘敲打的声音、鼠标点击的声音，还有室内的一些其他背景声音。对于通用的声音识别和声音检测场景而言，对一段音频预测多个标签是具有很强的实用性的。
 在IEEE ICASSP 2017 大会上，谷歌开放了一个大规模的音频数据集[Audioset](https://research.google.com/audioset/)。该数据集包含了 632 类的音频类别以及 2,084,320 条人工标记的每段 10 秒长度的声音剪辑片段（来源于YouTube视频）。目前该数据集已经有210万个已标注的视频数据，5800小时的音频数据，经过标记的声音样本的标签类别为527。
 `PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf))是基于Audioset数据集训练的声音分类/识别的模型。其预训练的任务是多标签的声音识别，因此可用于声音的实时tagging。
 本示例采用`PANNs`预训练模型，基于Audioset的标签类别对输入音频实时tagging，并最终以文本形式输出对应时刻的top k类别和对应的得分。
 ## 模型简介
 PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型，可供用户选择使用：
 - CNN14: 该模型主要包含12个卷积层和2个全连接层，模型参数的数量为79.6M，embedding维度是2048。
 - CNN10: 该模型主要包含8个卷积层和2个全连接层，模型参数的数量为4.9M，embedding维度是512。
 - CNN6: 该模型主要包含4个卷积层和2个全连接层，模型参数的数量为4.5M，embedding维度是512。
 ## 快速开始
 ### 模型预测
 ```shell
 export CUDA_VISIBLE_DEVICES=0
 python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
 ```
 可支持配置的参数：
 - `device`: 选用什么设备进行预测，可选cpu或gpu，默认为gpu。如使用gpu，可通过环境变量`CUDA_VISIBLE_DEVICES`指定GPU卡号。
 - `wav`: 指定预测的音频文件。
 - `sample_duration`: 模型每次预测的音频时间长度，单位为秒，默认为2s。
 - `hop_duration`: 每两个预测音频的时间间隔，单位为秒，默认为0.3s。
 - `output_dir`: 模型预测结果存放的路径，默认为`./output_dir`。
 示例代码中使用的预训练模型为`CNN14`，如果想更换为其他预训练模型，可通过以下方式执行：
 ```python
 from paddleaudio.models.panns import cnn14, cnn10, cnn6
 # CNN14
 model = cnn14(pretrained=True, extract_embedding=False)
 # CNN10
 model = cnn10(pretrained=True, extract_embedding=False)
 # CNN6
 model = cnn6(pretrained=True, extract_embedding=False)
 ```
 执行结果：
 ```
 [2021-04-30 19:15:41,025] [    INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
 ```
 执行后得分结果保存在`output_dir`的`.npz`文件中。
 ### 生成tagging标签文本
 ```shell
 python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
 ```
 可支持配置的参数：
 - `tagging_file`: 模型预测结果文件。
 - `top_k`: 获取预测结果中，得分最高的前top_k个标签，默认为10。
 - `smooth`: 预测结果的后验概率平滑，默认为True，表示应用平滑。
 - `smooth_size`: 平滑计算过程中的样本数量，默认为5。
 - `label_file`: 模型预测结果对应的Audioset类别的文本文件。
 - `output_dir`: 标签文本存放的路径，默认为`./output_dir`。
 执行结果：
 ```
 [2021-04-30 19:26:58,743] [    INFO] - Posterior smoothing...
 [2021-04-30 19:26:58,746] [    INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
 ```
 执行后文本结果保存在`output_dir`的`.txt`文件中。
 ## Tagging标签文本
 最终输出的文本结果如下所示。  
 样本每个时间范围的top k结果用空行分隔。在每一个结果中，第一行是时间信息，数字表示tagging结果在时间起点信息，比例值代表当前时刻`t`与音频总长度`T`的比值；紧接的k行是对应的标签和得分。
 ```
 0.0
 Cat: 0.9144676923751831
 Animal: 0.8855036497116089
 Domestic animals, pets: 0.804577112197876
 Meow: 0.7422927021980286
 Music: 0.19959309697151184
 Inside, small room: 0.12550437450408936
 Caterwaul: 0.021584441885352135
 Purr: 0.020247288048267365
 Speech: 0.018197158351540565
 Vehicle: 0.007446660194545984
 0.059197544398158296
 Cat: 0.9250872135162354
 Animal: 0.8957151174545288
 Domestic animals, pets: 0.8228275775909424
 Meow: 0.7650775909423828
 Music: 0.20210561156272888
 Inside, small room: 0.12290887534618378
 Caterwaul: 0.029371455311775208
 Purr: 0.018731823191046715
 Speech: 0.017130598425865173
 Vehicle: 0.007748497650027275
 0.11839508879631659
 Cat: 0.9336574673652649
 Animal: 0.9111202359199524
 Domestic animals, pets: 0.8349071145057678
 Meow: 0.7761964797973633
 Music: 0.20467285811901093
 Inside, small room: 0.10709915310144424
 Caterwaul: 0.05370649695396423
 Purr: 0.018830426037311554
 Speech: 0.017361722886562347
 Vehicle: 0.006929398979991674
 ...
 ...
 ```
 以下[Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4)展示了一个将tagging标签输出到视频的例子，可以实时地对音频进行多标签预测。
 ![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
--- a/audio/examples/panns/assets/audioset_labels.txt
+++ b/audio/examples/panns/assets/audioset_labels.txt
@ -1,527 +0,0 @@
 Speech
 Male speech, man speaking
 Female speech, woman speaking
 Child speech, kid speaking
 Conversation
 Narration, monologue
 Babbling
 Speech synthesizer
 Shout
 Bellow
 Whoop
 Yell
 Battle cry
 Children shouting
 Screaming
 Whispering
 Laughter
 Baby laughter
 Giggle
 Snicker
 Belly laugh
 Chuckle, chortle
 Crying, sobbing
 Baby cry, infant cry
 Whimper
 Wail, moan
 Sigh
 Singing
 Choir
 Yodeling
 Chant
 Mantra
 Male singing
 Female singing
 Child singing
 Synthetic singing
 Rapping
 Humming
 Groan
 Grunt
 Whistling
 Breathing
 Wheeze
 Snoring
 Gasp
 Pant
 Snort
 Cough
 Throat clearing
 Sneeze
 Sniff
 Run
 Shuffle
 Walk, footsteps
 Chewing, mastication
 Biting
 Gargling
 Stomach rumble
 Burping, eructation
 Hiccup
 Fart
 Hands
 Finger snapping
 Clapping
 Heart sounds, heartbeat
 Heart murmur
 Cheering
 Applause
 Chatter
 Crowd
 Hubbub, speech noise, speech babble
 Children playing
 Animal
 Domestic animals, pets
 Dog
 Bark
 Yip
 Howl
 Bow-wow
 Growling
 Whimper (dog)
 Cat
 Purr
 Meow
 Hiss
 Caterwaul
 Livestock, farm animals, working animals
 Horse
 Clip-clop
 Neigh, whinny
 Cattle, bovinae
 Moo
 Cowbell
 Pig
 Oink
 Goat
 Bleat
 Sheep
 Fowl
 Chicken, rooster
 Cluck
 Crowing, cock-a-doodle-doo
 Turkey
 Gobble
 Duck
 Quack
 Goose
 Honk
 Wild animals
 Roaring cats (lions, tigers)
 Roar
 Bird
 Bird vocalization, bird call, bird song
 Chirp, tweet
 Squawk
 Pigeon, dove
 Coo
 Crow
 Caw
 Owl
 Hoot
 Bird flight, flapping wings
 Canidae, dogs, wolves
 Rodents, rats, mice
 Mouse
 Patter
 Insect
 Cricket
 Mosquito
 Fly, housefly
 Buzz
 Bee, wasp, etc.
 Frog
 Croak
 Snake
 Rattle
 Whale vocalization
 Music
 Musical instrument
 Plucked string instrument
 Guitar
 Electric guitar
 Bass guitar
 Acoustic guitar
 Steel guitar, slide guitar
 Tapping (guitar technique)
 Strum
 Banjo
 Sitar
 Mandolin
 Zither
 Ukulele
 Keyboard (musical)
 Piano
 Electric piano
 Organ
 Electronic organ
 Hammond organ
 Synthesizer
 Sampler
 Harpsichord
 Percussion
 Drum kit
 Drum machine
 Drum
 Snare drum
 Rimshot
 Drum roll
 Bass drum
 Timpani
 Tabla
 Cymbal
 Hi-hat
 Wood block
 Tambourine
 Rattle (instrument)
 Maraca
 Gong
 Tubular bells
 Mallet percussion
 Marimba, xylophone
 Glockenspiel
 Vibraphone
 Steelpan
 Orchestra
 Brass instrument
 French horn
 Trumpet
 Trombone
 Bowed string instrument
 String section
 Violin, fiddle
 Pizzicato
 Cello
 Double bass
 Wind instrument, woodwind instrument
 Flute
 Saxophone
 Clarinet
 Harp
 Bell
 Church bell
 Jingle bell
 Bicycle bell
 Tuning fork
 Chime
 Wind chime
 Change ringing (campanology)
 Harmonica
 Accordion
 Bagpipes
 Didgeridoo
 Shofar
 Theremin
 Singing bowl
 Scratching (performance technique)
 Pop music
 Hip hop music
 Beatboxing
 Rock music
 Heavy metal
 Punk rock
 Grunge
 Progressive rock
 Rock and roll
 Psychedelic rock
 Rhythm and blues
 Soul music
 Reggae
 Country
 Swing music
 Bluegrass
 Funk
 Folk music
 Middle Eastern music
 Jazz
 Disco
 Classical music
 Opera
 Electronic music
 House music
 Techno
 Dubstep
 Drum and bass
 Electronica
 Electronic dance music
 Ambient music
 Trance music
 Music of Latin America
 Salsa music
 Flamenco
 Blues
 Music for children
 New-age music
 Vocal music
 A capella
 Music of Africa
 Afrobeat
 Christian music
 Gospel music
 Music of Asia
 Carnatic music
 Music of Bollywood
 Ska
 Traditional music
 Independent music
 Song
 Background music
 Theme music
 Jingle (music)
 Soundtrack music
 Lullaby
 Video game music
 Christmas music
 Dance music
 Wedding music
 Happy music
 Funny music
 Sad music
 Tender music
 Exciting music
 Angry music
 Scary music
 Wind
 Rustling leaves
 Wind noise (microphone)
 Thunderstorm
 Thunder
 Water
 Rain
 Raindrop
 Rain on surface
 Stream
 Waterfall
 Ocean
 Waves, surf
 Steam
 Gurgling
 Fire
 Crackle
 Vehicle
 Boat, Water vehicle
 Sailboat, sailing ship
 Rowboat, canoe, kayak
 Motorboat, speedboat
 Ship
 Motor vehicle (road)
 Car
 Vehicle horn, car horn, honking
 Toot
 Car alarm
 Power windows, electric windows
 Skidding
 Tire squeal
 Car passing by
 Race car, auto racing
 Truck
 Air brake
 Air horn, truck horn
 Reversing beeps
 Ice cream truck, ice cream van
 Bus
 Emergency vehicle
 Police car (siren)
 Ambulance (siren)
 Fire engine, fire truck (siren)
 Motorcycle
 Traffic noise, roadway noise
 Rail transport
 Train
 Train whistle
 Train horn
 Railroad car, train wagon
 Train wheels squealing
 Subway, metro, underground
 Aircraft
 Aircraft engine
 Jet engine
 Propeller, airscrew
 Helicopter
 Fixed-wing aircraft, airplane
 Bicycle
 Skateboard
 Engine
 Light engine (high frequency)
 Dental drill, dentist's drill
 Lawn mower
 Chainsaw
 Medium engine (mid frequency)
 Heavy engine (low frequency)
 Engine knocking
 Engine starting
 Idling
 Accelerating, revving, vroom
 Door
 Doorbell
 Ding-dong
 Sliding door
 Slam
 Knock
 Tap
 Squeak
 Cupboard open or close
 Drawer open or close
 Dishes, pots, and pans
 Cutlery, silverware
 Chopping (food)
 Frying (food)
 Microwave oven
 Blender
 Water tap, faucet
 Sink (filling or washing)
 Bathtub (filling or washing)
 Hair dryer
 Toilet flush
 Toothbrush
 Electric toothbrush
 Vacuum cleaner
 Zipper (clothing)
 Keys jangling
 Coin (dropping)
 Scissors
 Electric shaver, electric razor
 Shuffling cards
 Typing
 Typewriter
 Computer keyboard
 Writing
 Alarm
 Telephone
 Telephone bell ringing
 Ringtone
 Telephone dialing, DTMF
 Dial tone
 Busy signal
 Alarm clock
 Siren
 Civil defense siren
 Buzzer
 Smoke detector, smoke alarm
 Fire alarm
 Foghorn
 Whistle
 Steam whistle
 Mechanisms
 Ratchet, pawl
 Clock
 Tick
 Tick-tock
 Gears
 Pulleys
 Sewing machine
 Mechanical fan
 Air conditioning
 Cash register
 Printer
 Camera
 Single-lens reflex camera
 Tools
 Hammer
 Jackhammer
 Sawing
 Filing (rasp)
 Sanding
 Power tool
 Drill
 Explosion
 Gunshot, gunfire
 Machine gun
 Fusillade
 Artillery fire
 Cap gun
 Fireworks
 Firecracker
 Burst, pop
 Eruption
 Boom
 Wood
 Chop
 Splinter
 Crack
 Glass
 Chink, clink
 Shatter
 Liquid
 Splash, splatter
 Slosh
 Squish
 Drip
 Pour
 Trickle, dribble
 Gush
 Fill (with liquid)
 Spray
 Pump (liquid)
 Stir
 Boiling
 Sonar
 Arrow
 Whoosh, swoosh, swish
 Thump, thud
 Thunk
 Electronic tuner
 Effects unit
 Chorus effect
 Basketball bounce
 Bang
 Slap, smack
 Whack, thwack
 Smash, crash
 Breaking
 Bouncing
 Whip
 Flap
 Scratch
 Scrape
 Rub
 Roll
 Crushing
 Crumpling, crinkling
 Tearing
 Beep, bleep
 Ping
 Ding
 Clang
 Squeal
 Creak
 Rustle
 Whir
 Clatter
 Sizzle
 Clicking
 Clickety-clack
 Rumble
 Plop
 Jingle, tinkle
 Hum
 Zing
 Boing
 Crunch
 Silence
 Sine wave
 Harmonic
 Chirp tone
 Sound effect
 Pulse
 Inside, small room
 Inside, large room or hall
 Inside, public space
 Outside, urban or manmade
 Outside, rural or natural
 Reverberation
 Echo
 Noise
 Environmental noise
 Static
 Mains hum
 Distortion
 Sidetone
 Cacophony
 White noise
 Pink noise
 Throbbing
 Vibration
 Television
 Radio
 Field recording
--- a/audio/examples/panns/audio_tag.py
+++ b/audio/examples/panns/audio_tag.py
@ -1,111 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import os
 from typing import List
 import numpy as np
 import paddle
 from paddleaudio.backends import load as load_audio
 from paddleaudio.features import melspectrogram
 from paddleaudio.models.panns import cnn14
 from paddleaudio.utils import logger
 # yapf: disable
 # Command-line interface of the tagging script. Arguments are parsed at
 # import time, so `args` is available to the code below.
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument(
    '--device',
    choices=['cpu', 'gpu'],
    default='gpu',
    help='Select which device to predict, defaults to gpu.')
 parser.add_argument(
    '--wav',
    type=str,
    required=True,
    help='Audio file to infer.')
 parser.add_argument(
    '--sample_duration',
    type=float,
    default=2.0,
    help='Duration(in seconds) of tagging samples to predict.')
 parser.add_argument(
    '--hop_duration',
    type=float,
    default=0.3,
    help='Duration(in seconds) between two samples.')
 parser.add_argument(
    '--output_dir',
    type=str,
    default='./output_dir',
    help='Directory to save tagging result.')
 args = parser.parse_args()
 # yapf: enable
 def split(waveform: np.ndarray, win_size: int, hop_size: int):
    """
    Cut a waveform into fixed-length, possibly overlapping windows.

    A window starts every `hop_size` samples; a window that runs past the end
    of the waveform is padded at its end with `np.pad` up to `win_size`.

    Returns:
        A tuple `(time, data)`: `time[k]` is the start offset of window `k`
        divided by the waveform length, `data[k]` is the window itself.
    """
    assert isinstance(waveform, np.ndarray)
    total = len(waveform)
    starts = range(0, total, hop_size)
    time = [start / total for start in starts]
    data = []
    for start in starts:
        chunk = waveform[start:start + win_size]
        shortfall = win_size - len(chunk)
        if shortfall > 0:
            # Tail window: extend to the full window size.
            chunk = np.pad(chunk, (0, shortfall))
        data.append(chunk)
    return time, data
 def batchify(data: List[List[float]],
             sample_rate: int,
             batch_size: int,
             **kwargs):
    """
    Turn waveforms into mel-spectrogram features and yield them in batches.

    Every waveform is passed to `melspectrogram` (extra keyword arguments are
    forwarded) and the result is transposed. Features are then yielded as
    lists of `batch_size` items; a final shorter list carries the remainder.
    """
    # All features are computed up front, before the first batch is yielded.
    examples = [
        melspectrogram(waveform, sample_rate, **kwargs).transpose()
        for waveform in data
    ]
    # Separate the features into batches.
    pending = []
    for feats in examples:
        pending.append(feats)
        if len(pending) != batch_size:
            continue
        yield pending
        pending = []
    if pending:
        yield pending
 def predict(model, data: List[List[float]], sample_rate: int,
            batch_size: int=1):
    """
    Use pretrained model to make predictions.

    Args:
        model: Network called on each feature batch; switched to eval mode.
        data: Waveforms to score, batched through `batchify`.
        sample_rate: Sample rate forwarded to feature extraction.
        batch_size: Number of waveforms per forward pass.

    Returns:
        The per-batch model outputs concatenated along the first axis as a
        numpy array, or None when `data` produces no batch.
    """
    model.eval()
    outputs = []
    # Inference only: no gradients are needed for the forward passes.
    with paddle.no_grad():
        for batch in batchify(data, sample_rate, batch_size):
            # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
            feats = paddle.to_tensor(batch).unsqueeze(1)
            outputs.append(model(feats).numpy())
    if not outputs:
        return None
    # Concatenate once instead of re-copying the accumulated array per batch.
    return np.concatenate(outputs)
 if __name__ == '__main__':
    # Tag one audio file: load it, cut it into windows, score every window
    # with the pretrained CNN14 model and save the scores as an .npz file.
    paddle.set_device(args.device)
    model = cnn14(pretrained=True, extract_embedding=False)
    waveform, sr = load_audio(args.wav, sr=None)
    time, data = split(waveform,
                       int(args.sample_duration * sr),
                       int(args.hop_duration * sr))
    results = predict(model, data, sr, batch_size=8)

    os.makedirs(args.output_dir, exist_ok=True)
    # Store the offsets returned by split(): there is exactly one per scored
    # window, whereas rebuilding them with a float-step np.arange can yield a
    # different number of entries than there are rows in `results`.
    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
    np.savez(output_file, time=np.asarray(time), scores=results)
    logger.info(f'Saved tagging results to {output_file}')
--- a/audio/examples/panns/parse_result.py
+++ b/audio/examples/panns/parse_result.py
@ -1,83 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import ast
 import os
 from typing import Dict
 import numpy as np
 from paddleaudio.utils import logger
 # yapf: disable
 # Command-line interface of the result parser. Arguments are parsed at
 # import time, so `args` is available to the code below.
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument('--tagging_file', type=str, required=True, help='File of tagging results (.npz) to parse.')
 parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
 # The value is evaluated as a Python literal, e.g. "True" or "False".
 parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
 parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
 parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
 parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
 args = parser.parse_args()
 # yapf: enable
 def smooth(results: np.ndarray, win_size: int):
    """
    Smooth the rows of `results` in place with a trailing moving average.

    Row `i` becomes the mean of rows `i - win_size + 1 .. i`, clipped at the
    first row. Nothing is returned; `results` itself is modified.
    """
    # Walk from the last row to the first so each average only reads rows
    # that have not been overwritten yet.
    for i in reversed(range(len(results))):
        left = max(0, i + 1 - win_size)
        window = results[left:i + 1]
        results[i] = np.sum(window, axis=0) / (i - left + 1)
 def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
    """
    Format the `k` highest scores of `result` as text.

    Returns:
        One `label: score` line per selected index, highest score first, each
        line ending with a newline. Labels are looked up in `label_map`.
    """
    result = np.asarray(result)
    # Sorting the negated scores ascending orders the indices by descending score.
    best = (-result).argsort()[:k]
    return ''.join(f'{label_map[idx]}: {result[idx]}\n' for idx in best)
 if __name__ == "__main__":
    # Convert a tagging result file into a text file that lists, for every
    # time step, the top-k labels and their scores.

    # Line number (0-based) in the label file -> label text.
    with open(args.label_file, 'r') as f:
        label_map = {i: l.strip() for i, l in enumerate(f)}

    # NOTE(review): allow_pickle=True lets a crafted file execute code while
    # loading; keep it only if tagging files may contain object arrays and
    # come from a trusted source.
    # The context manager closes the underlying .npz archive after reading.
    with np.load(args.tagging_file, allow_pickle=True) as results:
        times, scores = results['time'], results['scores']

    if args.smooth:
        logger.info('Posterior smoothing...')
        smooth(scores, win_size=args.smooth_size)

    os.makedirs(args.output_dir, exist_ok=True)
    # Drop only the final extension so names with several dots stay intact.
    stem = os.path.splitext(os.path.basename(args.tagging_file))[0]
    output_file = os.path.join(args.output_dir, stem + '.txt')
    with open(output_file, 'w') as f:
        for time, score in zip(times, scores):
            f.write(f'{time}\n')
            f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
    logger.info(f'Saved tagging labels to {output_file}')
--- a/audio/paddleaudio/datasets/aishell.py
+++ b/audio/paddleaudio/datasets/aishell.py
@ -1,154 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import codecs
 import collections
 import json
 import os
 from typing import Dict
 from paddle.io import Dataset
 from tqdm import tqdm
 from ..backends import load as load_audio
 from ..utils.download import decompress
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from ..utils.log import logger
 from .dataset import feat_funcs
 __all__ = ['AISHELL1']
 class AISHELL1(Dataset):
    """
    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
    It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including
    smart home, autonomous driving, and industrial production. The whole recording was
    put in quiet indoor environment, using 3 different devices at the same time: high
    fidelity microphone (44.1kHz, 16-bit); Android-system mobile phone (16kHz, 16-bit),
    iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled
    to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas
    in China were invited to participate in the recording. The manual transcription
    accuracy rate is above 95%, through professional speech annotation and strict
    quality inspection. The corpus is divided into training, development and testing
    sets.

    Reference:
        AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
        https://arxiv.org/abs/1709.05522
    """
    # Archives handed to download_and_decompress(). The attribute keeps its
    # original spelling so existing references to it continue to work.
    archieves = [
        {
            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
            'md5': '2f494334227864a8a8fec932999db9d8',
        },
    ]
    # The paths below are relative to DATA_HOME.
    text_meta = os.path.join('data_aishell', 'transcript',
                             'aishell_transcript_v0.8.txt')
    utt_info = collections.namedtuple('META_INFO',
                                      ('file_path', 'utt_id', 'text'))
    audio_path = os.path.join('data_aishell', 'wav')
    manifest_path = os.path.join('data_aishell', 'manifest')
    # Values accepted for the ``subset`` argument of ``__init__``.
    subset = ['train', 'dev', 'test']

    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            subset: One of the names listed in the class attribute ``subset``.
            feat_type: Key into ``feat_funcs`` selecting the feature extractor.
            **kwargs: Extra keyword arguments passed to the feature extractor.

        Raises:
            AssertionError: If ``subset`` is not a supported subset name.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        # From here on the instance attribute shadows the class-level list.
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(AISHELL1, self).__init__()

    def _get_text_info(self) -> Dict[str, str]:
        """Read the transcript file into a ``{utt_id: text}`` mapping.

        All whitespace inside a transcript is removed.
        """
        ret = {}
        # Decode explicitly as UTF-8 (the encoding create_manifest() writes)
        # instead of depending on the platform's default locale encoding.
        with open(
                os.path.join(DATA_HOME, self.text_meta), 'r',
                encoding='utf-8') as rf:
            # NOTE(review): the first line is skipped, as before. Confirm that
            # the transcript really starts with a header line.
            for line in rf.readlines()[1:]:
                utt_id, text = map(str.strip, line.split(' ', 1))
                ret[utt_id] = ''.join(text.split())
        return ret

    def _get_data(self):
        """Make sure the corpus is available and list the selected subset.

        Returns:
            A list of ``utt_info`` tuples, one per ``.wav`` file of the subset
            that has a transcript.
        """
        audio_root = os.path.join(DATA_HOME, self.audio_path)
        if not os.path.isdir(audio_root) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
            download_and_decompress(self.archieves, DATA_HOME)
            # Extract *wav from *.tar.gz.
            for root, _, files in os.walk(audio_root):
                for file in files:
                    if file.endswith('.tar.gz'):
                        decompress(os.path.join(root, file))
                        os.remove(os.path.join(root, file))

        text_info = self._get_text_info()
        data = []
        for root, _, files in os.walk(os.path.join(audio_root, self.subset)):
            for file in files:
                if not file.endswith('.wav'):
                    continue
                utt_id = os.path.splitext(file)[0]
                if utt_id not in text_info:  # Some utterances have no label
                    continue
                data.append(
                    self.utt_info(
                        os.path.join(root, file), utt_id, text_info[utt_id]))
        return data

    def _convert_to_record(self, idx: int):
        """Build the record dict of sample ``idx``.

        The record holds every ``utt_info`` field plus ``feat`` (the extracted
        feature, or the waveform when the selected feature function is falsy)
        and ``duration`` (``len(waveform) / sr``).
        """
        sample = self._data[idx]
        # To show all fields in a namedtuple: `type(sample)._fields`
        record = {
            field: getattr(sample, field)
            for field in type(sample)._fields
        }
        waveform, sr = load_audio(sample.file_path)
        feat_func = feat_funcs[self.feat_type]
        if feat_func:
            feat = feat_func(waveform, sample_rate=sr, **self.feat_config)
        else:
            feat = waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """Write one JSON line per sample to ``<manifest_path>/<prefix>.<subset>``.

        The manifest directory is created under DATA_HOME when it is missing.
        """
        manifest_dir = os.path.join(DATA_HOME, self.manifest_path)
        if not os.path.isdir(manifest_dir):
            os.makedirs(manifest_dir)
        manifest_file = os.path.join(manifest_dir, f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text']
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        """Return the values of the record of sample ``idx`` as a tuple."""
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        """Return the number of samples in the selected subset."""
        return len(self._data)
--- a/audio/paddleaudio/datasets/dcase.py
+++ b/audio/paddleaudio/datasets/dcase.py
@ -1,298 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
 import os
 from typing import List
 from typing import Tuple
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset
 __all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
 class UrbanAcousticScenes(AudioClassificationDataset):
    """
    TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
    12 European cities in 10 different acoustic scenes using 4 different devices.
    Additionally, synthetic data for 11 mobile devices was created based on the original
    recordings. Of the 12 cities, two are present only in the evaluation set.

    Reference:
        A multi-device dataset for urban acoustic scene classification
        https://arxiv.org/abs/1807.09840
    """
    source_url = 'https://zenodo.org/record/3819968/files/'
    base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
    # One metadata archive followed by the sixteen audio archives.
    archieves = [
        {'url': source_url + base_name + '.meta.zip',
         'md5': '6eae9db553ce48e4ea246e34e50a3cf5'},
        {'url': source_url + base_name + '.audio.1.zip',
         'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8'},
        {'url': source_url + base_name + '.audio.2.zip',
         'md5': '4310a13cc2943d6ce3f70eba7ba4c784'},
        {'url': source_url + base_name + '.audio.3.zip',
         'md5': 'ed38956c4246abb56190c1e9b602b7b8'},
        {'url': source_url + base_name + '.audio.4.zip',
         'md5': '97ab8560056b6816808dedc044dcc023'},
        {'url': source_url + base_name + '.audio.5.zip',
         'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb'},
        {'url': source_url + base_name + '.audio.6.zip',
         'md5': 'fbf856a3a86fff7520549c899dc94372'},
        {'url': source_url + base_name + '.audio.7.zip',
         'md5': '0dbffe7b6e45564da649378723284062'},
        {'url': source_url + base_name + '.audio.8.zip',
         'md5': 'bb6f77832bf0bd9f786f965beb251b2e'},
        {'url': source_url + base_name + '.audio.9.zip',
         'md5': 'a65596a5372eab10c78e08a0de797c9e'},
        {'url': source_url + base_name + '.audio.10.zip',
         'md5': '2ad595819ffa1d56d2de4c7ed43205a6'},
        {'url': source_url + base_name + '.audio.11.zip',
         'md5': '0ad29f7040a4e6a22cfd639b3a6738e5'},
        {'url': source_url + base_name + '.audio.12.zip',
         'md5': 'e5f4400c6b9697295fab4cf507155a2f'},
        {'url': source_url + base_name + '.audio.13.zip',
         'md5': '8855ab9f9896422746ab4c5d89d8da2f'},
        {'url': source_url + base_name + '.audio.14.zip',
         'md5': '092ad744452cd3e7de78f988a3d13020'},
        {'url': source_url + base_name + '.audio.15.zip',
         'md5': '4b5eb85f6592aebf846088d9df76b420'},
        {'url': source_url + base_name + '.audio.16.zip',
         'md5': '2e0a89723e58a3836be019e6996ae460'},
    ]
    # The position of a scene name in this list is its integer class label.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]
    # Metadata paths are relative to DATA_HOME.
    meta = os.path.join(base_name, 'meta.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'scene_label', 'identifier', 'source_label'))
    subset_meta = {
        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
        'dev': os.path.join(base_name, 'evaluation_setup',
                            'fold1_evaluate.csv'),
        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
                                              ('filename', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode; it must be a key of `subset_meta`.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode)
        super(UrbanAcousticScenes, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        """
        Parse a tab-separated metadata file into a list of named tuples.

        With `subset=None` the global `meta` file is read into `meta_info`
        tuples; otherwise the file of that subset is read into
        `subset_meta_info` tuples. The first line is dropped when
        `skip_header` is true.
        """
        if subset is not None:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            record_type = self.subset_meta_info
        else:
            meta_file = self.meta
            record_type = self.meta_info

        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()
        if skip_header:
            lines = lines[1:]
        return [record_type(*line.strip().split('\t')) for line in lines]

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """
        Download the dataset if necessary and list the samples of `mode`.

        Returns the audio file paths and, aligned with them, the index of each
        sample's scene label in `label_list`.
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
            download_and_decompress(self.archieves, DATA_HOME)

        files, labels = [], []
        for sample in self._get_meta_info(subset=mode, skip_header=True):
            filename, label = sample[:2]
            target = self.label_list.index(label)
            # Only the base name is kept; it is resolved inside `audio_path`.
            files.append(
                os.path.join(DATA_HOME, self.audio_path,
                             os.path.basename(filename)))
            labels.append(int(target))
        return files, labels
 class UrbanAudioVisualScenes(AudioClassificationDataset):
    """
    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
    and video recordings from 12 European cities in 10 different scenes.
    This dataset consists of 10-seconds audio and video segments from 10
    acoustic scenes. The total amount of audio in the development set is 34 hours.

    Reference:
        A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
        https://arxiv.org/abs/2011.00030
    """
    source_url = 'https://zenodo.org/record/4477542/files/'
    base_name = 'TAU-urban-audio-visual-scenes-2021-development'
    # One metadata archive followed by the eight audio archives.
    archieves = [
        {'url': source_url + base_name + '.meta.zip',
         'md5': '76e3d7ed5291b118372e06379cb2b490'},
        {'url': source_url + base_name + '.audio.1.zip',
         'md5': '186f6273f8f69ed9dbdc18ad65ac234f'},
        {'url': source_url + base_name + '.audio.2.zip',
         'md5': '7fd6bb63127f5785874a55aba4e77aa5'},
        {'url': source_url + base_name + '.audio.3.zip',
         'md5': '61396bede29d7c8c89729a01a6f6b2e2'},
        {'url': source_url + base_name + '.audio.4.zip',
         'md5': '6ddac89717fcf9c92c451868eed77fe1'},
        {'url': source_url + base_name + '.audio.5.zip',
         'md5': 'af4820756cdf1a7d4bd6037dc034d384'},
        {'url': source_url + base_name + '.audio.6.zip',
         'md5': 'ebd11ec24411f2a17a64723bd4aa7fff'},
        {'url': source_url + base_name + '.audio.7.zip',
         'md5': '2be39a76aeed704d5929d020a2909efd'},
        {'url': source_url + base_name + '.audio.8.zip',
         'md5': '972d8afe0874720fc2f28086e7cb22a9'},
    ]
    # The position of a scene name in this list is its integer class label.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]
    # Metadata paths are relative to DATA_HOME.
    meta_base_path = os.path.join(base_name, base_name + '.meta')
    meta = os.path.join(meta_base_path, 'meta.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename_audio', 'filename_video', 'scene_label', 'identifier'))
    subset_meta = {
        'train': os.path.join(meta_base_path, 'evaluation_setup',
                              'fold1_train.csv'),
        'dev': os.path.join(meta_base_path, 'evaluation_setup',
                            'fold1_evaluate.csv'),
        'test': os.path.join(meta_base_path, 'evaluation_setup',
                             'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple(
        'SUBSET_META_INFO',
        ('filename_audio', 'filename_video', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode; it must be a key of `subset_meta`.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode)
        super(UrbanAudioVisualScenes, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        """
        Parse a tab-separated metadata file into a list of named tuples.

        With `subset=None` the global `meta` file is read into `meta_info`
        tuples; otherwise the file of that subset is read into
        `subset_meta_info` tuples. The first line is dropped when
        `skip_header` is true.
        """
        if subset is not None:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            record_type = self.subset_meta_info
        else:
            meta_file = self.meta
            record_type = self.meta_info

        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()
        if skip_header:
            lines = lines[1:]
        return [record_type(*line.strip().split('\t')) for line in lines]

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """
        Download the dataset if necessary and list the samples of `mode`.

        Returns the audio file paths and, aligned with them, the index of each
        sample's scene label in `label_list`.
        """
        audio_dir = os.path.join(DATA_HOME, self.audio_path)
        meta_file = os.path.join(DATA_HOME, self.meta)
        if not (os.path.isdir(audio_dir) and os.path.isfile(meta_file)):
            # Here the archives are unpacked into DATA_HOME/<base_name>.
            download_and_decompress(self.archieves,
                                    os.path.join(DATA_HOME, self.base_name))

        files, labels = [], []
        for sample in self._get_meta_info(subset=mode, skip_header=True):
            # The second column (video file name) is not used.
            filename, _, label = sample[:3]
            target = self.label_list.index(label)
            files.append(
                os.path.join(DATA_HOME, self.audio_path,
                             os.path.basename(filename)))
            labels.append(int(target))
        return files, labels
--- a/audio/paddleaudio/datasets/librispeech.py
+++ b/audio/paddleaudio/datasets/librispeech.py
@ -1,199 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import codecs
 import collections
 import json
 import os
 from typing import Dict
 from paddle.io import Dataset
 from tqdm import tqdm
 from ..backends import load as load_audio
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from ..utils.log import logger
 from .dataset import feat_funcs
 __all__ = ['LIBRISPEECH']
 class LIBRISPEECH(Dataset):
    """
    LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
    prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
    derived from read audiobooks from the LibriVox project, and has been carefully
    segmented and aligned.

    Reference:
        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
    """
    source_url = 'http://www.openslr.org/resources/12/'
    # Archives handed to download_and_decompress(). The attribute keeps its
    # original spelling so existing references to it continue to work.
    archieves = [
        {
            'url': source_url + 'train-clean-100.tar.gz',
            'md5': '2a93770f6d5c6c964bc36631d331a522',
        },
        {
            'url': source_url + 'train-clean-360.tar.gz',
            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
        },
        {
            'url': source_url + 'train-other-500.tar.gz',
            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
        },
        {
            'url': source_url + 'dev-clean.tar.gz',
            'md5': '42e2234ba48799c1f50f24a7926300a1',
        },
        {
            'url': source_url + 'dev-other.tar.gz',
            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
        },
        {
            'url': source_url + 'test-clean.tar.gz',
            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
        },
        {
            'url': source_url + 'test-other.tar.gz',
            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
        },
    ]
    # The paths below are relative to DATA_HOME.
    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
    utt_info = collections.namedtuple('META_INFO', (
        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
    audio_path = 'LibriSpeech'
    manifest_path = os.path.join('LibriSpeech', 'manifest')
    # Values accepted for the ``subset`` argument of ``__init__``. The names
    # mirror the archive names above; the 500-hour set is 'train-other-500'.
    subset = [
        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
        'dev-other', 'test-clean', 'test-other'
    ]

    def __init__(self,
                 subset: str='train-clean-100',
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            subset: One of the names listed in the class attribute ``subset``.
            feat_type: Key into ``feat_funcs`` selecting the feature extractor.
            **kwargs: Extra keyword arguments passed to the feature extractor.

        Raises:
            AssertionError: If ``subset`` is not a supported subset name.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        # From here on the instance attribute shadows the class-level list.
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(LIBRISPEECH, self).__init__()

    def _get_speaker_info(self) -> Dict[str, str]:
        """Read the speaker file into a ``{spk_id: gender}`` mapping.

        Lines containing ``;`` are skipped; for the others the first two
        ``|``-separated columns are used.
        """
        ret = {}
        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
            for line in rf.readlines():
                if ';' in line:  # Skip dataset abstract
                    continue
                spk_id, gender = map(str.strip, line.split('|')[:2])
                ret[spk_id] = gender
        return ret

    def _get_text_info(self, trans_file) -> Dict[str, str]:
        """Read one transcript file into a ``{utt_id: text}`` mapping.

        Each line is split at its first space into utterance id and text.
        """
        ret = {}
        with open(trans_file, 'r') as rf:
            for line in rf.readlines():
                utt_id, text = map(str.strip, line.split(' ', 1))
                ret[utt_id] = text
        return ret

    def _get_data(self):
        """Make sure the corpus is available and list the selected subset.

        Returns:
            A list of ``utt_info`` tuples, one per ``.flac`` file of the
            subset that has both a transcript and a speaker entry.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
            # NOTE(review): the third argument presumably sets how many
            # archives are handled in parallel - confirm against
            # download_and_decompress.
            download_and_decompress(self.archieves, DATA_HOME,
                                    len(self.archieves))

        # Speaker info
        speaker_info = self._get_speaker_info()

        # Walk the subset once: gather every transcript and remember the audio
        # files, which are matched only after all transcripts are known.
        text_info = {}
        audio_files = []
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.trans.txt'):
                    text_info.update(
                        self._get_text_info(os.path.join(root, file)))
                elif file.endswith('.flac'):
                    audio_files.append((root, file))

        data = []
        for root, file in audio_files:
            utt_id = os.path.splitext(file)[0]
            # The speaker id is the part of the utterance id before the
            # first '-'.
            spk_id = utt_id.split('-')[0]
            if utt_id not in text_info \
                or spk_id not in speaker_info:  # Skip samples with incomplete data
                continue
            data.append(
                self.utt_info(
                    os.path.join(root, file), utt_id, text_info[utt_id],
                    spk_id, speaker_info[spk_id]))
        return data

    def _convert_to_record(self, idx: int):
        """Build the record dict of sample ``idx``.

        The record holds every ``utt_info`` field plus ``feat`` (the extracted
        feature, or the waveform when the selected feature function is falsy)
        and ``duration`` (``len(waveform) / sr``).
        """
        sample = self._data[idx]
        # To show all fields in a namedtuple: `type(sample)._fields`
        record = {
            field: getattr(sample, field)
            for field in type(sample)._fields
        }
        waveform, sr = load_audio(sample.file_path)
        feat_func = feat_funcs[self.feat_type]
        if feat_func:
            feat = feat_func(waveform, sample_rate=sr, **self.feat_config)
        else:
            feat = waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """Write one JSON line per sample to ``<manifest_path>/<prefix>.<subset>``.

        The manifest directory is created under DATA_HOME when it is missing.
        """
        manifest_dir = os.path.join(DATA_HOME, self.manifest_path)
        if not os.path.isdir(manifest_dir):
            os.makedirs(manifest_dir)
        manifest_file = os.path.join(manifest_dir, f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text'],
                        'spk': record['spk_id'],
                        'gender': record['spk_gender'],
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        """Return the values of the record of sample ``idx`` as a tuple."""
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        """Return the number of samples in the selected subset."""
        return len(self._data)
--- a/audio/paddleaudio/datasets/ravdess.py
+++ b/audio/paddleaudio/datasets/ravdess.py
@ -1,136 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
 import os
 import random
 from typing import List
 from typing import Tuple
 from ..utils.download import download_and_decompress
 from ..utils.env import DATA_HOME
 from .dataset import AudioClassificationDataset
 __all__ = ['RAVDESS']
 class RAVDESS(AudioClassificationDataset):
    """
    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
    lexically-matched statements in a neutral North American accent. Speech emotions
    includes calm, happy, sad, angry, fearful, surprise, and disgust expressions.
    Each expression is produced at two levels of emotional intensity (normal, strong),
    with an additional neutral expression.

    Reference:
        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
        A dynamic, multimodal set of facial and vocal expressions in North American English
        https://doi.org/10.1371/journal.pone.0196391
    """
    # Archives handed to download_and_decompress(). The attribute keeps its
    # original spelling so existing references to it continue to work.
    archieves = [
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
            'md5':
            '5411230427d67a21e18aa4d466e6d1b9',
        },
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
            'md5':
            'bc696df654c87fed845eb13823edef8a',
        },
    ]
    # The class label of a sample is the emotion code of its file name minus 1.
    label_list = [
        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
        'surprised'
    ]
    # Fields encoded, '-'-separated, in the base name of every audio file.
    meta_info = collections.namedtuple(
        'META_INFO', ('modality', 'vocal_channel', 'emotion',
                      'emotion_intensity', 'statement', 'repitition', 'actor'))
    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset, from 1 to `n_folds`.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.

        Raises:
            AssertionError: If `split` is not within 1..`n_folds`.
        """
        # A split below 1 matches no fold and would silently give an empty
        # dev set, so both bounds are checked.
        assert 1 <= split <= n_folds, f'The selected split should be in [1, n_folds], but got split={split}, n_folds={n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(RAVDESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Parse the base name of every file into a `meta_info` tuple."""
        ret = []
        for file in files:
            # Drop the 4-character extension, e.g. '.wav'.
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('-')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """
        Download the data if necessary and select the samples of `mode`.

        All `.wav` files of the speech and song folders are shuffled with
        `seed` and cut into `n_folds` consecutive folds. Fold `split` is the
        dev set; the remaining folds are the train set.

        Returns the selected file paths and their integer emotion labels.
        """
        # The download only runs when neither folder exists.
        if not os.path.isdir(self.speech_path) and not os.path.isdir(
                self.song_path):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for folder in (self.speech_path, self.song_path):
            for root, _, files in os.walk(folder):
                for file in files:
                    if file.endswith('.wav'):
                        wav_files.append(os.path.join(root, file))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        # At least 1, so fewer samples than folds cannot divide by zero.
        n_samples_per_fold = max(len(meta_info) // n_folds, 1)
        for idx, sample in enumerate(meta_info):
            target = int(sample.emotion) - 1
            # Samples left over by the integer division join the last fold
            # instead of forming an extra fold that is never used for dev.
            fold = min(idx // n_samples_per_fold + 1, n_folds)
            in_dev_fold = fold == split
            if (mode == 'train') != in_dev_fold:
                files.append(wav_files[idx])
                labels.append(target)
        return files, labels
--- a/audio/test/README.md
+++ b/audio/test/README.md
@ -1,41 +0,0 @@
 # PaddleAudio Testing Guide
 # Testing
 First, clone the project:
 ```
 git clone https://github.com/PaddlePaddle/models.git
 ```
 Then install the project in your virtual environment.
 ```
 cd models/PaddleAudio
 python setup.py bdist_wheel
 pip install -e .[dev]
 ```
 The requirements for testing will be installed along with PaddleAudio.  
 Now run
 ```
 pytest test
 ```
 If everything works, you will see output like this:
 ```
 platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
 rootdir: ./models/PaddleAudio
 plugins: hydra-core-1.0.6
 collected 16 items  
 test/unit_test/test_backend.py ...........                                                                         [ 68%]
 test/unit_test/test_features.py .....                                                                              [100%]
 ==================================================== warnings summary ====================================================
 .
 .
 .
 -- Docs: https://docs.pytest.org/en/stable/warnings.html
 ============================================ 16 passed, 11 warnings in 6.76s =============================================
 ```
--- a/audio/test/unit_test/test_backend.py
+++ b/audio/test/unit_test/test_backend.py
@ -1,113 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import librosa
 import numpy as np
 import paddleaudio
 import pytest
 TEST_FILE = './test/data/test_audio.wav'
 def relative_err(a, b, real=True):
     """Return the squared difference of `a` and `b`, normalised by their energies.

     The result is ``sum((a - b)**2) / (EPS + sum(a**2) + sum(b**2))``.  When
     `real` is false the same ratio is evaluated separately on the real and
     imaginary parts of the operands and the two ratios are added.
     """
     if not real:
         # Complex operands: score each component on its own, then combine.
         real_part = np.sum((a.real - b.real)**2) / (
             EPS + np.sum(a.real**2) + np.sum(b.real**2))
         imag_part = np.sum((a.imag - b.imag)**2) / (
             EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
         return real_part + imag_part

     numerator = np.sum((a - b)**2)
     # EPS keeps the denominator non-zero when both operands are all zeros.
     denominator = EPS + np.sum(a**2) + np.sum(b**2)
     return numerator / denominator
def load_audio():
    """Load the reference clip with librosa at 16 kHz.

    Returns:
        tuple: ``(x, r)`` exactly as returned by ``librosa.load`` -- the
        decoded samples and the sample rate.

    DeprecationWarnings raised while decoding are silenced here with
    ``warnings.catch_warnings``.  The previous
    ``@pytest.mark.filterwarnings`` decorator had no effect: pytest only
    applies marks to collected test items, and this helper is called directly
    at import time.
    """
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', DeprecationWarning)
        x, r = librosa.load(TEST_FILE, sr=16000)
    # Printed so the statistics of the reference signal show up in the log.
    print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
    return x, r
 # start testing
 # Module-level fixture: the reference waveform and its sample rate are loaded
 # once at import time and shared by the tests below.
 x, r = load_audio()
 # Tolerance used by relative_err and by the numeric assertions in this module.
 EPS = 1e-8
 def test_load():
     """Check the sample rate, dtype and length reported by ``paddleaudio.load``."""
     # Plain load: only the sample rate is requested.
     wav, rate = paddleaudio.load(TEST_FILE, sr=16000)
     assert rate == 16000
     assert wav.dtype == 'float32'

     # Excerpt requested with offset=1 and duration=2, decoded as int16; the
     # returned length must correspond to exactly 2 seconds of samples.
     excerpt_args = dict(sr=16000, offset=1, duration=2, dtype='int16')
     wav, rate = paddleaudio.load(TEST_FILE, **excerpt_args)
     assert len(wav) / rate == 2.0
     assert rate == 16000
     assert wav.dtype == 'int16'
 def test_depth_convert():
     """Check ``paddleaudio.depth_convert`` for the int16 and int8 targets.

     For each target dtype the converted signal must keep its length, carry the
     requested dtype, stay inside the integer range of that dtype and not be
     constant (standard deviation above EPS).
     """
     # (target dtype, smallest allowed value, largest allowed value)
     targets = (
         ('int16', -32768, 32767),
         ('int8', -128, 127),
     )
     for dtype, lowest, highest in targets:
         y = paddleaudio.depth_convert(x, dtype)
         assert len(y) == len(x)
         assert y.dtype == dtype
         assert np.max(y) <= highest
         assert np.min(y) >= lowest
         assert np.std(y) > EPS
 # Resampling cases as (target sample rate, resampling mode) pairs.
 rs_test_data = [(rate, 'kaiser_fast') for rate in (32000, 16000, 8000)] + [
     (rate, 'kaiser_best') for rate in (32000, 16000, 8000, 22050, 44100)
 ]


 @pytest.mark.parametrize('sr,mode', rs_test_data)
 def test_resample(sr, mode):
     """Check that resampling from 16 kHz to `sr` scales the length by sr / 16000."""
     resampled = paddleaudio.resample(x, 16000, sr, mode=mode)
     expected_len = len(x) * (sr / 16000)
     # The two lengths are compared as scalars through relative_err.
     err = relative_err(len(resampled), expected_len)
     print('err:', err)
     assert err < EPS
 def test_normalize():
     """Check the peak of 'linear' and the spread of 'gaussian' normalisation."""
     # Linear normalisation: the maximum must respect the requested factor.
     scaled = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
     assert np.max(scaled) < 0.5 + EPS

     scaled = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
     assert np.max(scaled) <= 2.0 + EPS

     # Gaussian normalisation: the standard deviation must be 1 within EPS.
     scaled = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
     spread = np.std(scaled)
     print('np.std(y):', spread)
     assert np.abs(spread - 1.0) < EPS
 if __name__ == '__main__':
     # Direct execution: run every test once, without pytest.  Only a single
     # resample case is exercised here.
     for case in (test_load, test_depth_convert):
         case()
     test_resample(22050, 'kaiser_fast')
     test_normalize()
--- a/audio/test/unit_test/test_features.py
+++ b/audio/test/unit_test/test_features.py
@ -1,143 +0,0 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import librosa
 import numpy as np
 import paddleaudio as pa
 import pytest
def load_audio():
    """Load the reference clip with librosa at 16 kHz.

    Returns:
        tuple: ``(x, r)`` exactly as returned by ``librosa.load`` -- the
        decoded samples and the sample rate.

    DeprecationWarnings raised while decoding are silenced here with
    ``warnings.catch_warnings``.  The previous
    ``@pytest.mark.filterwarnings`` decorator had no effect: pytest only
    applies marks to collected test items, and this helper is called directly
    at import time.
    """
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', DeprecationWarning)
        # sr is passed explicitly: the feature tests in this module all declare
        # sr=16000, whereas librosa.load without `sr` resamples to its own
        # default rate (22050 Hz).
        x, r = librosa.load('./test/data/test_audio.wav', sr=16000)
    return x, r
 ## start testing
 # Module-level fixture: the reference waveform and its sample rate are loaded
 # once at import time and shared by the tests below.
 x, r = load_audio()
 # Tolerance used by relative_err and by the assertions in this module.
 EPS = 1e-8
 def relative_err(a, b, real=True):
     """Return the squared difference of `a` and `b`, normalised by their energies.

     With `real` true the operands are compared directly.  Otherwise their real
     and imaginary parts are compared separately and the two ratios are summed.
     """

     def _ratio(u, v):
         # EPS keeps the denominator non-zero when both operands are all zeros.
         return np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))

     if real:
         return _ratio(a, b)
     return _ratio(a.real, b.real) + _ratio(a.imag, b.imag)
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
    """Compare ``pa.melspectrogram`` (no dB conversion) with librosa's output."""
    # Arguments that both implementations accept under the same name.
    shared = dict(sr=16000, hop_length=320, n_mels=64, fmin=50)

    ours = pa.melspectrogram(x, window_size=512, to_db=False, **shared)
    reference = librosa.feature.melspectrogram(
        x, n_fft=512, win_length=512, **shared)

    assert relative_err(ours, reference) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
    """Compare ``pa.melspectrogram(to_db=True)`` with librosa mel + ``pa.power_to_db``."""
    # Arguments shared by both mel-spectrogram implementations.
    mel_args = dict(sr=16000, hop_length=320, n_mels=64, fmin=50)
    # dB-conversion settings, used identically on both sides of the comparison.
    db_args = dict(ref=1.0, amin=1e-10, top_db=None)

    ours = pa.melspectrogram(
        x, window_size=512, to_db=True, **mel_args, **db_args)

    power = librosa.feature.melspectrogram(
        x, n_fft=512, win_length=512, **mel_args)
    reference = pa.power_to_db(power, **db_args)

    assert relative_err(ours, reference) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
    """Compare ``pa.stft`` with ``librosa.stft`` in shape and in complex value."""
    stft_args = dict(n_fft=1024, hop_length=320, win_length=512)
    ours = pa.stft(x, **stft_args)
    reference = librosa.stft(x, **stft_args)

    assert ours.shape == reference.shape
    # real=False: real and imaginary parts are scored separately.
    assert relative_err(ours, reference, real=False) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
    """Compare ``pa.split_frames`` with ``librosa.util.frame`` on the same framing."""
    framing = dict(frame_length=512, hop_length=320)
    reference = librosa.util.frame(x, **framing)
    ours = pa.split_frames(x, **framing)
    assert relative_err(reference, ours) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
    """Compare ``pa.mfcc`` with ``librosa.feature.mfcc`` fed a librosa mel spectrogram."""
    # Mel-spectrogram options forwarded to pa.mfcc as keyword arguments.
    mel_kwargs = {
        'window_size': 512,
        'hop_length': 320,
        'n_mels': 64,
        'fmin': 50,
        'to_db': False
    }
    # Cepstral options, passed identically to both implementations.
    cepstral = dict(n_mfcc=20, dct_type=2, norm='ortho', lifter=0)

    # No sample_rate argument is passed to pa.mfcc.
    ours = pa.mfcc(x, spect=None, **cepstral, **mel_kwargs)

    mel = librosa.feature.melspectrogram(
        x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    reference = librosa.feature.mfcc(x, sr=16000, S=mel, **cepstral)

    assert relative_err(ours, reference) < EPS
 if __name__ == '__main__':
     # Direct execution: run every feature test once, in order, without pytest.
     for case in (test_melspectrogram, test_melspectrogram_db, test_stft,
                  test_split_frames, test_mfcc):
         case()
--- a/examples/dataset/aidatatang_200zh/.gitignore
+++ b/examples/dataset/aidatatang_200zh/.gitignore
--- a/examples/dataset/aidatatang_200zh/README.md
+++ b/examples/dataset/aidatatang_200zh/README.md
--- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
--- a/examples/dataset/aishell/.gitignore
+++ b/examples/dataset/aishell/.gitignore
--- a/examples/dataset/aishell/README.md
+++ b/examples/dataset/aishell/README.md
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
--- a/examples/dataset/aishell3/README.md
+++ b/examples/dataset/aishell3/README.md
--- a/examples/dataset/chime3_background/chime3_background.py
+++ b/examples/dataset/chime3_background/chime3_background.py
--- a/examples/dataset/gigaspeech/.gitignore
+++ b/examples/dataset/gigaspeech/.gitignore
--- a/examples/dataset/gigaspeech/README.md
+++ b/examples/dataset/gigaspeech/README.md
--- a/examples/dataset/gigaspeech/gigaspeech.py
+++ b/examples/dataset/gigaspeech/gigaspeech.py
--- a/examples/dataset/gigaspeech/run.sh
+++ b/examples/dataset/gigaspeech/run.sh
--- a/examples/dataset/librispeech/.gitignore
+++ b/examples/dataset/librispeech/.gitignore
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
--- a/examples/dataset/magicdata/README.md
+++ b/examples/dataset/magicdata/README.md
--- a/examples/dataset/mini_librispeech/.gitignore
+++ b/examples/dataset/mini_librispeech/.gitignore
--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
--- a/examples/dataset/multi_cn/README.md
+++ b/examples/dataset/multi_cn/README.md
--- a/examples/dataset/musan/.gitignore
+++ b/examples/dataset/musan/.gitignore
--- a/examples/dataset/musan/musan.py
+++ b/examples/dataset/musan/musan.py
--- a/examples/dataset/primewords/README.md
+++ b/examples/dataset/primewords/README.md
--- a/examples/dataset/rir_noise/.gitignore
+++ b/examples/dataset/rir_noise/.gitignore
--- a/examples/dataset/rir_noise/rir_noise.py
+++ b/examples/dataset/rir_noise/rir_noise.py
--- a/examples/dataset/st-cmds/README.md
+++ b/examples/dataset/st-cmds/README.md
--- a/examples/dataset/ted_en_zh/.gitignore
+++ b/examples/dataset/ted_en_zh/.gitignore
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
--- a/examples/dataset/thchs30/.gitignore
+++ b/examples/dataset/thchs30/.gitignore
--- a/examples/dataset/thchs30/README.md
+++ b/examples/dataset/thchs30/README.md
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
--- a/examples/dataset/timit/.gitignore
+++ b/examples/dataset/timit/.gitignore
--- a/examples/dataset/timit/timit.py
+++ b/examples/dataset/timit/timit.py
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
--- a/examples/dataset/voxforge/run_data.sh
+++ b/examples/dataset/voxforge/run_data.sh
@ -1,10 +1,10 @@
 #! /usr/bin/env bash
-TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge
+TARGET_DIR=${MAIN_ROOT}/dataset/voxforge
 mkdir -p ${TARGET_DIR}
 # download data, generate manifests
-python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \
+python ${MAIN_ROOT}/dataset/voxforge/voxforge.py \
 --manifest_prefix="${TARGET_DIR}/manifest" \
 --target_dir="${TARGET_DIR}" \
 --is_merge_dialect=True \
--- a/examples/dataset/voxforge/voxforge.py
+++ b/examples/dataset/voxforge/voxforge.py
--- a/demos/style_fs2/style_syn.py
+++ b/demos/style_fs2/style_syn.py
@ -34,7 +34,9 @@ def evaluate(args, fastspeech2_config, pwg_config):
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
-            utt_id, sentence = line.strip().split()
+            items = line.strip().split()
            utt_id = items[0]
            sentence = "".join(items[1:])
            sentences.append((utt_id, sentence))
    with open(args.phones_dict, "r") as f:
--- a/docs/source/reference.md
+++ b/docs/source/reference.md
@ -6,7 +6,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks
 - Apache-2.0 License
 - python/shell `utils`
 - kaldi feat preprocessing
- data pipe line and `transform`
+- data pipe line and `transformer`
 - some tts models, like `fastspeech2` and GAN-based `vocoder`
 * [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE)
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@ -1,16 +1,18 @@
 # Released Models
 ## Speech-to-Text Models
 ### Acoustic Model Released in paddle 2.X
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
 :-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
-[Ds2 Online Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.2/aishell/s0/ds2_online_aishll_CER8.02_release.tar.gz) | Aishell Dataset | Char-based | 345 MB  | 2 Conv + 5 LSTM layers with only forward direction | 0.080218 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0)
+[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB  | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0)
-[Ds2 Offline Aishell ASR0 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
+[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
 [Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
 [Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
 [Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB  | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
-[Transformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s1/librispeech.s1.transformer.all.wer5p62.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0456 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
+[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
-[Transformer Librispeech ASR2 Model](https://deepspeech.bj.bcebos.com/release2.2/librispeech/s2/libri_transformer_espnet_wer3p84.release.tar.gz) | Librispeech Dataset | subword-based | 131 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0384 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
+[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB  | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
 ### Acoustic Model Transformed from paddle 1.8
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
@ -20,14 +22,15 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
 [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
 ### Language Model Released
 Language Model | Training Data | Token-based | Size | Descriptions
 :-------------:| :------------:| :-----: | -----: | :-----------------
 [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) |  [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie'  binary with '-a 22 -q 8 -b 8'
 [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
 [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
 ## Text-to-Speech Models
 ### Acoustic Models
 Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size(static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
@ -40,7 +43,6 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa
 FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
 ### Vocoders
 Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
 WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
--- a/docs/source/tts/README.md
+++ b/docs/source/tts/README.md
@ -5,20 +5,6 @@ Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-spee
  <img src="../../images/logo.png" width=300 /> <br>
 </div>
 ## News  <img src="../../images/news_icon.png" width="40"/>
 - Oct-12-2021, Refactor examples code.
 - Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
 - Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
 - Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
 - Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
 - Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
 - Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
 - Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
 - Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
 - Jul-01-2021, Montreal-Forced-Aligner. Check  [examples/use_mfa](./examples/use_mfa).
 - May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).
 ## Overview
 In order to facilitate exploiting the existing TTS models directly and developing the new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common modules sharing, model configuration, and the process of training and synthesis. The models supported here include Text FrontEnd, end-to-end Acoustic models and Vocoders:
@ -38,50 +24,11 @@ In order to facilitate exploiting the existing TTS models directly and developin
  - [Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
  - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)
 ## Setup
 It's difficult to install some dependent libraries for this repo in Windows system, we recommend that you **DO NOT** use Windows system, please use `Linux`.
 Make sure the library `libsndfile1` is installed, e.g., on Ubuntu.
 ```bash
 sudo apt-get install libsndfile1
 ```
 ### Install PaddlePaddle
 See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
 ### Install Parakeet
 ```bash
 git clone https://github.com/PaddlePaddle/Parakeet
 cd Parakeet
 pip install -e .
 ```
 If some python dependent packages cannot be installed successfully, you can run the following script first.
 (replace `python3.6` with your own python version)
 ```bash
 sudo apt install -y python3.6-dev
 ```
 See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.
 ## Examples
 Entries to the introduction, and the launch of training and synthesis for different example models:
 - [>>> Chinese Text Frontend](./examples/text_frontend)
 - [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
 - [>>> Montreal-Forced-Aligner](./examples/use_mfa)
 - [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
 - [>>> SpeedySpeech](./examples/speedyspeech)
 - [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
 - [>>> GE2E](./examples/ge2e)
 - [>>> WaveFlow](./examples/waveflow)
 - [>>> TransformerTTS](./examples/transformer_tts)
 - [>>> Tacotron2](./examples/tacotron2)
 ## Audio samples
-### TTS models (Acoustic Model + Neural Vocoder)
+
-Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio sampels.
+Check our [website](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) for audio samples.
 ## Released Model
--- a/docs/source/tts/advanced_usage.md
+++ b/docs/source/tts/advanced_usage.md
@ -290,7 +290,7 @@ The following is the basic  `ArgumentParser`:
 1. `--config`  is used to support configuration file parsing, and the configuration file itself handles the unique options of each experiment.
 2. `--train-metadata` is the path to the training data.
 3. `--output-dir` is the directory where the training results are saved. (If there are checkpoints in `checkpoints/` of `--output-dir`, the newest checkpoint is reloaded by default to continue training.)
-4. `--device` and  `--nprocs` determine operation modes，`--device` specifies the type of running device, whether to run on `cpu` or `gpu`. `--nprocs` refers to  the number of training processes. If `nprocs` > 1, it means that multi process parallel training is used. (Note: currently only GPU multi card multi process training is supported.)
+4. `--ngpu` determines the operation mode; it refers to the number of training processes. If `ngpu` > 0, the GPU is used; otherwise the CPU is used.
 Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments.
--- a/docs/topic/ctc/ctc_loss.ipynb
+++ b/docs/topic/ctc/ctc_loss.ipynb
@ -343,6 +343,16 @@
    "    $$"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41637c03",
   "metadata": {},
   "source": [
    "## Source Code\n",
    "本人在 [warp-ctc](https://github.com/zh794390558/warp-ctc) 上加了注释，并调整 index 的索引方式，便于理解代码。\n",
    "对比上面的公式推导和lattice图可以快速理解 ctc 实现。"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "coordinated-music",
@ -372,7 +382,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -386,7 +396,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
--- a/examples/aishell/asr0/local/data.sh
+++ b/examples/aishell/asr0/local/data.sh
@ -9,7 +9,7 @@ dict_dir=data/lang_char
 mkdir -p data
 mkdir -p ${dict_dir}
-TARGET_DIR=${MAIN_ROOT}/examples/dataset
+TARGET_DIR=${MAIN_ROOT}/dataset
 mkdir -p ${TARGET_DIR}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
--- a/examples/aishell/asr0/local/export.sh
+++ b/examples/aishell/asr0/local/export.sh
@ -14,7 +14,7 @@ jit_model_export_path=$3
 model_type=$4
 python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
 --export_path ${jit_model_export_path} \
--- a/examples/aishell/asr0/local/test.sh
+++ b/examples/aishell/asr0/local/test.sh
@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
 fi
 python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
--- a/examples/aishell/asr0/local/test_export.sh
+++ b/examples/aishell/asr0/local/test_export.sh
@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
 fi
 python3 -u ${BIN_DIR}/test_export.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${jit_model_export_path}.rsl \
 --export_path ${jit_model_export_path} \
--- a/examples/aishell/asr0/local/test_hub.sh
+++ b/examples/aishell/asr0/local/test_hub.sh
@ -13,6 +13,17 @@ ckpt_prefix=$2
 model_type=$3
 audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 if [ $? -ne 0 ]; then
   exit 1
 fi
 # Abort unless the audio file argument names an existing regular file.
 # The expansion is quoted so that an empty or space-containing path reaches
 # `-f` as a single operand; unquoted, an empty value collapses the test to
 # `[ ! -f ]`, which is false and would let the script continue.
 if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
 fi
 # download language model
 bash local/download_lm_ch.sh
 if [ $? -ne 0 ]; then
@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then
 fi
 python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
 fi
 python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --model_type ${model_type} \
--- a/examples/aishell/asr0/run.sh
+++ b/examples/aishell/asr0/run.sh
@ -8,6 +8,7 @@ stop_stage=100
 conf_path=conf/deepspeech2.yaml    #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml
 avg_num=1
 model_type=offline    # offline or online
 audio_file=data/demo_01_03.wav
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -15,7 +16,6 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 audio_file="data/tmp.wav"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
--- a/examples/aishell/asr1/READEME.md
+++ b/examples/aishell/asr1/READEME.md
@ -4,7 +4,7 @@ This example contains code used to train a Transformer or [Conformer](http://arx
 ## Overview
-All the scirpts you need are in the ```run.sh```. There are several stages in the ```run.sh```, and each stage has its function.
+All the scripts you need are in ```run.sh```. There are several stages in ```run.sh```, and each stage has its function.
 | Stage | Function                                                     |
 | :---- | :----------------------------------------------------------- |
@ -16,7 +16,7 @@ All the scirpts you need are in the ```run.sh```. There are several stages in th
 | 5     | Infer the single audio file                                  |
-You can choose to run a range of  stages by setting the ```stage``` and ```stop_stage ``` . 
+You can choose to run a range of stages by setting ```stage``` and ```stop_stage ```. 
 For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
@ -33,19 +33,17 @@ bash run.sh --stage 0 --stop_stage 0
-The document below will describe the scripts in the ```run.sh``` in detail.
+The document below will describe the scripts in ```run.sh``` in detail.
 ## The Environment Variables
-The path.sh contains the environment variable. 
+The path.sh contains the environment variables. 
 ```bash
 source path.sh
 ```
-This script needs to be run firstly.  
+This script needs to be run first.
 And another script is also needed:
 ```bash
 source ${MAIN_ROOT}/utils/parse_options.sh
@ -57,7 +55,7 @@ It will support the way of using```--varibale value``` in the shell scripts.
 ## The Local Variables
-Some local variables are set in the ```run.sh```. 
+Some local variables are set in ```run.sh```. 
 ```gpus``` denotes the GPU number you want to use. If you set ```gpus=```, it means you only use CPU. 
 ```stage``` denotes the number of the stage you want to start from in the experiments.
@ -71,7 +69,7 @@ Some local variables are set in the ```run.sh```.
 ```ckpt``` denotes the checkpoint prefix of the model, e.g. "conformer"
-You can set the local variables (except ```ckpt```)  when you use the ```run.sh```
+You can set the local variables (except ```ckpt```) when you use ```run.sh```
 For example, you can set ```gpus``` and ```avg_num``` on the command line:
@ -83,7 +81,7 @@ bash run.sh --gpus 0,1 --avg_num 20
 ## Stage 0: Data Processing
-To use this example, you need to process data firstly and  you can use stage 0 in the ```run.sh``` to do this. The code is shown below:
+To use this example, you need to process the data first; you can use stage 0 in ```run.sh``` to do this. The code is shown below:
 ```bash
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@ -129,7 +127,7 @@ data/
 ## Stage 1: Model Training
-If you want to train the model. you can use stage 1 in the ```run.sh```. The code is shown below. 
+If you want to train the model, you can use stage 1 in ```run.sh```. The code is shown below.
 ```bash
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@ -185,7 +183,7 @@ avg.sh best exp/conformer/checkpoints 20
 ## Stage 3: Model Testing
-The test stage is to evaluate the model performance.. The code of test stage is shown below:
+The test stage is to evaluate the model performance. The code of test stage is shown below:
 ```bash
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
@ -325,11 +323,11 @@ In some situations, you want to use the trained model to do the inference for th
 ```bash
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # test a single .wav file
-     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+     CUDA_VISIBLE_DEVICES=0 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
 fi
 ```
-you can train the model by yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model by the script below:
+You can train the model yourself using ```bash run.sh --stage 0 --stop_stage 3```, or you can download the pretrained model with the script below:
 ```bash
 wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz
@ -339,7 +337,7 @@ tar xzvf transformer.model.tar.gz
 You need to prepare an audio file, please confirm the sample rate of the audio is 16K. Assume the path of the audio file is ```data/test_audio.wav```, you can get the result by running the script below.
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav
+CUDA_VISIBLE_DEVICES= ./local/test_hub.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 data/test_audio.wav
 ```
--- a/examples/aishell/asr1/local/align.sh
+++ b/examples/aishell/asr1/local/align.sh
@ -18,7 +18,7 @@ mkdir -p ${output_dir}
 # align dump in `result_file`
 # .tier, .TextGrid dump in `dir of result_file`
 python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--- a/examples/aishell/asr1/local/data.sh
+++ b/examples/aishell/asr1/local/data.sh
@ -8,7 +8,7 @@ dict_dir=data/lang_char
 mkdir -p data
 mkdir -p ${dict_dir}
-TARGET_DIR=${MAIN_ROOT}/examples/dataset
+TARGET_DIR=${MAIN_ROOT}/dataset
 mkdir -p ${TARGET_DIR}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
--- a/examples/aishell/asr1/local/export.sh
+++ b/examples/aishell/asr1/local/export.sh
@ -13,7 +13,7 @@ ckpt_path_prefix=$2
 jit_model_export_path=$3
 python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
 --export_path ${jit_model_export_path}
--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
@ -34,7 +34,7 @@ for type in attention ctc_greedy_search; do
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
-    --nproc ${ngpu} \
+    --ngpu ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
@ -53,7 +53,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
-    --nproc ${ngpu} \
+    --ngpu ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
--- a/examples/aishell/asr1/local/test_hub.sh
+++ b/examples/aishell/asr1/local/test_hub.sh
@ -12,6 +12,17 @@ config_path=$1
 ckpt_prefix=$2
 audio_file=$3
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 if [ $? -ne 0 ]; then
   exit 1
 fi
 # Abort early when the requested audio file is missing.
 # The expansion is quoted so that an empty/unset audio_file is tested as the
 # empty path (and rejected) instead of collapsing to `[ ! -f ]`, which
 # evaluates to false and would let the script continue without an input file.
 if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
 fi
 chunk_mode=false
 if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
@ -29,7 +40,7 @@ for type in  attention_rescoring; do
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test_hub.py \
-    --nproc ${ngpu} \
+    --ngpu ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
--- a/examples/aishell/asr1/local/tlg.sh
+++ b/examples/aishell/asr1/local/tlg.sh
@ -9,7 +9,7 @@ lmtype=srilm
 source utils/parse_options.sh
-data=${MAIN_ROOT}/examples/dataset/${corpus}
+data=${MAIN_ROOT}/dataset/${corpus}
 lexicon=$data/resource_aishell/lexicon.txt
 text=$data/data_aishell/transcript/aishell_transcript_v0.8.txt
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
@ -29,7 +29,7 @@ mkdir -p exp
 python3 -u ${BIN_DIR}/train.py \
 --seed ${seed} \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --profiler-options "${profiler_options}" \
--- a/examples/aishell/asr1/run.sh
+++ b/examples/aishell/asr1/run.sh
@ -7,6 +7,7 @@ stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
 avg_num=20
 audio_file=data/demo_01_03.wav
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -14,8 +15,6 @@ avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
 echo "checkpoint name ${ckpt}"
 audio_file="data/test_single_audio.wav"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@ -17,7 +17,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
 ```
 ### Get MFA Result and Extract
 We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
-You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
 ## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
--- a/examples/aishell3/tts3/conf/default.yaml
+++ b/examples/aishell3/tts3/conf/default.yaml
@ -45,7 +45,6 @@ model:
    postnet_layers: 5                 # number of layers of postnset
    postnet_filts: 5                  # filter size of conv layers in postnet
    postnet_chans: 256                # number of channels of conv layers in postnet
    use_masking: True                 # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True          # whether to use scaled positional encoding
    encoder_normalize_before: True    # whether to perform layer normalization before the input
    decoder_normalize_before: True    # whether to perform layer normalization before the input
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
@ -45,7 +45,8 @@ We use Montreal Force Aligner 1.0. The label in  aishell3 include pinyin，so th
 We use [lexicon.txt](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/lexicon.txt) as the lexicon.
-You can download the alignment results from here [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download the alignment results from here: [alignment_aishell3.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/alignment_aishell3.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
 ```bash
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--- a/examples/aishell3/vc1/README.md
+++ b/examples/aishell3/vc1/README.md
@ -18,7 +18,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
 ```
 ### Get MFA Result and Extract
 We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
-You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
 ## Pretrained GE2E Model
 We use a pretrained GE2E model to generate a speaker embedding for each sentence.
--- a/examples/aishell3/vc1/conf/default.yaml
+++ b/examples/aishell3/vc1/conf/default.yaml
@ -45,7 +45,6 @@ model:
    postnet_layers: 5                 # number of layers of postnset
    postnet_filts: 5                  # filter size of conv layers in postnet
    postnet_chans: 256                # number of channels of conv layers in postnet
    use_masking: True                 # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True          # whether to use scaled positional encoding
    encoder_normalize_before: True    # whether to perform layer normalization before the input
    decoder_normalize_before: True    # whether to perform layer normalization before the input
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@ -15,7 +15,7 @@ tar zxvf data_aishell3.tgz -C data_aishell3
 ```
 ### Get MFA Result and Extract
 We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
-You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) (use MFA1.x now) of our repo.
+You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x for now) in our repo.
 ## Get Started
 Assume the path to the dataset is `~/datasets/data_aishell3`.
--- a/examples/callcenter/asr1/local/align.sh
+++ b/examples/callcenter/asr1/local/align.sh
@ -23,7 +23,7 @@ mkdir -p ${output_dir}
 # align dump in `result_file`
 # .tier, .TextGrid dump in `dir of result_file`
 python3 -u ${BIN_DIR}/alignment.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${output_dir}/${type}.align \
 --checkpoint_path ${ckpt_prefix} \
--- a/examples/callcenter/asr1/local/export.sh
+++ b/examples/callcenter/asr1/local/export.sh
@ -13,7 +13,7 @@ ckpt_path_prefix=$2
 jit_model_export_path=$3
 python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
 --export_path ${jit_model_export_path}
--- a/examples/callcenter/asr1/local/test.sh
+++ b/examples/callcenter/asr1/local/test.sh
@ -28,7 +28,7 @@ for type in attention ctc_greedy_search; do
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
-    --nproc ${ngpu} \
+    --ngpu ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
@ -47,7 +47,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test.py \
-    --nproc ${ngpu} \
+    --ngpu ${ngpu} \
    --config ${config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
--- a/examples/callcenter/asr1/local/train.sh
+++ b/examples/callcenter/asr1/local/train.sh
@ -22,7 +22,7 @@ if [ ${seed} != 0 ]; then
 fi
 python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --seed ${seed}
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SPEEDYSPEECH.
-You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
+You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
 ## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2.
-You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
+You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
 ## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -210,7 +210,7 @@ Static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](htt
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss 
 :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
 default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
-conformer| 2(gpu) x 76000||||||
+conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
 FastSpeech2 checkpoint contains files listed below.
 ```text
--- a/examples/csmsc/tts3/conf/conformer.yaml
+++ b/examples/csmsc/tts3/conf/conformer.yaml
@ -0,0 +1,109 @@
 # Training configuration for the conformer-based acoustic model of the CSMSC tts3 example
 # (encoder_type / decoder_type below are both set to `conformer`).
 ###########################################################
 #                FEATURE EXTRACTION SETTING               #
 ###########################################################
 fs: 24000          # Sampling rate (sr).
 n_fft: 2048        # FFT size.
 n_shift: 300       # Hop size.
 win_length: 1200   # Window length.
                   # If set to null, it will be the same as fft_size.
 window: "hann"     # Window function.
 # Only used for feats_type != raw
 fmin: 80           # Minimum frequency of Mel basis.
 fmax: 7600         # Maximum frequency of Mel basis.
 n_mels: 80         # The number of mel basis.
 # Only used for the model using pitch features (e.g. FastSpeech2)
 f0min: 80          # Minimum f0 for pitch extraction.
 f0max: 400         # Maximum f0 for pitch extraction.
 ###########################################################
 #                       DATA SETTING                      #
 ###########################################################
 batch_size: 64
 num_workers: 4
 ###########################################################
 #                       MODEL SETTING                     #
 ###########################################################
 model:
    adim: 384         # attention dimension
    aheads: 2         # number of attention heads
    elayers: 4        # number of encoder layers
    eunits: 1536      # number of encoder ff units
    dlayers: 4        # number of decoder layers
    dunits: 1536      # number of decoder ff units
    positionwise_layer_type: conv1d   # type of position-wise layer
    positionwise_conv_kernel_size: 3  # kernel size of position wise conv layer
    duration_predictor_layers: 2      # number of layers of duration predictor
    duration_predictor_chans: 256     # number of channels of duration predictor
    duration_predictor_kernel_size: 3 # filter size of duration predictor
    postnet_layers: 5                 # number of layers of postnet
    postnet_filts: 5                  # filter size of conv layers in postnet
    postnet_chans: 256                # number of channels of conv layers in postnet
    encoder_normalize_before: True    # whether to perform layer normalization before the input
    decoder_normalize_before: True    # whether to perform layer normalization before the input
    reduction_factor: 1               # reduction factor
    encoder_type: conformer           # encoder type
    decoder_type: conformer           # decoder type
    conformer_pos_enc_layer_type: rel_pos        # conformer positional encoding type
    conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
    conformer_activation_type: swish             # conformer activation type
    use_macaron_style_in_conformer: true         # whether to use macaron style in conformer
    use_cnn_in_conformer: true                   # whether to use CNN in conformer
    conformer_enc_kernel_size: 7                 # kernel size in CNN module of conformer-based encoder
    conformer_dec_kernel_size: 31                # kernel size in CNN module of conformer-based decoder
    init_type: xavier_uniform         # initialization type
    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
    pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
    pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
    pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
    pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
    pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
    pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
    stop_gradient_from_pitch_predictor: true   # whether to stop the gradient from pitch predictor to encoder
    energy_predictor_layers: 2                 # number of conv layers in energy predictor
    energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
    energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
    energy_predictor_dropout: 0.5              # dropout rate in energy predictor
    energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
    energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
    stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
 ###########################################################
 #                       UPDATER SETTING                   #
 ###########################################################
 updater:
    use_masking: True                 # whether to apply masking for padded part in loss calculation
 ###########################################################
 #                     OPTIMIZER SETTING                   #
 ###########################################################
 optimizer:
  optim: adam              # optimizer type
  learning_rate: 0.001     # learning rate
 ###########################################################
 #                     TRAINING SETTING                    #
 ###########################################################
 max_epoch: 1000
 num_snapshots: 5
 ###########################################################
 #                       OTHER SETTING                     #
 ###########################################################
 seed: 10086
--- a/examples/csmsc/tts3/conf/default.yaml
+++ b/examples/csmsc/tts3/conf/default.yaml
@ -45,7 +45,6 @@ model:
    postnet_layers: 5                 # number of layers of postnset
    postnet_filts: 5                  # filter size of conv layers in postnet
    postnet_chans: 256                # number of channels of conv layers in postnet
    use_masking: True                 # whether to apply masking for padded part in loss calculation
    use_scaled_pos_enc: True          # whether to use scaled positional encoding
    encoder_normalize_before: True    # whether to perform layer normalization before the input
    decoder_normalize_before: True    # whether to perform layer normalization before the input
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of the audio.
-You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to  [use_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/use_mfa) of our repo.
+You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
 ## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
 ### Get MFA Result and Extract
 We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edges of the audio.
-You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) of our repo.
+You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) in our repo.
 ## Get Started
 Assume the path to the dataset is `~/datasets/BZNSYP`.
--- a/audio/examples/sound_classification/README.md
+++ b/audio/examples/sound_classification/README.md
@ -21,22 +21,17 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型，可供用
 ### 模型训练
-以环境声音分类数据集`ESC50`为示例，运行下面的命令，可在训练集上进行模型的finetune，支持单机的单卡训练和多卡训练。关于如何使用`paddle.distributed.launch`启动多卡训练，请查看[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html)。
+以环境声音分类数据集`ESC50`为示例，运行下面的命令，可在训练集上进行模型的finetune，支持单机的单卡训练和多卡训练。
-单卡训练:
+启动训练:
 ```shell
-$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 ```
-多卡训练:
+`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数：
 ```shell
 $ unset CUDA_VISIBLE_DEVICES
 $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
 ```
-可支持配置的参数：
+- `device`: 指定模型预测时使用的设备。
-
+- `feat_backend`: 选择提取特征的后端，可选`'numpy'`或`'paddle'`，默认为`'numpy'`。
 - `device`: 选用什么设备进行训练，可选cpu或gpu，默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
 - `epochs`: 训练轮次，默认为50。
 - `learning_rate`: Fine-tune的学习率；默认为5e-5。
 - `batch_size`: 批处理大小，请结合显存情况进行调整，若出现显存不足，请适当调低这一参数；默认为16。
@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_
 示例代码中使用的预训练模型为`CNN14`，如果想更换为其他预训练模型，可通过以下方式执行：
 ```python
 from model import SoundClassifier
 from paddleaudio.datasets import ESC50
-from paddleaudio.models.panns import cnn14, cnn10, cnn6
+from paddlespeech.cls.models import SoundClassifier
 from paddlespeech.cls.models import cnn14, cnn10, cnn6
 # CNN14
 backbone = cnn14(pretrained=True, extract_embedding=True)
@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
 ### 模型预测
 ```shell
-python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
 ```
-可支持配置的参数：
+`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数：
- `device`: 选用什么设备进行训练，可选cpu或gpu，默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
+
 - `device`: 指定模型预测时使用的设备。
 - `wav`: 指定预测的音频文件。
 - `feat_backend`: 选择提取特征的后端，可选`'numpy'`或`'paddle'`，默认为`'numpy'`。
 - `top_k`: 预测显示的top k标签的得分，默认为1。
 - `checkpoint`: 模型参数checkpoint文件。
@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06
 模型训练结束后，可以将已保存的动态图参数导出成静态图的模型和参数，然后实施静态图的部署。
 ```shell
-python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3
 ```
-可支持配置的参数：
+`paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数：
 - `checkpoint`: 模型参数checkpoint文件。
 - `output_dir`: 导出静态图模型和参数文件的保存目录。
@ -109,8 +106,13 @@ export
 #### 2. 模型部署和预测
-`deploy/python/predict.py` 脚本使用了`paddle.inference`模块下的api，提供了python端部署的示例：
+`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api，提供了python端部署的示例：
-```sh
+```shell
-python deploy/python/predict.py --model_dir ./export --device gpu
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4
 ```
 `paddlespeech/cls/exps/panns/deploy/predict.py` 脚本中可支持配置的主要参数：
 - `device`: 指定模型预测时使用的设备。
 - `model_dir`: 导出静态图模型和参数文件的保存目录。
 - `wav`: 指定预测的音频文件。
--- a/examples/esc50/cls0/local/export.sh
+++ b/examples/esc50/cls0/local/export.sh
@ -0,0 +1,8 @@
 #!/bin/bash
 # Run export_model.py on a saved checkpoint.
 # Usage: export.sh <ckpt_dir> <output_dir>
 #   ckpt_dir   - directory that contains model.pdparams
 #   output_dir - directory passed to export_model.py as --output_dir
 # BIN_DIR must already be set in the environment (path.sh exports it).
 ckpt_dir=$1
 output_dir=$2
 # Expansions are quoted so that paths containing spaces stay single arguments.
 python3 "${BIN_DIR}/export_model.py" \
 --checkpoint "${ckpt_dir}/model.pdparams" \
 --output_dir "${output_dir}"
--- a/examples/esc50/cls0/local/infer.sh
+++ b/examples/esc50/cls0/local/infer.sh
@ -0,0 +1,11 @@
 #!/bin/bash
 # Run predict.py on a single audio file using a saved checkpoint.
 # Usage: infer.sh <audio_file> <ckpt_dir> <feat_backend>
 #   audio_file   - audio file passed to predict.py as --wav
 #   ckpt_dir     - directory that contains model.pdparams
 #   feat_backend - value forwarded as --feat_backend
 # BIN_DIR must already be set in the environment (path.sh exports it).
 audio_file=$1
 ckpt_dir=$2
 feat_backend=$3
 # Expansions are quoted so that paths containing spaces stay single arguments.
 # --top_k is fixed at 10 by this wrapper.
 python3 "${BIN_DIR}/predict.py" \
 --wav "${audio_file}" \
 --feat_backend "${feat_backend}" \
 --top_k 10 \
 --checkpoint "${ckpt_dir}/model.pdparams"
--- a/examples/esc50/cls0/local/static_model_infer.sh
+++ b/examples/esc50/cls0/local/static_model_infer.sh
@ -0,0 +1,10 @@
 #!/bin/bash
 # Run deploy/predict.py on a single audio file using an exported model directory.
 # Usage: static_model_infer.sh <device> <model_dir> <audio_file>
 #   device     - value forwarded as --device
 #   model_dir  - directory forwarded as --model_dir
 #   audio_file - audio file forwarded as --wav
 # BIN_DIR must already be set in the environment (path.sh exports it).
 device=$1
 model_dir=$2
 audio_file=$3
 # Expansions are quoted so that paths containing spaces stay single arguments.
 python3 "${BIN_DIR}/deploy/predict.py" \
 --device "${device}" \
 --model_dir "${model_dir}" \
 --wav "${audio_file}"
--- a/examples/esc50/cls0/local/train.sh
+++ b/examples/esc50/cls0/local/train.sh
@ -0,0 +1,25 @@
 #!/bin/bash
 # Launch train.py, through paddle.distributed.launch when GPUs are requested.
 # Usage: train.sh <ngpu> <feat_backend>
 #   ngpu         - number of GPUs; any value > 0 selects the distributed launcher
 #   feat_backend - value forwarded as --feat_backend
 # BIN_DIR must already be set in the environment (path.sh exports it);
 # CUDA_VISIBLE_DEVICES is forwarded to the launcher's --gpus option.
 ngpu=$1
 feat_backend=$2
 num_epochs=50
 batch_size=16
 ckpt_dir=./checkpoint
 save_freq=10
 # Options shared by both launch modes, kept in one place so they cannot drift apart.
 train_args=(
    --epochs "${num_epochs}"
    --feat_backend "${feat_backend}"
    --batch_size "${batch_size}"
    --checkpoint_dir "${ckpt_dir}"
    --save_freq "${save_freq}"
 )
 if [ "${ngpu}" -gt 0 ]; then
    python3 -m paddle.distributed.launch --gpus "${CUDA_VISIBLE_DEVICES}" "${BIN_DIR}/train.py" "${train_args[@]}"
 else
    python3 "${BIN_DIR}/train.py" "${train_args[@]}"
 fi
--- a/examples/esc50/cls0/path.sh
+++ b/examples/esc50/cls0/path.sh
@ -0,0 +1,13 @@
 #!/bin/bash
 # Environment setup for this example; meant to be sourced (run.sh does `source path.sh`).
 # Exports MAIN_ROOT (three directories above the current working directory),
 # extends PATH and PYTHONPATH with it, and exports BIN_DIR for the local/*.sh scripts.
 # PWD is quoted so that a working directory containing spaces is resolved as one path.
 export MAIN_ROOT=$(realpath "${PWD}/../../../")
 export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"
 MODEL=panns
 # Directory holding the train/predict/export entry scripts for the selected model.
 export BIN_DIR="${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}"
--- a/examples/esc50/cls0/run.sh
+++ b/examples/esc50/cls0/run.sh
@ -0,0 +1,33 @@
 #!/bin/bash
 # Entry point of the example: dispatches to one of the local/*.sh stage scripts.
 # Usage: ./run.sh <stage>
 #   1: train   2: predict with a checkpoint   3: export   4: predict with the exported model
 # Every stage ends with `exit 0`, so only the first stage whose number is >= <stage> runs.
 set -e
 source path.sh
 # Without a stage argument every `[ ${stage} -le N ]` test below would error out,
 # all stages would be skipped, and the script would still exit 0. Fail loudly instead.
 if [ $# -lt 1 ]; then
    echo "Usage: $0 <stage>  (1: train, 2: infer, 3: export, 4: static model infer)" >&2
    exit 1
 fi
 # Number of comma-separated entries in CUDA_VISIBLE_DEVICES (0 when it is empty or unset).
 ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
 stage=$1
 stop_stage=100
 feat_backend=numpy
 # Left unquoted on purpose so that the leading ~ is expanded.
 audio_file=~/cat.wav
 ckpt_dir=./checkpoint/epoch_50
 output_dir=./export
 infer_device=cpu
 if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    ./local/train.sh "${ngpu}" "${feat_backend}" || exit -1
    exit 0
 fi
 if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    ./local/infer.sh "${audio_file}" "${ckpt_dir}" "${feat_backend}" || exit -1
    exit 0
 fi
 if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    ./local/export.sh "${ckpt_dir}" "${output_dir}" || exit -1
    exit 0
 fi
 if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    ./local/static_model_infer.sh "${infer_device}" "${output_dir}" "${audio_file}" || exit -1
    exit 0
 fi
--- a/examples/librispeech/asr0/local/data.sh
+++ b/examples/librispeech/asr0/local/data.sh
@ -10,7 +10,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh
 mkdir -p data
 mkdir -p ${dict_dir}
-TARGET_DIR=${MAIN_ROOT}/examples/dataset
+TARGET_DIR=${MAIN_ROOT}/dataset
 mkdir -p ${TARGET_DIR}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
--- a/examples/librispeech/asr0/local/export.sh
+++ b/examples/librispeech/asr0/local/export.sh
@ -14,7 +14,7 @@ jit_model_export_path=$3
 model_type=$4
 python3 -u ${BIN_DIR}/export.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --checkpoint_path ${ckpt_path_prefix} \
 --export_path ${jit_model_export_path} \
--- a/examples/librispeech/asr0/local/test.sh
+++ b/examples/librispeech/asr0/local/test.sh
@ -19,7 +19,7 @@ if [ $? -ne 0 ]; then
 fi
 python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
--- a/examples/librispeech/asr0/local/test_hub.sh
+++ b/examples/librispeech/asr0/local/test_hub.sh
@ -13,6 +13,17 @@ ckpt_prefix=$2
 model_type=$3
 audio_file=$4
 mkdir -p data
 wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
 if [ $? -ne 0 ]; then
   exit 1
 fi
 # Abort early when the requested audio file is missing.
 # The expansion is quoted so that an empty/unset audio_file is tested as the
 # empty path (and rejected) instead of collapsing to `[ ! -f ]`, which
 # evaluates to false and would let the script continue without an input file.
 if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
 fi
 # download language model
 bash local/download_lm_en.sh
 if [ $? -ne 0 ]; then
@ -20,7 +31,7 @@ if [ $? -ne 0 ]; then
 fi
 python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --result_file ${ckpt_prefix}.rsl \
 --checkpoint_path ${ckpt_prefix} \
--- a/examples/librispeech/asr0/local/train.sh
+++ b/examples/librispeech/asr0/local/train.sh
@ -21,7 +21,7 @@ if [ ${seed} != 0 ]; then
 fi
 python3 -u ${BIN_DIR}/train.py \
--nproc ${ngpu} \
+--ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --model_type ${model_type} \
--- a/Show More
+++ b/Show More