Merge pull request #1027 from KPatr1ck/audio

[CLS] Add paddlespeech.cls and esc50 example.
Commit 097b2e4c9a by Hui Zhang

audio/.gitignore

@@ -1,7 +0,0 @@
.ipynb_checkpoints/**
*.ipynb
nohup.out
__pycache__/
*.wav
*.m4a
obsolete/**

@@ -1,45 +0,0 @@
repos:
- repo: local
hooks:
- id: yapf
name: yapf
entry: yapf
language: system
args: [-i, --style .style.yapf]
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.8.0
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]
- id: isort
name: isort (pyi)
types: [pyi]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
args:
- --count
- --select=E9,F63,F7,F82
- --show-source
- --statistics
files: \.py$

@@ -1,3 +0,0 @@
[style]
based_on_style = pep8
column_limit = 80

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@@ -1,37 +0,0 @@
# PaddleAudio: The audio library for PaddlePaddle
## Introduction
PaddleAudio is an audio toolkit that speeds up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
## Features
- Spectrogram and related features are compatible with librosa.
- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
- Ready-to-use audio embeddings with a single line of code; sound embedding is available now, with more on the roadmap.
- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
## Install
```
git clone https://github.com/PaddlePaddle/models
cd models/PaddleAudio
pip install .
```
## Quick start
### Audio loading and feature extraction
```
import paddleaudio as pa

# Load an audio file ('test.wav' is an illustrative path) and compute its mel spectrogram.
s, r = pa.load('test.wav')
mel_spect = pa.melspectrogram(s, sr=r)
```
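Since the features are designed to be compatible with librosa, one quick way to sanity-check an installation is to compare the two libraries directly. Below is a minimal sketch, assuming librosa is installed and a local `test.wav` exists; the keyword arguments mirror the unit tests included later in this PR:
```
import librosa
import numpy as np
import paddleaudio as pa

x, r = librosa.load('test.wav', sr=16000)
a = pa.melspectrogram(x, window_size=512, sr=16000, hop_length=320,
                      n_mels=64, fmin=50, to_db=False)
b = librosa.feature.melspectrogram(x, sr=16000, n_fft=512, win_length=512,
                                   hop_length=320, n_mels=64, fmin=50)
# The relative error should be near zero if the two implementations agree.
print(np.sum((a - b)**2) / (np.sum(a**2) + np.sum(b**2)))
```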
### Examples
We provide a set of examples to help you get started with PaddleAudio quickly.
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
- [Environmental sound classification on the ESC-50 dataset](./examples/sound_classification)
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
Please refer to [example directory](./examples) for more details.

@@ -1,128 +0,0 @@
# Audio Tagging
Sound classification is a single-label classification task, but a given audio clip can carry multiple labels. For example, a recording made in a typical indoor office may contain people speaking, keyboard typing, mouse clicks, and other background sounds. For general-purpose sound recognition and sound detection, predicting multiple labels for one audio clip is therefore highly practical.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips sourced from YouTube videos. It has since grown to 2.1 million annotated videos and 5,800 hours of audio, with 527 labeled sound classes.
`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Since their pre-training task is multi-label sound recognition, they can be used for real-time audio tagging.
This example uses a pre-trained `PANNs` model to tag input audio in real time against the Audioset label set, and outputs the top-k classes and their scores at each time step as text.
## Model Introduction
PaddleAudio provides pre-trained CNN14, CNN10 and CNN6 PANNs models for users to choose from:
- CNN14: mainly 12 convolutional layers and 2 fully connected layers, 79.6M parameters, embedding dimension 2048.
- CNN10: mainly 8 convolutional layers and 2 fully connected layers, 4.9M parameters, embedding dimension 512.
- CNN6: mainly 4 convolutional layers and 2 fully connected layers, 4.5M parameters, embedding dimension 512.
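The embeddings above can be extracted directly from the pre-trained models. Below is a minimal sketch, with the caveat that the `extract_embedding=True` behavior (returning the embedding instead of the 527 Audioset class scores) is an assumption based on the flag's name and its usage elsewhere in this example:
```python
import paddle
from paddleaudio.models.panns import cnn14

# Assumption: extract_embedding=True returns the 2048-d CNN14 embedding
# rather than the 527 Audioset class scores.
model = cnn14(pretrained=True, extract_embedding=True)
model.eval()

# Dummy mel-feature batch: (batch_size, 1, num_frames, num_melbins).
feats = paddle.randn([1, 1, 100, 64])
emb = model(feats)
print(emb.shape)  # expected: [1, 2048]
```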
## Quick Start
### Model Inference
```shell
export CUDA_VISIBLE_DEVICES=0
python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
```
Configurable parameters:
- `device`: Device to run prediction on, cpu or gpu; defaults to gpu. When using the GPU, select the card via `CUDA_VISIBLE_DEVICES` as in the command above.
- `wav`: Audio file to run prediction on.
- `sample_duration`: Duration (in seconds) of each audio sample the model predicts on; defaults to 2s.
- `hop_duration`: Interval (in seconds) between two consecutive prediction samples; defaults to 0.3s.
- `output_dir`: Directory in which to save the prediction results; defaults to `./output_dir`.
The example code uses the `CNN14` pre-trained model; to switch to another pre-trained model, do the following:
```python
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
model = cnn14(pretrained=True, extract_embedding=False)
# CNN10
model = cnn10(pretrained=True, extract_embedding=False)
# CNN6
model = cnn6(pretrained=True, extract_embedding=False)
```
Output:
```
[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
```
The scores are saved to an `.npz` file under `output_dir`.
### Generating the Tagging Label Text
```shell
python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
```
Configurable parameters:
- `tagging_file`: File of model prediction results.
- `top_k`: Number of top-scoring labels to keep from the prediction results; defaults to 10.
- `smooth`: Whether to apply posterior smoothing to the prediction results; defaults to True.
- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
- `label_file`: Text file of the Audioset classes corresponding to the model predictions.
- `output_dir`: Directory in which to save the label text; defaults to `./output_dir`.
Output:
```
[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
```
The text results are saved to a `.txt` file under `output_dir`.
## Tagging Label Text
The final text output looks like the sample below.
The top-k results for each time window are separated by blank lines. Within each block, the first line carries the time information: the number is the ratio of the window's start time `t` to the total audio duration `T`. The following k lines are the labels and their scores (a parsing sketch follows the sample output).
```
0.0
Cat: 0.9144676923751831
Animal: 0.8855036497116089
Domestic animals, pets: 0.804577112197876
Meow: 0.7422927021980286
Music: 0.19959309697151184
Inside, small room: 0.12550437450408936
Caterwaul: 0.021584441885352135
Purr: 0.020247288048267365
Speech: 0.018197158351540565
Vehicle: 0.007446660194545984
0.059197544398158296
Cat: 0.9250872135162354
Animal: 0.8957151174545288
Domestic animals, pets: 0.8228275775909424
Meow: 0.7650775909423828
Music: 0.20210561156272888
Inside, small room: 0.12290887534618378
Caterwaul: 0.029371455311775208
Purr: 0.018731823191046715
Speech: 0.017130598425865173
Vehicle: 0.007748497650027275
0.11839508879631659
Cat: 0.9336574673652649
Animal: 0.9111202359199524
Domestic animals, pets: 0.8349071145057678
Meow: 0.7761964797973633
Music: 0.20467285811901093
Inside, small room: 0.10709915310144424
Caterwaul: 0.05370649695396423
Purr: 0.018830426037311554
Speech: 0.017361722886562347
Vehicle: 0.006929398979991674
...
...
```
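As referenced above, here is a minimal sketch for consuming this text format; the file path is the one produced earlier, and the total duration `T` of 10 seconds is illustrative:
```python
# Parse the tagging text: blocks are separated by blank lines; the first
# line of each block is t/T, followed by "label: score" lines.
T = 10.0  # total audio duration in seconds (illustrative)
with open('./output_dir/audioset_tagging_sr_44100.txt') as f:
    blocks = f.read().strip().split('\n\n')
for block in blocks:
    lines = block.splitlines()
    start = float(lines[0]) * T  # convert the ratio back to seconds
    label, score = lines[1].split(': ')  # top-1 label at this window
    print(f'{start:.2f}s {label} ({float(score):.3f})')
```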
The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label prediction performed on the audio in real time.
![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)

@@ -1,527 +0,0 @@
Speech
Male speech, man speaking
Female speech, woman speaking
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Battle cry
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
Baby cry, infant cry
Whimper
Wail, moan
Sigh
Singing
Choir
Yodeling
Chant
Mantra
Male singing
Female singing
Child singing
Synthetic singing
Rapping
Humming
Groan
Grunt
Whistling
Breathing
Wheeze
Snoring
Gasp
Pant
Snort
Cough
Throat clearing
Sneeze
Sniff
Run
Shuffle
Walk, footsteps
Chewing, mastication
Biting
Gargling
Stomach rumble
Burping, eructation
Hiccup
Fart
Hands
Finger snapping
Clapping
Heart sounds, heartbeat
Heart murmur
Cheering
Applause
Chatter
Crowd
Hubbub, speech noise, speech babble
Children playing
Animal
Domestic animals, pets
Dog
Bark
Yip
Howl
Bow-wow
Growling
Whimper (dog)
Cat
Purr
Meow
Hiss
Caterwaul
Livestock, farm animals, working animals
Horse
Clip-clop
Neigh, whinny
Cattle, bovinae
Moo
Cowbell
Pig
Oink
Goat
Bleat
Sheep
Fowl
Chicken, rooster
Cluck
Crowing, cock-a-doodle-doo
Turkey
Gobble
Duck
Quack
Goose
Honk
Wild animals
Roaring cats (lions, tigers)
Roar
Bird
Bird vocalization, bird call, bird song
Chirp, tweet
Squawk
Pigeon, dove
Coo
Crow
Caw
Owl
Hoot
Bird flight, flapping wings
Canidae, dogs, wolves
Rodents, rats, mice
Mouse
Patter
Insect
Cricket
Mosquito
Fly, housefly
Buzz
Bee, wasp, etc.
Frog
Croak
Snake
Rattle
Whale vocalization
Music
Musical instrument
Plucked string instrument
Guitar
Electric guitar
Bass guitar
Acoustic guitar
Steel guitar, slide guitar
Tapping (guitar technique)
Strum
Banjo
Sitar
Mandolin
Zither
Ukulele
Keyboard (musical)
Piano
Electric piano
Organ
Electronic organ
Hammond organ
Synthesizer
Sampler
Harpsichord
Percussion
Drum kit
Drum machine
Drum
Snare drum
Rimshot
Drum roll
Bass drum
Timpani
Tabla
Cymbal
Hi-hat
Wood block
Tambourine
Rattle (instrument)
Maraca
Gong
Tubular bells
Mallet percussion
Marimba, xylophone
Glockenspiel
Vibraphone
Steelpan
Orchestra
Brass instrument
French horn
Trumpet
Trombone
Bowed string instrument
String section
Violin, fiddle
Pizzicato
Cello
Double bass
Wind instrument, woodwind instrument
Flute
Saxophone
Clarinet
Harp
Bell
Church bell
Jingle bell
Bicycle bell
Tuning fork
Chime
Wind chime
Change ringing (campanology)
Harmonica
Accordion
Bagpipes
Didgeridoo
Shofar
Theremin
Singing bowl
Scratching (performance technique)
Pop music
Hip hop music
Beatboxing
Rock music
Heavy metal
Punk rock
Grunge
Progressive rock
Rock and roll
Psychedelic rock
Rhythm and blues
Soul music
Reggae
Country
Swing music
Bluegrass
Funk
Folk music
Middle Eastern music
Jazz
Disco
Classical music
Opera
Electronic music
House music
Techno
Dubstep
Drum and bass
Electronica
Electronic dance music
Ambient music
Trance music
Music of Latin America
Salsa music
Flamenco
Blues
Music for children
New-age music
Vocal music
A capella
Music of Africa
Afrobeat
Christian music
Gospel music
Music of Asia
Carnatic music
Music of Bollywood
Ska
Traditional music
Independent music
Song
Background music
Theme music
Jingle (music)
Soundtrack music
Lullaby
Video game music
Christmas music
Dance music
Wedding music
Happy music
Funny music
Sad music
Tender music
Exciting music
Angry music
Scary music
Wind
Rustling leaves
Wind noise (microphone)
Thunderstorm
Thunder
Water
Rain
Raindrop
Rain on surface
Stream
Waterfall
Ocean
Waves, surf
Steam
Gurgling
Fire
Crackle
Vehicle
Boat, Water vehicle
Sailboat, sailing ship
Rowboat, canoe, kayak
Motorboat, speedboat
Ship
Motor vehicle (road)
Car
Vehicle horn, car horn, honking
Toot
Car alarm
Power windows, electric windows
Skidding
Tire squeal
Car passing by
Race car, auto racing
Truck
Air brake
Air horn, truck horn
Reversing beeps
Ice cream truck, ice cream van
Bus
Emergency vehicle
Police car (siren)
Ambulance (siren)
Fire engine, fire truck (siren)
Motorcycle
Traffic noise, roadway noise
Rail transport
Train
Train whistle
Train horn
Railroad car, train wagon
Train wheels squealing
Subway, metro, underground
Aircraft
Aircraft engine
Jet engine
Propeller, airscrew
Helicopter
Fixed-wing aircraft, airplane
Bicycle
Skateboard
Engine
Light engine (high frequency)
Dental drill, dentist's drill
Lawn mower
Chainsaw
Medium engine (mid frequency)
Heavy engine (low frequency)
Engine knocking
Engine starting
Idling
Accelerating, revving, vroom
Door
Doorbell
Ding-dong
Sliding door
Slam
Knock
Tap
Squeak
Cupboard open or close
Drawer open or close
Dishes, pots, and pans
Cutlery, silverware
Chopping (food)
Frying (food)
Microwave oven
Blender
Water tap, faucet
Sink (filling or washing)
Bathtub (filling or washing)
Hair dryer
Toilet flush
Toothbrush
Electric toothbrush
Vacuum cleaner
Zipper (clothing)
Keys jangling
Coin (dropping)
Scissors
Electric shaver, electric razor
Shuffling cards
Typing
Typewriter
Computer keyboard
Writing
Alarm
Telephone
Telephone bell ringing
Ringtone
Telephone dialing, DTMF
Dial tone
Busy signal
Alarm clock
Siren
Civil defense siren
Buzzer
Smoke detector, smoke alarm
Fire alarm
Foghorn
Whistle
Steam whistle
Mechanisms
Ratchet, pawl
Clock
Tick
Tick-tock
Gears
Pulleys
Sewing machine
Mechanical fan
Air conditioning
Cash register
Printer
Camera
Single-lens reflex camera
Tools
Hammer
Jackhammer
Sawing
Filing (rasp)
Sanding
Power tool
Drill
Explosion
Gunshot, gunfire
Machine gun
Fusillade
Artillery fire
Cap gun
Fireworks
Firecracker
Burst, pop
Eruption
Boom
Wood
Chop
Splinter
Crack
Glass
Chink, clink
Shatter
Liquid
Splash, splatter
Slosh
Squish
Drip
Pour
Trickle, dribble
Gush
Fill (with liquid)
Spray
Pump (liquid)
Stir
Boiling
Sonar
Arrow
Whoosh, swoosh, swish
Thump, thud
Thunk
Electronic tuner
Effects unit
Chorus effect
Basketball bounce
Bang
Slap, smack
Whack, thwack
Smash, crash
Breaking
Bouncing
Whip
Flap
Scratch
Scrape
Rub
Roll
Crushing
Crumpling, crinkling
Tearing
Beep, bleep
Ping
Ding
Clang
Squeal
Creak
Rustle
Whir
Clatter
Sizzle
Clicking
Clickety-clack
Rumble
Plop
Jingle, tinkle
Hum
Zing
Boing
Crunch
Silence
Sine wave
Harmonic
Chirp tone
Sound effect
Pulse
Inside, small room
Inside, large room or hall
Inside, public space
Outside, urban or manmade
Outside, rural or natural
Reverberation
Echo
Noise
Environmental noise
Static
Mains hum
Distortion
Sidetone
Cacophony
White noise
Pink noise
Throbbing
Vibration
Television
Radio
Field recording

@@ -1,111 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
def split(waveform: np.ndarray, win_size: int, hop_size: int):
"""
Split the waveform into windows of `win_size` samples, advancing `hop_size`
samples per step; also return each window's start position as a fraction of
the total waveform length.
"""
assert isinstance(waveform, np.ndarray)
time = []
data = []
for i in range(0, len(waveform), hop_size):
segment = waveform[i:i + win_size]
if len(segment) < win_size:
segment = np.pad(segment, (0, win_size - len(segment)))
data.append(segment)
time.append(i / len(waveform))
return time, data
def batchify(data: List[List[float]],
sample_rate: int,
batch_size: int,
**kwargs):
"""
Extract features from waveforms and create batches.
"""
examples = []
for waveform in data:
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
# Separates the data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield one_batch
one_batch = []
if one_batch:
yield one_batch
def predict(model, data: List[List[float]], sample_rate: int,
batch_size: int=1):
"""
Use pretrained model to make predictions.
"""
batches = batchify(data, sample_rate, batch_size)
results = None
model.eval()
for batch in batches:
# (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
feats = paddle.to_tensor(batch).unsqueeze(1)
audioset_scores = model(feats)
if results is None:
results = audioset_scores.numpy()
else:
results = np.concatenate((results, audioset_scores.numpy()))
return results
if __name__ == '__main__':
paddle.set_device(args.device)
model = cnn14(pretrained=True, extract_embedding=False)
waveform, sr = load_audio(args.wav, sr=None)
time, data = split(waveform,
int(args.sample_duration * sr),
int(args.hop_duration * sr))
results = predict(model, data, sr, batch_size=8)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
np.savez(output_file, time=time, scores=results)
logger.info(f'Saved tagging results to {output_file}')

@@ -1,83 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import Dict
import numpy as np
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--tagging_file', type=str, required=True, help='File of model tagging results (.npz) to parse.')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
def smooth(results: np.ndarray, win_size: int):
"""
Execute posterior smoothing in-place.
"""
for i in range(len(results) - 1, -1, -1):
if i < win_size - 1:
left = 0
else:
left = i + 1 - win_size
results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
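# Worked example (illustrative): with win_size=2, scores [1.0, 3.0, 5.0] are
# smoothed in place, from the end, to [1.0, 2.0, 4.0] -- each row becomes the
# mean of itself and up to win_size-1 preceding rows.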
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
"""
Return top k result.
"""
result = np.asarray(result)
topk_idx = (-result).argsort()[:k]
ret = ''
for idx in topk_idx:
label, score = label_map[idx], result[idx]
ret += f'{label}: {score}\n'
return ret
if __name__ == "__main__":
label_map = {}
with open(args.label_file, 'r') as f:
for i, l in enumerate(f.readlines()):
label_map[i] = l.strip()
results = np.load(args.tagging_file, allow_pickle=True)
times, scores = results['time'], results['scores']
if args.smooth:
logger.info('Posterior smoothing...')
smooth(scores, win_size=args.smooth_size)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(
args.output_dir,
os.path.basename(args.tagging_file).split('.')[0] + '.txt')
with open(output_file, 'w') as f:
for time, score in zip(times, scores):
f.write(f'{time}\n')
f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
logger.info(f'Saved tagging labels to {output_file}')

@@ -1,154 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
smart home, autonomous driving, and industrial production. All recordings were
made in a quiet indoor environment using 3 different devices at the same time: a
high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone
(16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). The
high-fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1.
400 speakers from different accent areas in China were invited to participate
in the recording. Through professional speech annotation and strict quality
inspection, the manual transcription accuracy is above 95%. The corpus is
divided into training, development and testing sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript',
'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO',
('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
if utt_id not in text_info: # Skip utterances that have no transcription
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
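A minimal usage sketch for the class above (the module path `paddleaudio.datasets.aishell` is an assumption from the package layout; the first run downloads and extracts the full corpus):
```python
from paddleaudio.datasets.aishell import AISHELL1

# Builds the (file_path, utt_id, text) sample list, downloading on first use.
train_ds = AISHELL1(subset='train', feat_type='raw')
print(len(train_ds))

# Writes a JSON-lines manifest (utt/feat/feat_shape/text) under DATA_HOME.
train_ds.create_manifest(prefix='manifest')
```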

@@ -1,298 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
"""
TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
12 European cities in 10 different acoustic scenes using 4 different devices.
Additionally, synthetic data for 11 mobile devices was created based on the original
recordings. Of the 12 cities, two are present only in the evaluation set.
Reference:
A multi-device dataset for urban acoustic scene classification
https://arxiv.org/abs/1807.09840
"""
source_url = 'https://zenodo.org/record/3819968/files/'
base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': 'ed38956c4246abb56190c1e9b602b7b8',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '97ab8560056b6816808dedc044dcc023',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'fbf856a3a86fff7520549c899dc94372',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '0dbffe7b6e45564da649378723284062',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
},
{
'url': source_url + base_name + '.audio.9.zip',
'md5': 'a65596a5372eab10c78e08a0de797c9e',
},
{
'url': source_url + base_name + '.audio.10.zip',
'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
},
{
'url': source_url + base_name + '.audio.11.zip',
'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
},
{
'url': source_url + base_name + '.audio.12.zip',
'md5': 'e5f4400c6b9697295fab4cf507155a2f',
},
{
'url': source_url + base_name + '.audio.13.zip',
'md5': '8855ab9f9896422746ab4c5d89d8da2f',
},
{
'url': source_url + base_name + '.audio.14.zip',
'md5': '092ad744452cd3e7de78f988a3d13020',
},
{
'url': source_url + base_name + '.audio.15.zip',
'md5': '4b5eb85f6592aebf846088d9df76b420',
},
{
'url': source_url + base_name + '.audio.16.zip',
'md5': '2e0a89723e58a3836be019e6996ae460',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta = os.path.join(base_name, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename', 'scene_label', 'identifier', 'source_label'))
subset_meta = {
'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-seconds audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
'filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves,
os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
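A usage sketch for the datasets above (the module path `paddleaudio.datasets.dcase` is an assumption; the audio archives total tens of gigabytes):
```python
from paddleaudio.datasets.dcase import UrbanAcousticScenes

# 'train' and 'dev' map to fold1_train.csv and fold1_evaluate.csv respectively.
train_ds = UrbanAcousticScenes(mode='train', feat_type='raw')
dev_ds = UrbanAcousticScenes(mode='dev', feat_type='raw')
print(len(train_ds), len(dev_ds))
```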

@@ -1,199 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', (
'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = [
'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
'dev-other', 'test-clean', 'test-other'
]
def __init__(self,
subset: str='train-clean-100',
feat_type: str='raw',
**kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip,
line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME,
len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(
self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(
self.utt_info(file_path, utt_id, text, spk_id,
spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
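A usage sketch (the module path `paddleaudio.datasets.librispeech` is an assumption); each item is the record tuple built in `_convert_to_record`:
```python
from paddleaudio.datasets.librispeech import LIBRISPEECH

dev_ds = LIBRISPEECH(subset='dev-clean', feat_type='raw')
# Each item: (file_path, utt_id, text, spk_id, spk_gender, feat, duration).
file_path, utt_id, text, spk_id, spk_gender, feat, duration = dev_ds[0]
print(utt_id, spk_id, spk_gender, duration)
```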

@@ -1,136 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['RAVDESS']
class RAVDESS(AudioClassificationDataset):
"""
The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
lexically-matched statements in a neutral North American accent. Speech emotions
include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
Each expression is produced at two levels of emotional intensity (normal, strong),
with an additional neutral expression.
Reference:
The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
A dynamic, multimodal set of facial and vocal expressions in North American English
https://doi.org/10.1371/journal.pone.0196391
"""
archieves = [
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5':
'5411230427d67a21e18aa4d466e6d1b9',
},
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5':
'bc696df654c87fed845eb13823edef8a',
},
]
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion',
'emotion_intensity', 'statement', 'repetition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Set the random seed to shuffle samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold of the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(self.speech_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
for root, _, files in os.walk(self.song_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
) # make sure using the same seed to create train and dev dataset
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion, _, _, _, _ = sample
target = int(emotion) - 1
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels
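A sketch of the fold-based split described in the docstring (the module path `paddleaudio.datasets.ravdess` is an assumption); the same `seed` must be used for both modes so the shuffled file order matches:
```python
from paddleaudio.datasets.ravdess import RAVDESS

# Fold `split` becomes the dev set; the remaining n_folds-1 folds are train.
train_ds = RAVDESS(mode='train', seed=0, n_folds=5, split=1)
dev_ds = RAVDESS(mode='dev', seed=0, n_folds=5, split=1)
print(len(train_ds), len(dev_ds))
```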

@@ -1,41 +0,0 @@
# PaddleAudio Testing Guide
## Testing
First, clone the project:
```
git clone https://github.com/PaddlePaddle/models.git
```
Then install the project in your virtual environment.
```
cd models/PaddleAudio
python setup.py bdist_wheel
pip install -e .[dev]
```
The requirements for testing will be installed along with PaddleAudio.
Now run
```
pytest test
```
If it goes well, you will see outputs like these:
```
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: ./models/PaddleAudio
plugins: hydra-core-1.0.6
collected 16 items
test/unit_test/test_backend.py ........... [ 68%]
test/unit_test/test_features.py ..... [100%]
==================================================== warnings summary ====================================================
.
.
.
-- Docs: https://docs.pytest.org/en/stable/warnings.html
============================================ 16 passed, 11 warnings in 6.76s =============================================
```

@@ -1,113 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio
import pytest
TEST_FILE = './test/data/test_audio.wav'
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / \
(EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / \
(EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load(TEST_FILE, sr=16000)
print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
return x, r
# start testing
x, r = load_audio()
EPS = 1e-8
def test_load():
s, r = paddleaudio.load(TEST_FILE, sr=16000)
assert r == 16000
assert s.dtype == 'float32'
s, r = paddleaudio.load(
TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
assert len(s) / r == 2.0
assert r == 16000
assert s.dtype == 'int16'
def test_depth_convert():
y = paddleaudio.depth_convert(x, 'int16')
assert len(y) == len(x)
assert y.dtype == 'int16'
assert np.max(y) <= 32767
assert np.min(y) >= -32768
assert np.std(y) > EPS
y = paddleaudio.depth_convert(x, 'int8')
assert len(y) == len(x)
assert y.dtype == 'int8'
assert np.max(y) <= 127
assert np.min(y) >= -128
assert np.std(y) > EPS
# test case for resample
rs_test_data = [
(32000, 'kaiser_fast'),
(16000, 'kaiser_fast'),
(8000, 'kaiser_fast'),
(32000, 'kaiser_best'),
(16000, 'kaiser_best'),
(8000, 'kaiser_best'),
(22050, 'kaiser_best'),
(44100, 'kaiser_best'),
]
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
y = paddleaudio.resample(x, 16000, sr, mode=mode)
factor = sr / 16000
err = relative_err(len(y), len(x) * factor)
print('err:', err)
assert err < EPS
def test_normalize():
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
assert np.max(y) < 0.5 + EPS
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
assert np.max(y) <= 2.0 + EPS
y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
print('np.std(y):', np.std(y))
assert np.abs(np.std(y) - 1.0) < EPS
if __name__ == '__main__':
test_load()
test_depth_convert()
test_resample(22050, 'kaiser_fast')
test_normalize()

@ -1,143 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio as pa
import pytest
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load('./test/data/test_audio.wav')
#x,r = librosa.load('../data/test_audio.wav',sr=16000)
return x, r
## start testing
x, r = load_audio()
EPS = 1e-8
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / (
EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / (
EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=False, )
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=True,
ref=1.0,
amin=1e-10,
top_db=None)
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
assert a.shape == b.shape
assert relative_err(a, b, real=False) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
a = librosa.util.frame(x, frame_length=512, hop_length=320)
b = pa.split_frames(x, frame_length=512, hop_length=320)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
kwargs = {
'window_size': 512,
'hop_length': 320,
'n_mels': 64,
'fmin': 50,
'to_db': False
}
a = pa.mfcc(
x,
#sample_rate=16000,
spect=None,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0,
**kwargs)
S = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = librosa.feature.mfcc(
x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
assert relative_err(a, b) < EPS
if __name__ == '__main__':
test_melspectrogram()
test_melspectrogram_db()
test_stft()
test_split_frames()
test_mfcc()

@ -21,22 +21,17 @@ PaddleAudio provides pretrained CNN14, CNN10 and CNN6 models from PANNs for use
### Model Training
Taking the environmental sound classification dataset `ESC50` as an example, run the command below to fine-tune the model on the training set; both single-GPU and multi-GPU training on a single machine are supported. For how to launch multi-GPU training with `paddle.distributed.launch`, see [single-machine multi-GPU training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html).
Taking the environmental sound classification dataset `ESC50` as an example, run the command below to fine-tune the model on the training set; both single-GPU and multi-GPU training on a single machine are supported.
Single-GPU training:
Start training:
```shell
$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
```
Multi-GPU training:
```shell
$ unset CUDA_VISIBLE_DEVICES
$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
```
Configurable parameters in the `paddlespeech/cls/exps/panns/train.py` script:
Configurable parameters:
- `device`: the device to use for training, either cpu or gpu (default: gpu). When training on gpu, the `gpus` argument specifies the GPU card ids.
- `device`: specifies the device used for model prediction.
- `feat_backend`: the backend used to extract audio features, either `'numpy'` or `'paddle'` (default: `'numpy'`).
- `epochs`: the number of training epochs (default: 50).
- `learning_rate`: the learning rate for fine-tuning (default: 5e-5).
- `batch_size`: the batch size; adjust it according to available GPU memory and lower it if you run out of memory (default: 16).
@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_
The pretrained model used in the example code is `CNN14`. To switch to another pretrained model, you can do so as follows:
```python
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14, cnn10, cnn6
from paddlespeech.cls.models import SoundClassifier
from paddlespeech.cls.models import cnn14, cnn10, cnn6
# CNN14
backbone = cnn14(pretrained=True, extract_embedding=True)
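# The other backbones imported above can be swapped in the same way
# (a sketch; pick one backbone per run):
# CNN10
# backbone = cnn10(pretrained=True, extract_embedding=True)
# CNN6
# backbone = cnn6(pretrained=True, extract_embedding=True)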
@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
### Model Prediction
```shell
python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
```
Configurable parameters:
- `device`: the device to use for training, either cpu or gpu (default: gpu). When training on gpu, the `gpus` argument specifies the GPU card ids.
Configurable parameters in the `paddlespeech/cls/exps/panns/predict.py` script:
- `device`: specifies the device used for model prediction.
- `wav`: the audio file to run prediction on.
- `feat_backend`: the backend used to extract audio features, either `'numpy'` or `'paddle'` (default: `'numpy'`).
- `top_k`: show the scores of the top k predicted labels (default: 1).
- `checkpoint`: the model parameter checkpoint file.
@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06
After training finishes, the saved dynamic-graph parameters can be exported as a static-graph model and parameters for static-graph deployment.
```shell
python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3
```
Configurable parameters:
Configurable parameters in the `paddlespeech/cls/exps/panns/export_model.py` script:
- `checkpoint`: the model parameter checkpoint file.
- `output_dir`: the directory to save the exported static-graph model and parameter files.
@ -109,8 +106,13 @@ export
#### 2. Model Deployment and Prediction
The `deploy/python/predict.py` script uses the APIs under the `paddle.inference` module and provides an example of Python-side deployment:
The `paddlespeech/cls/exps/panns/deploy/predict.py` script uses the APIs under the `paddle.inference` module and provides an example of Python-side deployment:
```sh
python deploy/python/predict.py --model_dir ./export --device gpu
```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4
```
The main configurable parameters in the `paddlespeech/cls/exps/panns/deploy/predict.py` script:
- `device`: specifies the device used for model prediction.
- `model_dir`: the directory where the exported static-graph model and parameter files are saved.
- `wav`: the audio file to run prediction on.
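For reference, the deployment script boils down to the standard `paddle.inference` flow. A minimal sketch (the file names under `./export` and the feature shape are assumptions, not taken from this PR):
```python
import numpy as np
from paddle import inference

# Hypothetical exported file names; check the actual contents of ./export.
config = inference.Config("./export/inference.pdmodel",
                          "./export/inference.pdiparams")
config.disable_gpu()  # or config.enable_use_gpu(100, 0) to run on GPU 0

predictor = inference.create_predictor(config)

# Placeholder feature batch of shape [batch, time, n_mels].
feats = np.random.randn(1, 500, 64).astype("float32")
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(feats)

predictor.run()

output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
logits = output_handle.copy_to_cpu()  # [batch, num_classes]
```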

@ -0,0 +1,8 @@
#!/bin/bash
ckpt_dir=$1
output_dir=$2
python3 ${BIN_DIR}/export_model.py \
--checkpoint ${ckpt_dir}/model.pdparams \
--output_dir ${output_dir}

@ -0,0 +1,11 @@
#!/bin/bash
audio_file=$1
ckpt_dir=$2
feat_backend=$3
python3 ${BIN_DIR}/predict.py \
--wav ${audio_file} \
--feat_backend ${feat_backend} \
--top_k 10 \
--checkpoint ${ckpt_dir}/model.pdparams

@ -0,0 +1,10 @@
#!/bin/bash
device=$1
model_dir=$2
audio_file=$3
python3 ${BIN_DIR}/deploy/predict.py \
--device ${device} \
--model_dir ${model_dir} \
--wav ${audio_file}

@ -0,0 +1,25 @@
#!/bin/bash
ngpu=$1
feat_backend=$2
num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10
if [ ${ngpu} -gt 0 ]; then
python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
--epochs ${num_epochs} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
else
python3 ${BIN_DIR}/train.py \
--epochs ${num_epochs} \
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
fi

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=panns
export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}

@ -0,0 +1,33 @@
#!/bin/bash
set -e
source path.sh
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
stage=$1
stop_stage=100
feat_backend=numpy
audio_file=~/cat.wav
ckpt_dir=./checkpoint/epoch_50
output_dir=./export
infer_device=cpu
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
./local/train.sh ${ngpu} ${feat_backend} || exit -1
exit 0
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1
exit 0
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
./local/export.sh ${ckpt_dir} ${output_dir} || exit -1
exit 0
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1
exit 0
fi

@ -11,24 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aishell import AISHELL1
from .dcase import UrbanAcousticScenes
from .dcase import UrbanAudioVisualScenes
from .esc50 import ESC50
from .gtzan import GTZAN
from .librispeech import LIBRISPEECH
from .ravdess import RAVDESS
from .tess import TESS
from .urban_sound import UrbanSound8K
__all__ = [
'AISHELL1',
'LIBRISPEECH',
'ESC50',
'UrbanSound8K',
'GTZAN',
'UrbanAcousticScenes',
'UrbanAudioVisualScenes',
'RAVDESS',
'TESS',
]

@ -13,3 +13,4 @@
# limitations under the License.
from .augment import *
from .core import *
from .spectrum import *

@ -15,8 +15,9 @@ from typing import List
import numpy as np
from numpy import ndarray as array
from paddleaudio.backends import depth_convert
from paddleaudio.utils import ParameterError
from ..backends import depth_convert
from ..utils import ParameterError
__all__ = [
'depth_augment',

@ -21,9 +21,10 @@ import numpy as np
import scipy
from numpy import ndarray as array
from numpy.lib.stride_tricks import as_strided
from paddleaudio.utils import ParameterError
from scipy.signal import get_window
from ..utils import ParameterError
__all__ = [
'stft',
'mfcc',
@ -293,6 +294,7 @@ def stft(x: array,
This function is aligned with librosa.
"""
_check_audio(x)
# By default, use the entire frame
if win_length is None:
win_length = n_fft
@ -397,7 +399,7 @@ def mfcc(x,
This function is NOT strictly aligned with librosa. The following example shows how to get the
same result with librosa:
# paddleaudioe mfcc:
# mfcc:
kwargs = {
'window_size':512,
'hop_length':320,

@ -0,0 +1,461 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from functools import partial
from typing import Optional
from typing import Union
import paddle
import paddle.nn as nn
from .window import get_window
__all__ = [
'Spectrogram',
'MelSpectrogram',
'LogMelSpectrogram',
]
def hz_to_mel(freq: Union[paddle.Tensor, float],
htk: bool=False) -> Union[paddle.Tensor, float]:
"""Convert Hz to Mels.
Parameters:
freq: the input tensor of arbitrary shape, or a single floating point number.
htk: use HTK formula to do the conversion.
The default value is False.
Returns:
The frequencies represented in Mel-scale.
"""
if htk:
if isinstance(freq, paddle.Tensor):
return 2595.0 * paddle.log10(1.0 + freq / 700.0)
else:
return 2595.0 * math.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(freq, paddle.Tensor):
target = min_log_mel + paddle.log(
freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10
mask = (freq > min_log_hz).astype(freq.dtype)
mels = target * mask + mels * (
1 - mask)  # to be replaced by a masked_fill OP in the future
else:
if freq >= min_log_hz:
mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep
return mels
def mel_to_hz(mel: Union[float, paddle.Tensor],
htk: bool=False) -> Union[float, paddle.Tensor]:
"""Convert mel bin numbers to frequencies.
Parameters:
mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number.
htk: use HTK formula to do the conversion.
Returns:
The frequencies represented in hz.
"""
if htk:
return 700.0 * (10.0**(mel / 2595.0) - 1.0)
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = math.log(6.4) / 27.0 # step size for log region
if isinstance(mel, paddle.Tensor):
target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel))
mask = (mel > min_log_mel).astype(mel.dtype)
freqs = target * mask + freqs * (
1 - mask)  # to be replaced by a masked_fill OP in the future
else:
if mel >= min_log_mel:
freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel))
return freqs
def mel_frequencies(n_mels: int=64,
f_min: float=0.0,
f_max: float=11025.0,
htk: bool=False,
dtype: str=paddle.float32):
"""Compute mel frequencies.
Parameters:
n_mels(int): number of Mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zero.
htk(bool): whether to use htk formula.
dtype(str): the datatype of the returned frequencies.
Returns:
The frequencies represented in Mel-scale.
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(f_min, htk=htk)
max_mel = hz_to_mel(f_max, htk=htk)
mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype)
freqs = mel_to_hz(mels, htk=htk)
return freqs
def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32):
"""Compute fourier frequencies.
Parameters:
sr(int): the audio sample rate.
n_fft(float): the number of fft bins.
dtype(str): the datatype of the return frequencies.
Returns:
The frequencies represented in hz.
"""
return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=64,
f_min: float=0.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str=paddle.float32):
"""Compute fbank matrix.
Parameters:
sr(int): the audio sample rate.
n_fft(int): the number of fft bins.
n_mels(int): the number of Mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zero.
htk(bool): whether to use the HTK formula.
norm(str|float): the normalization type, 'slaney' by default. A float value applies p-norm normalization.
dtype(str): the datatype of the returned fbank matrix.
Returns:
The fbank matrix of shape (n_mels, int(1+n_fft//2)).
Shape:
output: (n_mels, int(1+n_fft//2))
"""
if f_max is None:
f_max = float(sr) / 2
# Initialize the weights
weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(
n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f)
ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
#ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = paddle.maximum(
paddle.zeros_like(lower), paddle.minimum(lower, upper))
# Slaney-style mel is scaled to be approx constant energy per channel
if norm == 'slaney':
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm.unsqueeze(1)
elif isinstance(norm, int) or isinstance(norm, float):
weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
return weights
def power_to_db(magnitude: paddle.Tensor,
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> paddle.Tensor:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units.
The function computes the scaling ``10 * log10(x / ref)`` in a numerically
stable way.
Parameters:
magnitude(Tensor): the input magnitude tensor of any shape.
ref_value(float): the reference value. If smaller than 1.0, the db level
of the signal will be pulled up accordingly. Otherwise, the db level
is pushed down.
amin(float): the minimum value of input magnitude, below which the input
magnitude is clipped(to amin).
top_db(float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db).
Returns:
The spectrogram in log-scale.
shape:
input: any shape
output: same as input
"""
if amin <= 0:
raise Exception("amin must be strictly positive")
if ref_value <= 0:
raise Exception("ref_value must be strictly positive")
ones = paddle.ones_like(magnitude)
log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
log_spec -= 10.0 * math.log10(max(ref_value, amin))
if top_db is not None:
if top_db < 0:
raise Exception("top_db must be non-negative")
log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
return log_spec
class Spectrogram(nn.Layer):
def __init__(self,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
dtype: str=paddle.float32):
"""Compute spectrogram of a given signal, typically an audio waveform.
The spectorgram is defined as the complex norm of the short-time
Fourier transformation.
Parameters:
n_fft(int): the number of frequency components of the discrete Fourier transform.
The default value is 2048,
hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
The default value is None.
win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
The default value is None.
window(str): the name of the window function applied to the single before the Fourier transform.
The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann'
center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length]
The default value is True
pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
and 'constant'. The default value is 'reflect'.
dtype(str): the data type of input and window.
Notes:
The Spectrogram transform relies on STFT transform to compute the spectrogram.
By default, the weights are not learnable. To fine-tune the Fourier coefficients,
set stop_gradient=False before training.
For more information, see STFT().
"""
super(Spectrogram, self).__init__()
if win_length is None:
win_length = n_fft
fft_window = get_window(window, win_length, fftbins=True, dtype=dtype)
self._stft = partial(
paddle.signal.stft,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=fft_window,
center=center,
pad_mode=pad_mode)
def forward(self, x):
stft = self._stft(x)
spectrogram = paddle.square(paddle.abs(stft))
return spectrogram
class MelSpectrogram(nn.Layer):
def __init__(self,
sr: int=22050,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
dtype: str=paddle.float32):
"""Compute the melspectrogram of a given signal, typically an audio waveform.
The melspectrogram is also known as filterbank or fbank feature in audio community.
It is computed by multiplying spectrogram with Mel filter bank matrix.
Parameters:
sr(int): the audio sample rate.
The default value is 22050.
n_fft(int): the number of frequency components of the discrete Fourier transform.
The default value is 2048,
hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
The default value is None.
win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
The default value is None.
window(str): the name of the window function applied to the single before the Fourier transform.
The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann'
center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length]
The default value is True
pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
and 'constant'.
The default value is 'reflect'.
n_mels(int): the mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zeros.
htk(bool): whether to use HTK formula in computing fbank matrix.
norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization.
dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
"""
super(MelSpectrogram, self).__init__()
self._spectrogram = Spectrogram(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
center=center,
pad_mode=pad_mode,
dtype=dtype)
self.n_mels = n_mels
self.f_min = f_min
self.f_max = f_max
self.htk = htk
self.norm = norm
if f_max is None:
f_max = sr // 2
fbank_matrix = compute_fbank_matrix(
sr=sr,
n_fft=n_fft,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
dtype=dtype) # float64 for better numerical results
self.register_buffer('fbank_matrix', fbank_matrix)  # keep as a non-trainable buffer
def forward(self, x):
spect_feature = self._spectrogram(x)
mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
return mel_feature
class LogMelSpectrogram(nn.Layer):
def __init__(self,
sr: int=22050,
n_fft: int=512,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
n_mels: int=64,
f_min: float=50.0,
f_max: Optional[float]=None,
htk: bool=False,
norm: Union[str, float]='slaney',
ref_value: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None,
dtype: str=paddle.float32):
"""Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal,
typically an audio waveform.
Parameters:
sr(int): the audio sample rate.
The default value is 22050.
n_fft(int): the number of frequency components of the discrete Fourier transform.
The default value is 2048,
hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
The default value is None.
win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
The default value is None.
window(str): the name of the window function applied to the single before the Fourier transform.
The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
The default value is 'hann'
center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
If False, frame t begins at x[t * hop_length]
The default value is True
pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
and 'constant'.
The default value is 'reflect'.
n_mels(int): the mel bins.
f_min(float): the lower cut-off frequency, below which the filter response is zero.
f_max(float): the upper cut-off frequency, above which the filter response is zeros.
ref_value(float): the reference value. If smaller than 1.0, the db level
htk(bool): whether to use HTK formula in computing fbank matrix.
norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default.
You can specify norm=1.0/2.0 to use customized p-norm normalization.
dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical
accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix.
amin(float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly.
Otherwise, the db level is pushed down.
magnitude is clipped(to amin). For numerical stability, set amin to a larger value,
e.g., 1e-3.
top_db(float): the maximum db value of resulting spectrum, above which the
spectrum is clipped(to top_db).
"""
super(LogMelSpectrogram, self).__init__()
self._melspectrogram = MelSpectrogram(
sr=sr,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=window,
center=center,
pad_mode=pad_mode,
n_mels=n_mels,
f_min=f_min,
f_max=f_max,
htk=htk,
norm=norm,
dtype=dtype)
self.ref_value = ref_value
self.amin = amin
self.top_db = top_db
def forward(self, x):
mel_feature = self._melspectrogram(x)
log_mel_feature = power_to_db(
mel_feature,
ref_value=self.ref_value,
amin=self.amin,
top_db=self.top_db)
return log_mel_feature
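Taken together, the transforms above compose into a one-call feature extractor. A minimal usage sketch (the random waveform is a placeholder; the parameter values mirror the example recipe but are illustrative):
```python
import paddle
from paddleaudio.features import LogMelSpectrogram  # per the package layout in this PR

# A fake batch of one 1-second waveform at 16 kHz; real code would load audio instead.
waveform = paddle.randn([1, 16000])

feature_extractor = LogMelSpectrogram(sr=16000, n_fft=512, hop_length=320, n_mels=64)
log_mel = feature_extractor(waveform)  # shape: [batch, n_mels, num_frames]

# The models in the recipe expect [batch, time, n_mels], hence the transpose in train.py.
log_mel = paddle.transpose(log_mel, [0, 2, 1])
print(log_mel.shape)
```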

@ -0,0 +1,415 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import List
from typing import Tuple
from typing import Union
import paddle
from paddle import Tensor
__all__ = [
'get_window',
]
def _cat(a: List[Tensor], data_type: str) -> Tensor:
l = [paddle.to_tensor(_a, data_type) for _a in a]
return paddle.concat(l)
def _acosh(x: Union[Tensor, float]) -> Tensor:
if isinstance(x, float):
return math.log(x + math.sqrt(x**2 - 1))
return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
def _extend(M: int, sym: bool) -> Tuple[int, bool]:
"""Extend window by 1 sample if needed for DFT-even symmetry"""
if not sym:
return M + 1, True
else:
return M, False
def _len_guards(M: int) -> bool:
"""Handle small or incorrect window lengths"""
if int(M) != M or M < 0:
raise ValueError('Window length M must be a non-negative integer')
return M <= 1
def _truncate(w: Tensor, needed: bool) -> Tensor:
"""Truncate window by 1 sample if needed for DFT-even symmetry"""
if needed:
return w[:-1]
else:
return w
def general_gaussian(M: int, p, sig, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a window with a generalized Gaussian shape.
This function is consistent with scipy.signal.windows.general_gaussian().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
return _truncate(w, needs_trunc)
def general_hamming(M: int, alpha: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generalized Hamming window.
This function is consistent with scipy.signal.windows.general_hamming()
"""
return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype)
def taylor(M: int,
nbar=4,
sll=30,
norm=True,
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Taylor window.
The Taylor window taper function approximates the Dolph-Chebyshev window's
constant sidelobe level for a parameterized number of near-in sidelobes.
Parameters:
M(int): window size.
nbar, sll, norm: the window-specific parameters.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
# Original text uses a negative sidelobe level parameter and then negates
# it in the calculation of B. To keep consistent with other methods we
# assume the sidelobe level parameter to be positive.
B = 10**(sll / 20)
A = _acosh(B) / math.pi
s2 = nbar**2 / (A**2 + (nbar - 0.5)**2)
ma = paddle.arange(1, nbar, dtype=dtype)
Fm = paddle.empty((nbar - 1, ), dtype=dtype)
signs = paddle.empty_like(ma)
signs[::2] = 1
signs[1::2] = -1
m2 = ma * ma
for mi in range(len(ma)):
numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2
))
if mi == 0:
denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:])
elif mi == len(ma) - 1:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi])
else:
denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[
mi] / m2[mi + 1:])
Fm[mi] = numer / denom
def W(n):
return 1 + 2 * paddle.matmul(
Fm.unsqueeze(0),
paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M))
w = W(paddle.arange(0, M, dtype=dtype))
# normalize (Note that this is not described in the original text [1])
if norm:
scale = 1.0 / W((M - 1) / 2)
w *= scale
w = w.squeeze()
return _truncate(w, needs_trunc)
def general_cosine(M: int, a: List[float], sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a generic weighted sum of cosine terms window.
This function is consistent with scipy.signal.windows.general_cosine().
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype)
w = paddle.zeros((M, ), dtype=dtype)
for k in range(len(a)):
w += a[k] * paddle.cos(k * fac)
return _truncate(w, needs_trunc)
def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hamming window.
The Hamming window is a taper formed by using a raised cosine with
non-zero endpoints, optimized to minimize the nearest side lobe.
Parameters:
M(int): window size.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
return general_hamming(M, 0.54, sym, dtype=dtype)
def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Hann window.
The Hann window is a taper formed by using a raised cosine or sine-squared
with ends that touch zero.
Parameters:
M(int): window size.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
return general_hamming(M, 0.5, sym, dtype=dtype)
def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Tukey window.
The Tukey window is also known as a tapered cosine window.
Parameters:
M(int): window size.
alpha(float): the fraction of the window inside the cosine-tapered region.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
if alpha <= 0:
return paddle.ones((M, ), dtype=dtype)
elif alpha >= 1.0:
return hann(M, sym=sym)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype)
width = int(alpha * (M - 1) / 2.0)
n1 = n[0:width + 1]
n2 = n[width + 1:M - width - 1]
n3 = n[M - width - 1:]
w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1))))
w2 = paddle.ones(n2.shape, dtype=dtype)
w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha /
(M - 1))))
w = paddle.concat([w1, w2, w3])
return _truncate(w, needs_trunc)
def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Kaiser window.
The Kaiser window is a taper formed by using a Bessel function.
Parameters:
M(int): window size.
beta(float): the window-specific parameter.
sym(bool): whether to return a symmetric window.
The default value is True.
Returns:
Tensor: the window tensor.
Note:
This window is not implemented yet; calling it raises NotImplementedError.
"""
raise NotImplementedError()
def gaussian(M: int, std: float, sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute a Gaussian window.
The Gaussian window has a Gaussian shape defined by the standard deviation (std).
Parameters:
M(int): window size.
std(float): the window-specific parameter.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
sig2 = 2 * std * std
w = paddle.exp(-n**2 / sig2)
return _truncate(w, needs_trunc)
def exponential(M: int,
center=None,
tau=1.,
sym: bool=True,
dtype: str='float64') -> Tensor:
"""Compute an exponential (or Poisson) window.
Parameters:
M(int): window size.
center(float|None): the center of the window; must be None when sym is True.
If None, it defaults to (M - 1) / 2.
tau(float): the window-specific parameter.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if sym and center is not None:
raise ValueError("If sym==True, center must be None.")
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
if center is None:
center = (M - 1) / 2
n = paddle.arange(0, M, dtype=dtype)
w = paddle.exp(-paddle.abs(n - center) / tau)
return _truncate(w, needs_trunc)
def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a triangular window.
Parameters:
M(int): window size.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype)
if M % 2 == 0:
w = (2 * n - 1.0) / M
w = paddle.concat([w, w[::-1]])
else:
w = 2 * n / (M + 1.0)
w = paddle.concat([w, w[-2::-1]])
return _truncate(w, needs_trunc)
def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Bohman window.
The Bohman window is the autocorrelation of a cosine window.
Parameters:
M(int): window size.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
math.pi * fac)
w = _cat([0, w, 0], dtype)
return _truncate(w, needs_trunc)
def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a Blackman window.
The Blackman window is a taper formed by using the first three terms of
a summation of cosines. It was designed to have close to the minimal
leakage possible. It is close to optimal, only slightly worse than a
Kaiser window.
Parameters:
M(int): window size.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
"""Compute a window with a simple cosine shape.
Parameters:
M(int): window size.
sym(bool): whether to return a symmetric window.
The default value is True.
dtype(str): the datatype of the returned tensor.
Returns:
Tensor: the window tensor
"""
if _len_guards(M):
return paddle.ones((M, ), dtype=dtype)
M, needs_trunc = _extend(M, sym)
w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
return _truncate(w, needs_trunc)
def get_window(window: Union[str, Tuple[str, float]],
win_length: int,
fftbins: bool=True,
dtype: str='float64') -> Tensor:
"""Return a window of a given length and type.
Parameters:
window(str|(str,float)): the type of window to create.
win_length(int): the number of samples in the window.
fftbins(bool): If True, create a "periodic" window. Otherwise,
create a "symmetric" window, for use in filter design.
Returns:
The window represented as a tensor.
"""
sym = not fftbins
args = ()
if isinstance(window, tuple):
winstr = window[0]
if len(window) > 1:
args = window[1:]
elif isinstance(window, str):
if window in ['gaussian', 'exponential']:
raise ValueError("The '" + window + "' window needs one or "
"more parameters -- pass a tuple.")
else:
winstr = window
else:
raise ValueError("%s as window type is not supported." %
str(type(window)))
try:
winfunc = eval(winstr)  # resolve the window function defined in this module
except NameError as e:  # eval raises NameError, not KeyError, for unknown names
raise ValueError("Unknown window type.") from e
params = (win_length, ) + args
kwargs = {'sym': sym}
return winfunc(*params, dtype=dtype, **kwargs)
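A quick sketch of how `get_window` feeds the STFT-based transforms above (values are illustrative; the module path follows this PR's layout):
```python
from paddleaudio.features.window import get_window  # assumed import path

# A periodic (fftbins=True) 512-point Hann window, as used by Spectrogram above.
win = get_window('hann', 512, fftbins=True, dtype='float32')
print(win.shape)  # [512]

# Parameterized windows are passed as (name, parameter) tuples.
gauss = get_window(('gaussian', 7.0), 512)
```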

@ -17,7 +17,6 @@ from typing import List
from paddle.framework import load as load_state_dict
from paddle.utils import download
from pathos.multiprocessing import ProcessPool
from .log import logger
@ -32,27 +31,18 @@ def decompress(file: str):
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]],
path: str,
n_workers: int=0):
def download_and_decompress(archives: List[Dict[str, str]], path: str):
"""
Download archives and decompress them to a specific path.
"""
if not os.path.isdir(path):
os.makedirs(path)
if n_workers <= 0:
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
download.get_path_from_url(archive['url'], path, archive['md5'])
else:
pool = ProcessPool(nodes=n_workers)
pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
[path] * len(archives), [_['md5'] for _ in archives])
pool.close()
pool.join()
download.get_path_from_url(archive['url'], path, archive['md5'])
def load_state_dict_from_url(url: str, path: str, md5: str=None):
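The simplified `download_and_decompress` keeps a small surface: a list of `{'url', 'md5'}` dicts and a target path. A usage sketch (the URL and md5 are placeholders; the import path is assumed from the package layout):
```python
from paddleaudio.utils.download import download_and_decompress  # assumed module path

# Placeholder archive entry; both 'url' and 'md5' keys are required by the assert above.
archives = [{
    'url': 'https://example.com/esc50.tar.gz',            # hypothetical URL
    'md5': '0123456789abcdef0123456789abcdef',            # hypothetical checksum
}]
download_and_decompress(archives, path='./data')
```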

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -16,16 +16,18 @@ import os
import numpy as np
from paddle import inference
from scipy.special import softmax
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
@ -131,10 +133,7 @@ if __name__ == "__main__":
args.use_tensorrt, args.precision, args.cpu_threads,
args.enable_mkldnn)
wavs = [
'~/audio_demo_resource/cat.wav',
'~/audio_demo_resource/dog.wav',
]
wavs = [args.wav]
for i in range(len(wavs)):
wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))

@ -15,9 +15,10 @@ import argparse
import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier
# yapf: disable
parser = argparse.ArgumentParser(__doc__)

@ -16,30 +16,40 @@ import argparse
import numpy as np
import paddle
import paddle.nn.functional as F
from model import SoundClassifier
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import LogMelSpectrogram
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable
def extract_features(file: str, **kwargs):
def extract_features(file: str, feat_backend: str='numpy',
**kwargs) -> paddle.Tensor:
waveform, sr = load_audio(file, sr=None)
feat = melspectrogram(waveform, sr, **kwargs).transpose()
if feat_backend == 'numpy':  # use the function parameter instead of the global args
feat = melspectrogram(waveform, sr, **kwargs).transpose()
feat = np.expand_dims(feat, 0)
feat = paddle.to_tensor(feat)
else:
feature_extractor = LogMelSpectrogram(sr=sr, **kwargs)
feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
feat = paddle.transpose(feat, [0, 2, 1])
return feat
if __name__ == '__main__':
paddle.set_device(args.device)
model = SoundClassifier(
backbone=cnn14(pretrained=False, extract_embedding=True),
@ -47,8 +57,7 @@ if __name__ == '__main__':
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
feat = np.expand_dims(extract_features(args.wav), 0)
feat = paddle.to_tensor(feat)
feat = extract_features(args.wav, args.feat_backend)
logits = model(feat)
probs = F.softmax(logits, axis=1).numpy()

@ -15,16 +15,18 @@ import argparse
import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
from paddleaudio.features import LogMelSpectrogram
from paddleaudio.utils import logger
from paddleaudio.utils import Timer
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
@ -35,7 +37,6 @@ args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
nranks = paddle.distributed.get_world_size()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
@ -48,8 +49,13 @@ if __name__ == "__main__":
learning_rate=args.learning_rate, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
if args.feat_backend == 'numpy':
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
else:
train_ds = ESC50(mode='train')
dev_ds = ESC50(mode='dev')
feature_extractor = LogMelSpectrogram(sr=16000)
train_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
@ -71,7 +77,16 @@ if __name__ == "__main__":
num_corrects = 0
num_samples = 0
for batch_idx, batch in enumerate(train_loader):
feats, labels = batch
if args.feat_backend == 'numpy':
feats, labels = batch
else:
waveforms, labels = batch
feats = feature_extractor(
waveforms
)  # Padding is needed when waveform lengths differ within a batch.
feats = paddle.transpose(feats,
[0, 2, 1]) # To [N, length, n_mels]
logits = model(feats)
loss = criterion(logits, labels)
@ -126,7 +141,13 @@ if __name__ == "__main__":
num_samples = 0
with logger.processing('Evaluation on validation dataset'):
for batch_idx, batch in enumerate(dev_loader):
feats, labels = batch
if args.feat_backend == 'numpy':
feats, labels = batch
else:
waveforms, labels = batch
feats = feature_extractor(waveforms)
feats = paddle.transpose(feats, [0, 2, 1])
logits = model(feats)
preds = paddle.argmax(logits, axis=1)

@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .panns import *

@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .classifier import *
from .panns import *

@ -16,8 +16,8 @@ import os
import paddle.nn as nn
import paddle.nn.functional as F
from ..utils.download import load_state_dict_from_url
from ..utils.env import MODEL_HOME
from paddleaudio.utils.download import load_state_dict_from_url
from paddleaudio.utils.env import MODEL_HOME
__all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6']

@ -16,19 +16,16 @@ import setuptools
# set the version here
version = '0.1.0a'
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="paddleaudio",
version=version,
author="",
author_email="",
description="PaddleAudio, in development",
long_description=long_description,
long_description="",
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
packages=setuptools.find_packages(include=['paddleaudio*']),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
@ -41,8 +38,4 @@ setuptools.setup(
'resampy >= 0.2.2',
'soundfile >= 0.9.0',
'colorlog',
'pathos',
],
extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
} # for dev only, install: pip install -e .[dev]
)
], )