Merge pull request #938 from KPatr1ck/develop

[audio] Merge PaddleAudio into PaddleSpeech.
3 years ago · b3ebf5d4c4
parent 304d71747a ee9972a0d3
commit b3ebf5d4c4
44 changed files with 5219 additions and 0 deletions
--- a/paddleaudio/.gitignore
+++ b/paddleaudio/.gitignore
@ -0,0 +1,7 @@
+.ipynb_checkpoints/**
+*.ipynb
+nohup.out
+__pycache__/
+*.wav
+*.m4a
+obsolete/**
--- a/paddleaudio/.pre-commit-config.yaml
+++ b/paddleaudio/.pre-commit-config.yaml
@ -0,0 +1,45 @@
+repos:
+-   repo: local
+    hooks:
+    -   id: yapf
+        name: yapf
+        entry: yapf
+        language: system
+        args: [-i, --style .style.yapf]
+        files: \.py$
+
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: a11d9314b22d8f8c7556443875b731ef05965464
+    hooks:
+    -   id: check-merge-conflict
+    -   id: check-symlinks
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+    -   id: detect-private-key
+    -   id: check-symlinks
+    -   id: check-added-large-files
+
+-   repo: https://github.com/pycqa/isort
+    rev: 5.8.0
+    hooks:
+    -   id: isort
+        name: isort (python)
+    -   id: isort
+        name: isort (cython)
+        types: [cython]
+    -   id: isort
+        name: isort (pyi)
+        types: [pyi]
+
+-   repo: local
+    hooks:
+    -   id: flake8
+        name: flake8
+        entry: flake8
+        language: system
+        args:
+        -   --count
+        -   --select=E9,F63,F7,F82
+        -   --show-source
+        -   --statistics
+        files: \.py$
--- a/paddleaudio/.style.yapf
+++ b/paddleaudio/.style.yapf
@ -0,0 +1,3 @@
+[style]
+based_on_style = pep8
+column_limit = 80
--- a/paddleaudio/LICENSE
+++ b/paddleaudio/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/paddleaudio/README.md
+++ b/paddleaudio/README.md
@ -0,0 +1,37 @@
+# PaddleAudio:  The audio library for PaddlePaddle
+
+## Introduction
+PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms,state-of-the-art pre-trained models in sound tagging/classification and anomaly sound detection. More models and features are on the roadmap.
+
+
+
+## Features
+- Spectrogram and related features are compatible with librosa.
+- State-of-the-art models in sound tagging on Audioset, sound classification on esc50, and more to come.
+- Ready-to-use audio embedding with a line of code, includes sound embedding and more on the roadmap.
+- Data loading supports for common open source audio in multiple languages including English, Mandarin and so on.
+
+
+## Install
+```
+git clone https://github.com/PaddlePaddle/models
+cd models/PaddleAudio
+pip install .
+
+```
+
+## Quick start
+### Audio loading and feature extraction
+```
+import paddleaudio as pa
+s,r = pa.load(f)
+mel_spect = pa.melspectrogram(s,sr=r)
+```
+
+###  Examples
+We provide a set of examples to help you get started in using PaddleAudio quickly.
+- [PANNs:  acoustic scene and events analysis using pre-trained models](./examples/panns)
+- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
+- [Training a audio-tagging network on Audioset](./examples/audioset_training)
+
+Please refer to [example directory](./examples) for more details.
--- a/paddleaudio/examples/panns/README.md
+++ b/paddleaudio/examples/panns/README.md
@ -0,0 +1,128 @@
+# Audio Tagging
+
+声音分类的任务是单标签的分类任务，但是对于一段音频来说，它可以是多标签的。譬如在一般的室内办公环境进行录音，这段音频里可能包含人们说话的声音、键盘敲打的声音、鼠标点击的声音，还有室内的一些其他背景声音。对于通用的声音识别和声音检测场景而言，对一段音频预测多个标签是具有很强的实用性的。
+
+在IEEE ICASSP 2017 大会上，谷歌开放了一个大规模的音频数据集[Audioset](https://research.google.com/audioset/)。该数据集包含了 632 类的音频类别以及 2,084,320 条人工标记的每段 10 秒长度的声音剪辑片段（来源于YouTube视频）。目前该数据集已经有210万个已标注的视频数据，5800小时的音频数据，经过标记的声音样本的标签类别为527。
+
+`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf))是基于Audioset数据集训练的声音分类/识别的模型。其预训练的任务是多标签的声音识别，因此可用于声音的实时tagging。
+
+本示例采用`PANNs`预训练模型，基于Audioset的标签类别对输入音频实时tagging，并最终以文本形式输出对应时刻的top k类别和对应的得分。
+
+
+## 模型简介
+
+PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型，可供用户选择使用：
+- CNN14: 该模型主要包含12个卷积层和2个全连接层，模型参数的数量为79.6M，embbedding维度是2048。
+- CNN10: 该模型主要包含8个卷积层和2个全连接层，模型参数的数量为4.9M，embbedding维度是512。
+- CNN6: 该模型主要包含4个卷积层和2个全连接层，模型参数的数量为4.5M，embbedding维度是512。
+
+
+## 快速开始
+
+### 模型预测
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
+```
+
+可支持配置的参数：
+
+- `device`: 选用什么设备进行训练，可选cpu或gpu，默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
+- `wav`: 指定预测的音频文件。
+- `sample_duration`: 模型每次预测的音频时间长度，单位为秒，默认为2s。
+- `hop_duration`: 每两个预测音频的时间间隔，单位为秒，默认为0.3s。
+- `output_dir`: 模型预测结果存放的路径，默认为`./output_dir`。
+
+示例代码中使用的预训练模型为`CNN14`，如果想更换为其他预训练模型，可通过以下方式执行：
+```python
+from paddleaudio.models.panns import cnn14, cnn10, cnn6
+
+# CNN14
+model = cnn14(pretrained=True, extract_embedding=False)
+# CNN10
+model = cnn10(pretrained=True, extract_embedding=False)
+# CNN6
+model = cnn6(pretrained=True, extract_embedding=False)
+```
+
+执行结果：
+```
+[2021-04-30 19:15:41,025] [    INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
+```
+
+执行后得分结果保存在`output_dir`的`.npz`文件中。
+
+
+### 生成tagging标签文本
+```shell
+python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
+```
+
+可支持配置的参数：
+
+- `tagging_file`: 模型预测结果文件。
+- `top_k`: 获取预测结果中，得分最高的前top_k个标签，默认为10。
+- `smooth`: 预测结果的后验概率平滑，默认为True，表示应用平滑。
+- `smooth_size`: 平滑计算过程中的样本数量，默认为5。
+- `label_file`: 模型预测结果对应的Audioset类别的文本文件。
+- `output_dir`: 标签文本存放的路径，默认为`./output_dir`。
+
+执行结果：
+```
+[2021-04-30 19:26:58,743] [    INFO] - Posterior smoothing...
+[2021-04-30 19:26:58,746] [    INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
+```
+
+执行后文本结果保存在`output_dir`的`.txt`文件中。
+
+
+## Tagging标签文本
+
+最终输出的文本结果如下所示。  
+样本每个时间范围的top k结果用空行分隔。在每一个结果中，第一行是时间信息，数字表示tagging结果在时间起点信息，比例值代表当前时刻`t`与音频总长度`T`的比值；紧接的k行是对应的标签和得分。
+
+```
+0.0
+Cat: 0.9144676923751831
+Animal: 0.8855036497116089
+Domestic animals, pets: 0.804577112197876
+Meow: 0.7422927021980286
+Music: 0.19959309697151184
+Inside, small room: 0.12550437450408936
+Caterwaul: 0.021584441885352135
+Purr: 0.020247288048267365
+Speech: 0.018197158351540565
+Vehicle: 0.007446660194545984
+
+0.059197544398158296
+Cat: 0.9250872135162354
+Animal: 0.8957151174545288
+Domestic animals, pets: 0.8228275775909424
+Meow: 0.7650775909423828
+Music: 0.20210561156272888
+Inside, small room: 0.12290887534618378
+Caterwaul: 0.029371455311775208
+Purr: 0.018731823191046715
+Speech: 0.017130598425865173
+Vehicle: 0.007748497650027275
+
+0.11839508879631659
+Cat: 0.9336574673652649
+Animal: 0.9111202359199524
+Domestic animals, pets: 0.8349071145057678
+Meow: 0.7761964797973633
+Music: 0.20467285811901093
+Inside, small room: 0.10709915310144424
+Caterwaul: 0.05370649695396423
+Purr: 0.018830426037311554
+Speech: 0.017361722886562347
+Vehicle: 0.006929398979991674
+
+...
+...
+```
+
+以下[Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4)展示了一个将tagging标签输出到视频的例子，可以实时地对音频进行多标签预测。
+
+![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
--- a/paddleaudio/examples/panns/assets/audioset_labels.txt
+++ b/paddleaudio/examples/panns/assets/audioset_labels.txt
@ -0,0 +1,527 @@
+Speech
+Male speech, man speaking
+Female speech, woman speaking
+Child speech, kid speaking
+Conversation
+Narration, monologue
+Babbling
+Speech synthesizer
+Shout
+Bellow
+Whoop
+Yell
+Battle cry
+Children shouting
+Screaming
+Whispering
+Laughter
+Baby laughter
+Giggle
+Snicker
+Belly laugh
+Chuckle, chortle
+Crying, sobbing
+Baby cry, infant cry
+Whimper
+Wail, moan
+Sigh
+Singing
+Choir
+Yodeling
+Chant
+Mantra
+Male singing
+Female singing
+Child singing
+Synthetic singing
+Rapping
+Humming
+Groan
+Grunt
+Whistling
+Breathing
+Wheeze
+Snoring
+Gasp
+Pant
+Snort
+Cough
+Throat clearing
+Sneeze
+Sniff
+Run
+Shuffle
+Walk, footsteps
+Chewing, mastication
+Biting
+Gargling
+Stomach rumble
+Burping, eructation
+Hiccup
+Fart
+Hands
+Finger snapping
+Clapping
+Heart sounds, heartbeat
+Heart murmur
+Cheering
+Applause
+Chatter
+Crowd
+Hubbub, speech noise, speech babble
+Children playing
+Animal
+Domestic animals, pets
+Dog
+Bark
+Yip
+Howl
+Bow-wow
+Growling
+Whimper (dog)
+Cat
+Purr
+Meow
+Hiss
+Caterwaul
+Livestock, farm animals, working animals
+Horse
+Clip-clop
+Neigh, whinny
+Cattle, bovinae
+Moo
+Cowbell
+Pig
+Oink
+Goat
+Bleat
+Sheep
+Fowl
+Chicken, rooster
+Cluck
+Crowing, cock-a-doodle-doo
+Turkey
+Gobble
+Duck
+Quack
+Goose
+Honk
+Wild animals
+Roaring cats (lions, tigers)
+Roar
+Bird
+Bird vocalization, bird call, bird song
+Chirp, tweet
+Squawk
+Pigeon, dove
+Coo
+Crow
+Caw
+Owl
+Hoot
+Bird flight, flapping wings
+Canidae, dogs, wolves
+Rodents, rats, mice
+Mouse
+Patter
+Insect
+Cricket
+Mosquito
+Fly, housefly
+Buzz
+Bee, wasp, etc.
+Frog
+Croak
+Snake
+Rattle
+Whale vocalization
+Music
+Musical instrument
+Plucked string instrument
+Guitar
+Electric guitar
+Bass guitar
+Acoustic guitar
+Steel guitar, slide guitar
+Tapping (guitar technique)
+Strum
+Banjo
+Sitar
+Mandolin
+Zither
+Ukulele
+Keyboard (musical)
+Piano
+Electric piano
+Organ
+Electronic organ
+Hammond organ
+Synthesizer
+Sampler
+Harpsichord
+Percussion
+Drum kit
+Drum machine
+Drum
+Snare drum
+Rimshot
+Drum roll
+Bass drum
+Timpani
+Tabla
+Cymbal
+Hi-hat
+Wood block
+Tambourine
+Rattle (instrument)
+Maraca
+Gong
+Tubular bells
+Mallet percussion
+Marimba, xylophone
+Glockenspiel
+Vibraphone
+Steelpan
+Orchestra
+Brass instrument
+French horn
+Trumpet
+Trombone
+Bowed string instrument
+String section
+Violin, fiddle
+Pizzicato
+Cello
+Double bass
+Wind instrument, woodwind instrument
+Flute
+Saxophone
+Clarinet
+Harp
+Bell
+Church bell
+Jingle bell
+Bicycle bell
+Tuning fork
+Chime
+Wind chime
+Change ringing (campanology)
+Harmonica
+Accordion
+Bagpipes
+Didgeridoo
+Shofar
+Theremin
+Singing bowl
+Scratching (performance technique)
+Pop music
+Hip hop music
+Beatboxing
+Rock music
+Heavy metal
+Punk rock
+Grunge
+Progressive rock
+Rock and roll
+Psychedelic rock
+Rhythm and blues
+Soul music
+Reggae
+Country
+Swing music
+Bluegrass
+Funk
+Folk music
+Middle Eastern music
+Jazz
+Disco
+Classical music
+Opera
+Electronic music
+House music
+Techno
+Dubstep
+Drum and bass
+Electronica
+Electronic dance music
+Ambient music
+Trance music
+Music of Latin America
+Salsa music
+Flamenco
+Blues
+Music for children
+New-age music
+Vocal music
+A capella
+Music of Africa
+Afrobeat
+Christian music
+Gospel music
+Music of Asia
+Carnatic music
+Music of Bollywood
+Ska
+Traditional music
+Independent music
+Song
+Background music
+Theme music
+Jingle (music)
+Soundtrack music
+Lullaby
+Video game music
+Christmas music
+Dance music
+Wedding music
+Happy music
+Funny music
+Sad music
+Tender music
+Exciting music
+Angry music
+Scary music
+Wind
+Rustling leaves
+Wind noise (microphone)
+Thunderstorm
+Thunder
+Water
+Rain
+Raindrop
+Rain on surface
+Stream
+Waterfall
+Ocean
+Waves, surf
+Steam
+Gurgling
+Fire
+Crackle
+Vehicle
+Boat, Water vehicle
+Sailboat, sailing ship
+Rowboat, canoe, kayak
+Motorboat, speedboat
+Ship
+Motor vehicle (road)
+Car
+Vehicle horn, car horn, honking
+Toot
+Car alarm
+Power windows, electric windows
+Skidding
+Tire squeal
+Car passing by
+Race car, auto racing
+Truck
+Air brake
+Air horn, truck horn
+Reversing beeps
+Ice cream truck, ice cream van
+Bus
+Emergency vehicle
+Police car (siren)
+Ambulance (siren)
+Fire engine, fire truck (siren)
+Motorcycle
+Traffic noise, roadway noise
+Rail transport
+Train
+Train whistle
+Train horn
+Railroad car, train wagon
+Train wheels squealing
+Subway, metro, underground
+Aircraft
+Aircraft engine
+Jet engine
+Propeller, airscrew
+Helicopter
+Fixed-wing aircraft, airplane
+Bicycle
+Skateboard
+Engine
+Light engine (high frequency)
+Dental drill, dentist's drill
+Lawn mower
+Chainsaw
+Medium engine (mid frequency)
+Heavy engine (low frequency)
+Engine knocking
+Engine starting
+Idling
+Accelerating, revving, vroom
+Door
+Doorbell
+Ding-dong
+Sliding door
+Slam
+Knock
+Tap
+Squeak
+Cupboard open or close
+Drawer open or close
+Dishes, pots, and pans
+Cutlery, silverware
+Chopping (food)
+Frying (food)
+Microwave oven
+Blender
+Water tap, faucet
+Sink (filling or washing)
+Bathtub (filling or washing)
+Hair dryer
+Toilet flush
+Toothbrush
+Electric toothbrush
+Vacuum cleaner
+Zipper (clothing)
+Keys jangling
+Coin (dropping)
+Scissors
+Electric shaver, electric razor
+Shuffling cards
+Typing
+Typewriter
+Computer keyboard
+Writing
+Alarm
+Telephone
+Telephone bell ringing
+Ringtone
+Telephone dialing, DTMF
+Dial tone
+Busy signal
+Alarm clock
+Siren
+Civil defense siren
+Buzzer
+Smoke detector, smoke alarm
+Fire alarm
+Foghorn
+Whistle
+Steam whistle
+Mechanisms
+Ratchet, pawl
+Clock
+Tick
+Tick-tock
+Gears
+Pulleys
+Sewing machine
+Mechanical fan
+Air conditioning
+Cash register
+Printer
+Camera
+Single-lens reflex camera
+Tools
+Hammer
+Jackhammer
+Sawing
+Filing (rasp)
+Sanding
+Power tool
+Drill
+Explosion
+Gunshot, gunfire
+Machine gun
+Fusillade
+Artillery fire
+Cap gun
+Fireworks
+Firecracker
+Burst, pop
+Eruption
+Boom
+Wood
+Chop
+Splinter
+Crack
+Glass
+Chink, clink
+Shatter
+Liquid
+Splash, splatter
+Slosh
+Squish
+Drip
+Pour
+Trickle, dribble
+Gush
+Fill (with liquid)
+Spray
+Pump (liquid)
+Stir
+Boiling
+Sonar
+Arrow
+Whoosh, swoosh, swish
+Thump, thud
+Thunk
+Electronic tuner
+Effects unit
+Chorus effect
+Basketball bounce
+Bang
+Slap, smack
+Whack, thwack
+Smash, crash
+Breaking
+Bouncing
+Whip
+Flap
+Scratch
+Scrape
+Rub
+Roll
+Crushing
+Crumpling, crinkling
+Tearing
+Beep, bleep
+Ping
+Ding
+Clang
+Squeal
+Creak
+Rustle
+Whir
+Clatter
+Sizzle
+Clicking
+Clickety-clack
+Rumble
+Plop
+Jingle, tinkle
+Hum
+Zing
+Boing
+Crunch
+Silence
+Sine wave
+Harmonic
+Chirp tone
+Sound effect
+Pulse
+Inside, small room
+Inside, large room or hall
+Inside, public space
+Outside, urban or manmade
+Outside, rural or natural
+Reverberation
+Echo
+Noise
+Environmental noise
+Static
+Mains hum
+Distortion
+Sidetone
+Cacophony
+White noise
+Pink noise
+Throbbing
+Vibration
+Television
+Radio
+Field recording
--- a/paddleaudio/examples/panns/audio_tag.py
+++ b/paddleaudio/examples/panns/audio_tag.py
@ -0,0 +1,112 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from typing import List
+
+import numpy as np
+import paddle
+
+from paddleaudio.backends import load as load_audio
+from paddleaudio.features import melspectrogram
+from paddleaudio.models.panns import cnn14
+from paddleaudio.utils import logger
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
+parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
+parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
+parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
+parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
+args = parser.parse_args()
+# yapf: enable
+
+
+def split(waveform: np.ndarray, win_size: int, hop_size: int):
+    """
+    Split into N waveforms.
+    N is decided by win_size and hop_size.
+    """
+    assert isinstance(waveform, np.ndarray)
+    time = []
+    data = []
+    for i in range(0, len(waveform), hop_size):
+        segment = waveform[i:i + win_size]
+        if len(segment) < win_size:
+            segment = np.pad(segment, (0, win_size - len(segment)))
+        data.append(segment)
+        time.append(i / len(waveform))
+    return time, data
+
+
+def batchify(data: List[List[float]],
+             sample_rate: int,
+             batch_size: int,
+             **kwargs):
+    """
+    Extract features from waveforms and create batches.
+    """
+    examples = []
+    for waveform in data:
+        feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
+        examples.append(feats)
+
+    # Seperates data into some batches.
+    one_batch = []
+    for example in examples:
+        one_batch.append(example)
+        if len(one_batch) == batch_size:
+            yield one_batch
+            one_batch = []
+    if one_batch:
+        yield one_batch
+
+
+def predict(model, data: List[List[float]], sample_rate: int,
+            batch_size: int=1):
+    """
+    Use pretrained model to make predictions.
+    """
+    batches = batchify(data, sample_rate, batch_size)
+    results = None
+    model.eval()
+    for batch in batches:
+        feats = paddle.to_tensor(batch).unsqueeze(1)  \
+            # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
+
+        audioset_scores = model(feats)
+        if results is None:
+            results = audioset_scores.numpy()
+        else:
+            results = np.concatenate((results, audioset_scores.numpy()))
+
+    return results
+
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+    model = cnn14(pretrained=True, extract_embedding=False)
+    waveform, sr = load_audio(args.wav, sr=None)
+    time, data = split(waveform,
+                       int(args.sample_duration * sr),
+                       int(args.hop_duration * sr))
+    results = predict(model, data, sr, batch_size=8)
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
+    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
+    np.savez(output_file, time=time, scores=results)
+    logger.info(f'Saved tagging results to {output_file}')
--- a/paddleaudio/examples/panns/parse_result.py
+++ b/paddleaudio/examples/panns/parse_result.py
@ -0,0 +1,84 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import ast
+import os
+from typing import Dict
+
+import numpy as np
+
+from paddleaudio.utils import logger
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--tagging_file', type=str, required=True, help='')
+parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
+parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
+parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
+parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
+parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
+args = parser.parse_args()
+# yapf: enable
+
+
+def smooth(results: np.ndarray, win_size: int):
+    """
+    Execute posterior smoothing in-place.
+    """
+    for i in range(len(results) - 1, -1, -1):
+        if i < win_size - 1:
+            left = 0
+        else:
+            left = i + 1 - win_size
+        results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
+
+
+def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
+    """
+    Return top k result.
+    """
+    result = np.asarray(result)
+    topk_idx = (-result).argsort()[:k]
+
+    ret = ''
+    for idx in topk_idx:
+        label, score = label_map[idx], result[idx]
+        ret += f'{label}: {score}\n'
+    return ret
+
+
+if __name__ == "__main__":
+    label_map = {}
+    with open(args.label_file, 'r') as f:
+        for i, l in enumerate(f.readlines()):
+            label_map[i] = l.strip()
+
+    results = np.load(args.tagging_file, allow_pickle=True)
+    times, scores = results['time'], results['scores']
+
+    if args.smooth:
+        logger.info('Posterior smoothing...')
+        smooth(scores, win_size=args.smooth_size)
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    output_file = os.path.join(
+        args.output_dir,
+        os.path.basename(args.tagging_file).split('.')[0] + '.txt')
+    with open(output_file, 'w') as f:
+        for time, score in zip(times, scores):
+            f.write(f'{time}\n')
+            f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
+
+    logger.info(f'Saved tagging labels to {output_file}')
--- a/paddleaudio/examples/sound_classification/README.md
+++ b/paddleaudio/examples/sound_classification/README.md
@ -0,0 +1,116 @@
+# 声音分类
+
+声音分类和检测是声音算法的一个热门研究方向。  
+
+对于声音分类任务，传统机器学习的一个常用做法是首先人工提取音频的时域和频域的多种特征并做特征选择、组合、变换等，然后基于SVM或决策树进行分类。而端到端的深度学习则通常利用深度网络如RNN，CNN等直接对声间波形(waveform)或时频特征(time-frequency)进行特征学习(representation learning)和分类预测。
+
+在IEEE ICASSP 2017 大会上，谷歌开放了一个大规模的音频数据集[Audioset](https://research.google.com/audioset/)。该数据集包含了 632 类的音频类别以及 2,084,320 条人工标记的每段 10 秒长度的声音剪辑片段（来源于YouTube视频）。目前该数据集已经有210万个已标注的视频数据，5800小时的音频数据，经过标记的声音样本的标签类别为527。
+
+`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf))是基于Audioset数据集训练的声音分类/识别的模型。经过预训练后，模型可以用于提取音频的embbedding。本示例将使用`PANNs`的预训练模型Finetune完成声音分类的任务。
+
+
+## 模型简介
+
+PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型，可供用户选择使用：
+- CNN14: 该模型主要包含12个卷积层和2个全连接层，模型参数的数量为79.6M，embbedding维度是2048。
+- CNN10: 该模型主要包含8个卷积层和2个全连接层，模型参数的数量为4.9M，embbedding维度是512。
+- CNN6: 该模型主要包含4个卷积层和2个全连接层，模型参数的数量为4.5M，embbedding维度是512。
+
+
+## 快速开始
+
+### 模型训练
+
+以环境声音分类数据集`ESC50`为示例，运行下面的命令，可在训练集上进行模型的finetune，支持单机的单卡训练和多卡训练。关于如何使用`paddle.distributed.launch`启动多卡训练，请查看[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html)。
+
+单卡训练:
+```shell
+$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10
+```
+
+多卡训练:
+```shell
+$ unset CUDA_VISIBLE_DEVICES
+$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
+```
+
+可支持配置的参数：
+
+- `device`: 选用什么设备进行训练，可选cpu或gpu，默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
+- `epochs`: 训练轮次，默认为50。
+- `learning_rate`: Fine-tune的学习率；默认为5e-5。
+- `batch_size`: 批处理大小，请结合显存情况进行调整，若出现显存不足，请适当调低这一参数；默认为16。
+- `num_workers`: Dataloader获取数据的子进程数。默认为0，加载数据的流程在主进程执行。
+- `checkpoint_dir`: 模型参数文件和optimizer参数文件的保存目录，默认为`./checkpoint`。
+- `save_freq`: 训练过程中的模型保存频率，默认为10。
+- `log_freq`: 训练过程中的信息打印频率，默认为10。
+
+示例代码中使用的预训练模型为`CNN14`，如果想更换为其他预训练模型，可通过以下方式执行：
+```python
+from model import SoundClassifier
+from paddleaudio.datasets import ESC50
+from paddleaudio.models.panns import cnn14, cnn10, cnn6
+
+# CNN14
+backbone = cnn14(pretrained=True, extract_embedding=True)
+model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
+
+# CNN10
+backbone = cnn10(pretrained=True, extract_embedding=True)
+model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
+
+# CNN6
+backbone = cnn6(pretrained=True, extract_embedding=True)
+model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
+```
+
+### 模型预测
+
+```shell
+python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams
+```
+
+可支持配置的参数：
+- `device`: 选用什么设备进行训练，可选cpu或gpu，默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。
+- `wav`: 指定预测的音频文件。
+- `top_k`: 预测显示的top k标签的得分，默认为1。
+- `checkpoint`: 模型参数checkpoint文件。
+
+输出的预测结果如下：
+```
+[/audio/dog.wav]
+Dog: 0.9999538660049438
+Clock tick: 1.3341237718123011e-05
+Cat: 6.579841738130199e-06
+```
+
+### 模型部署
+
+#### 1. 动转静
+
+模型训练结束后，可以将已保存的动态图参数导出成静态图的模型和参数，然后实施静态图的部署。
+
+```shell
+python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
+```
+
+可支持配置的参数：
+- `checkpoint`: 模型参数checkpoint文件。
+- `output_dir`: 导出静态图模型和参数文件的保存目录。
+
+导出的静态图模型和参数文件如下：
+```sh
+$ tree export
+export
+├── inference.pdiparams
+├── inference.pdiparams.info
+└── inference.pdmodel
+```
+
+#### 2. 模型部署和预测
+
+`deploy/python/predict.py` 脚本使用了`paddle.inference`模块下的api，提供了python端部署的示例：
+
+```sh
+python deploy/python/predict.py --model_dir ./export --device gpu
+```
--- a/paddleaudio/examples/sound_classification/deploy/python/predict.py
+++ b/paddleaudio/examples/sound_classification/deploy/python/predict.py
@ -0,0 +1,147 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import numpy as np
+from paddle import inference
+from scipy.special import softmax
+
+from paddleaudio.backends import load as load_audio
+from paddleaudio.datasets import ESC50
+from paddleaudio.features import melspectrogram
+
+# yapf: disable
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
+parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
+parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
+parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
+parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
+parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
+parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
+parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def extract_features(files: str, **kwargs):
+    waveforms = []
+    srs = []
+    max_length = float('-inf')
+    for file in files:
+        waveform, sr = load_audio(file, sr=None)
+        max_length = max(max_length, len(waveform))
+        waveforms.append(waveform)
+        srs.append(sr)
+
+    feats = []
+    for i in range(len(waveforms)):
+        # padding
+        if len(waveforms[i]) < max_length:
+            pad_width = max_length - len(waveforms[i])
+            waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))
+
+        feat = melspectrogram(waveforms[i], sr, **kwargs).transpose()
+        feats.append(feat)
+
+    return np.stack(feats, axis=0)
+
+
+class Predictor(object):
+    def __init__(self,
+                 model_dir,
+                 device="gpu",
+                 batch_size=1,
+                 use_tensorrt=False,
+                 precision="fp32",
+                 cpu_threads=10,
+                 enable_mkldnn=False):
+        self.batch_size = batch_size
+
+        model_file = os.path.join(model_dir, "inference.pdmodel")
+        params_file = os.path.join(model_dir, "inference.pdiparams")
+
+        assert os.path.isfile(model_file) and os.path.isfile(
+            params_file), 'Please check model and parameter files.'
+
+        config = inference.Config(model_file, params_file)
+        if device == "gpu":
+            # set GPU configs accordingly
+            # such as intialize the gpu memory, enable tensorrt
+            config.enable_use_gpu(100, 0)
+            precision_map = {
+                "fp16": inference.PrecisionType.Half,
+                "fp32": inference.PrecisionType.Float32,
+            }
+            precision_mode = precision_map[precision]
+
+            if use_tensorrt:
+                config.enable_tensorrt_engine(
+                    max_batch_size=batch_size,
+                    min_subgraph_size=30,
+                    precision_mode=precision_mode)
+        elif device == "cpu":
+            # set CPU configs accordingly,
+            # such as enable_mkldnn, set_cpu_math_library_num_threads
+            config.disable_gpu()
+            if enable_mkldnn:
+                # cache 10 different shapes for mkldnn to avoid memory leak
+                config.set_mkldnn_cache_capacity(10)
+                config.enable_mkldnn()
+            config.set_cpu_math_library_num_threads(cpu_threads)
+        elif device == "xpu":
+            # set XPU configs accordingly
+            config.enable_xpu(100)
+
+        config.switch_use_feed_fetch_ops(False)
+        self.predictor = inference.create_predictor(config)
+        self.input_handles = [
+            self.predictor.get_input_handle(name)
+            for name in self.predictor.get_input_names()
+        ]
+        self.output_handle = self.predictor.get_output_handle(
+            self.predictor.get_output_names()[0])
+
+    def predict(self, wavs):
+        feats = extract_features(wavs)
+
+        self.input_handles[0].copy_from_cpu(feats)
+        self.predictor.run()
+        logits = self.output_handle.copy_to_cpu()
+        probs = softmax(logits, axis=1)
+        indices = np.argmax(probs, axis=1)
+
+        return indices
+
+
+if __name__ == "__main__":
+    # Define predictor to do prediction.
+    predictor = Predictor(args.model_dir, args.device, args.batch_size,
+                          args.use_tensorrt, args.precision, args.cpu_threads,
+                          args.enable_mkldnn)
+
+    wavs = [
+        '~/audio_demo_resource/cat.wav',
+        '~/audio_demo_resource/dog.wav',
+    ]
+
+    for i in range(len(wavs)):
+        wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
+        assert os.path.isfile(
+            wavs[i]), f'Please check input wave file: {wavs[i]}'
+
+    results = predictor.predict(wavs)
+    for idx, wav in enumerate(wavs):
+        print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')
--- a/paddleaudio/examples/sound_classification/export_model.py
+++ b/paddleaudio/examples/sound_classification/export_model.py
@ -0,0 +1,45 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+from model import SoundClassifier
+
+from paddleaudio.datasets import ESC50
+from paddleaudio.models.panns import cnn14
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
+parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.")
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    model = SoundClassifier(
+        backbone=cnn14(pretrained=False, extract_embedding=True),
+        num_class=len(ESC50.label_list))
+    model.set_state_dict(paddle.load(args.checkpoint))
+    model.eval()
+
+    model = paddle.jit.to_static(
+        model,
+        input_spec=[
+            paddle.static.InputSpec(
+                shape=[None, None, 64], dtype=paddle.float32)
+        ])
+
+    # Save in static graph model.
+    paddle.jit.save(model, os.path.join(args.output_dir, "inference"))
--- a/paddleaudio/examples/sound_classification/model.py
+++ b/paddleaudio/examples/sound_classification/model.py
@ -0,0 +1,36 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.nn as nn
+
+
+class SoundClassifier(nn.Layer):
+    """
+    Model for sound classification which uses panns pretrained models to extract
+    embeddings from audio files.
+    """
+
+    def __init__(self, backbone, num_class, dropout=0.1):
+        super(SoundClassifier, self).__init__()
+        self.backbone = backbone
+        self.dropout = nn.Dropout(dropout)
+        self.fc = nn.Linear(self.backbone.emb_size, num_class)
+
+    def forward(self, x):
+        # x: (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
+        x = x.unsqueeze(1)
+        x = self.backbone(x)
+        x = self.dropout(x)
+        logits = self.fc(x)
+
+        return logits
--- a/paddleaudio/examples/sound_classification/predict.py
+++ b/paddleaudio/examples/sound_classification/predict.py
@ -0,0 +1,61 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from model import SoundClassifier
+
+from paddleaudio.backends import load as load_audio
+from paddleaudio.datasets import ESC50
+from paddleaudio.features import melspectrogram
+from paddleaudio.models.panns import cnn14
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
+parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
+parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
+parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def extract_features(file: str, **kwargs):
+    waveform, sr = load_audio(file, sr=None)
+    feat = melspectrogram(waveform, sr, **kwargs).transpose()
+    return feat
+
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    model = SoundClassifier(
+        backbone=cnn14(pretrained=False, extract_embedding=True),
+        num_class=len(ESC50.label_list))
+    model.set_state_dict(paddle.load(args.checkpoint))
+    model.eval()
+
+    feat = np.expand_dims(extract_features(args.wav), 0)
+    feat = paddle.to_tensor(feat)
+    logits = model(feat)
+    probs = F.softmax(logits, axis=1).numpy()
+
+    sorted_indices = (-probs[0]).argsort()
+
+    msg = f'[{args.wav}]\n'
+    for idx in sorted_indices[:args.top_k]:
+        msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
+    print(msg)
--- a/paddleaudio/examples/sound_classification/train.py
+++ b/paddleaudio/examples/sound_classification/train.py
@ -0,0 +1,149 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+from model import SoundClassifier
+
+from paddleaudio.datasets import ESC50
+from paddleaudio.models.panns import cnn14
+from paddleaudio.utils import logger
+from paddleaudio.utils import Timer
+
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
+parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
+parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
+parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
+parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
+parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
+parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.")
+parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.")
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == "__main__":
+    paddle.set_device(args.device)
+    nranks = paddle.distributed.get_world_size()
+    if paddle.distributed.get_world_size() > 1:
+        paddle.distributed.init_parallel_env()
+    local_rank = paddle.distributed.get_rank()
+
+    backbone = cnn14(pretrained=True, extract_embedding=True)
+    model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
+    model = paddle.DataParallel(model)
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=args.learning_rate, parameters=model.parameters())
+    criterion = paddle.nn.loss.CrossEntropyLoss()
+
+    train_ds = ESC50(mode='train', feat_type='melspectrogram')
+    dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+
+    train_sampler = paddle.io.DistributedBatchSampler(
+        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
+    train_loader = paddle.io.DataLoader(
+        train_ds,
+        batch_sampler=train_sampler,
+        num_workers=args.num_workers,
+        return_list=True,
+        use_buffer_reader=True, )
+
+    steps_per_epoch = len(train_sampler)
+    timer = Timer(steps_per_epoch * args.epochs)
+    timer.start()
+
+    for epoch in range(1, args.epochs + 1):
+        model.train()
+
+        avg_loss = 0
+        num_corrects = 0
+        num_samples = 0
+        for batch_idx, batch in enumerate(train_loader):
+            feats, labels = batch
+            logits = model(feats)
+
+            loss = criterion(logits, labels)
+            loss.backward()
+            optimizer.step()
+            if isinstance(optimizer._learning_rate,
+                          paddle.optimizer.lr.LRScheduler):
+                optimizer._learning_rate.step()
+            optimizer.clear_grad()
+
+            # Calculate loss
+            avg_loss += loss.numpy()[0]
+
+            # Calculate metrics
+            preds = paddle.argmax(logits, axis=1)
+            num_corrects += (preds == labels).numpy().sum()
+            num_samples += feats.shape[0]
+
+            timer.count()
+
+            if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0:
+                lr = optimizer.get_lr()
+                avg_loss /= args.log_freq
+                avg_acc = num_corrects / num_samples
+
+                print_msg = 'Epoch={}/{}, Step={}/{}'.format(
+                    epoch, args.epochs, batch_idx + 1, steps_per_epoch)
+                print_msg += ' loss={:.4f}'.format(avg_loss)
+                print_msg += ' acc={:.4f}'.format(avg_acc)
+                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
+                    lr, timer.timing, timer.eta)
+                logger.train(print_msg)
+
+                avg_loss = 0
+                num_corrects = 0
+                num_samples = 0
+
+        if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
+            dev_sampler = paddle.io.BatchSampler(
+                dev_ds,
+                batch_size=args.batch_size,
+                shuffle=False,
+                drop_last=False)
+            dev_loader = paddle.io.DataLoader(
+                dev_ds,
+                batch_sampler=dev_sampler,
+                num_workers=args.num_workers,
+                return_list=True, )
+
+            model.eval()
+            num_corrects = 0
+            num_samples = 0
+            with logger.processing('Evaluation on validation dataset'):
+                for batch_idx, batch in enumerate(dev_loader):
+                    feats, labels = batch
+                    logits = model(feats)
+
+                    preds = paddle.argmax(logits, axis=1)
+                    num_corrects += (preds == labels).numpy().sum()
+                    num_samples += feats.shape[0]
+
+            print_msg = '[Evaluation result]'
+            print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
+
+            logger.eval(print_msg)
+
+            # Save model
+            save_dir = os.path.join(args.checkpoint_dir,
+                                    'epoch_{}'.format(epoch))
+            logger.info('Saving model checkpoint to {}'.format(save_dir))
+            paddle.save(model.state_dict(),
+                        os.path.join(save_dir, 'model.pdparams'))
+            paddle.save(optimizer.state_dict(),
+                        os.path.join(save_dir, 'model.pdopt'))
--- a/paddleaudio/paddleaudio/init.py
+++ b/paddleaudio/paddleaudio/init.py
@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .backends import *
+from .features import *
--- a/paddleaudio/paddleaudio/backends/init.py
+++ b/paddleaudio/paddleaudio/backends/init.py
@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .audio import *
--- a/paddleaudio/paddleaudio/backends/audio.py
+++ b/paddleaudio/paddleaudio/backends/audio.py
@ -0,0 +1,303 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Optional
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import resampy
+import soundfile as sf
+from numpy import ndarray as array
+from scipy.io import wavfile
+
+from ..utils import ParameterError
+
+__all__ = [
+    'resample',
+    'to_mono',
+    'depth_convert',
+    'normalize',
+    'save_wav',
+    'load',
+]
+NORMALMIZE_TYPES = ['linear', 'gaussian']
+MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
+RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
+EPS = 1e-8
+
+
+def resample(y: array, src_sr: int, target_sr: int,
+             mode: str='kaiser_fast') -> array:
+    """ Audio resampling
+
+     This function is the same as using resampy.resample().
+
+     Notes:
+        The default mode is kaiser_fast.  For better audio quality, use mode = 'kaiser_fast'
+
+     """
+
+    if mode == 'kaiser_best':
+        warnings.warn(
+            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
+        we recommend the mode kaiser_fast in large scale audio trainning')
+
+    if not isinstance(y, np.ndarray):
+        raise ParameterError(
+            'Only support numpy array, but received y in {type(y)}')
+
+    if mode not in RESAMPLE_MODES:
+        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
+
+    return resampy.resample(y, src_sr, target_sr, filter=mode)
+
+
+def to_mono(y: array, merge_type: str='average') -> array:
+    """ convert sterior audio to mono
+    """
+    if merge_type not in MERGE_TYPES:
+        raise ParameterError(
+            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
+        )
+    if y.ndim > 2:
+        raise ParameterError(
+            f'Unsupported audio array,  y.ndim > 2, the shape is {y.shape}')
+    if y.ndim == 1:  # nothing to merge
+        return y
+
+    if merge_type == 'ch0':
+        return y[0]
+    if merge_type == 'ch1':
+        return y[1]
+    if merge_type == 'random':
+        return y[np.random.randint(0, 2)]
+
+    # need to do averaging according to dtype
+
+    if y.dtype == 'float32':
+        y_out = (y[0] + y[1]) * 0.5
+    elif y.dtype == 'int16':
+        y_out = y.astype('int32')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+
+    elif y.dtype == 'int8':
+        y_out = y.astype('int16')
+        y_out = (y_out[0] + y_out[1]) // 2
+        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
+                        np.iinfo(y.dtype).max).astype(y.dtype)
+    else:
+        raise ParameterError(f'Unsupported dtype: {y.dtype}')
+    return y_out
+
+
+def _safe_cast(y: array, dtype: Union[type, str]) -> array:
+    """ data type casting in a safe way, i.e., prevent overflow or underflow
+
+    This function is used internally.
+    """
+    return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
+
+
+def depth_convert(y: array, dtype: Union[type, str],
+                  dithering: bool=True) -> array:
+    """Convert audio array to target dtype safely
+
+    This function convert audio waveform to a target dtype, with addition steps of
+    preventing overflow/underflow and preserving audio range.
+
+    """
+
+    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
+    if y.dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype not in SUPPORT_DTYPE:
+        raise ParameterError(
+            'Unsupported audio dtype, '
+            f'target dtype  is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
+
+    if dtype == y.dtype:
+        return y
+
+    if dtype == 'float64' and y.dtype == 'float32':
+        return _safe_cast(y, dtype)
+    if dtype == 'float32' and y.dtype == 'float64':
+        return _safe_cast(y, dtype)
+
+    if dtype == 'int16' or dtype == 'int8':
+        if y.dtype in ['float64', 'float32']:
+            factor = np.iinfo(dtype).max
+            y = np.clip(y * factor, np.iinfo(dtype).min,
+                        np.iinfo(dtype).max).astype(dtype)
+            y = y.astype(dtype)
+        else:
+            if dtype == 'int16' and y.dtype == 'int8':
+                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
+                y = y.astype('float32') * factor
+                y = y.astype('int16')
+
+            else:  # dtype == 'int8' and y.dtype=='int16':
+                y = y.astype('int32') * np.iinfo('int8').max / \
+                    np.iinfo('int16').max
+                y = y.astype('int8')
+
+    if dtype in ['float32', 'float64']:
+        org_dtype = y.dtype
+        y = y.astype(dtype) / np.iinfo(org_dtype).max
+    return y
+
+
+def sound_file_load(file: str,
+                    offset: Optional[float]=None,
+                    dtype: str='int16',
+                    duration: Optional[int]=None) -> Tuple[array, int]:
+    """Load audio using soundfile library
+
+    This function load audio file using libsndfile.
+
+    Reference:
+        http://www.mega-nerd.com/libsndfile/#Features
+
+    """
+    with sf.SoundFile(file) as sf_desc:
+        sr_native = sf_desc.samplerate
+        if offset:
+            sf_desc.seek(int(offset * sr_native))
+        if duration is not None:
+            frame_duration = int(duration * sr_native)
+        else:
+            frame_duration = -1
+        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
+
+    return y, sf_desc.samplerate
+
+
+def audio_file_load():
+    """Load audio using audiofile library
+
+    This function load audio file using audiofile.
+
+    Reference:
+        https://audiofile.68k.org/
+
+    """
+    raise NotImplementedError()
+
+
+def sox_file_load():
+    """Load audio using sox library
+
+    This function load audio file using sox.
+
+    Reference:
+        http://sox.sourceforge.net/
+    """
+    raise NotImplementedError()
+
+
+def normalize(y: array, norm_type: str='linear',
+              mul_factor: float=1.0) -> array:
+    """ normalize an input audio with additional multiplier.
+
+    """
+
+    if norm_type == 'linear':
+        amax = np.max(np.abs(y))
+        factor = 1.0 / (amax + EPS)
+        y = y * factor * mul_factor
+    elif norm_type == 'gaussian':
+        amean = np.mean(y)
+        astd = np.std(y)
+        astd = max(astd, EPS)
+        y = mul_factor * (y - amean) / astd
+    else:
+        raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
+
+    return y
+
+
+def save_wav(y: array, sr: int, file: str) -> None:
+    """Save audio file to disk.
+    This function saves audio to disk using scipy.io.wavfile, with additional step
+    to convert input waveform to int16 unless it already is int16
+
+    Notes:
+        It only support raw wav format.
+
+    """
+    if not file.endswith('.wav'):
+        raise ParameterError(
+            f'only .wav file supported, but dst file name is: {file}')
+
+    if sr <= 0:
+        raise ParameterError(
+            f'Sample rate should be larger than 0, recieved sr = {sr}')
+
+    if y.dtype not in ['int16', 'int8']:
+        warnings.warn(
+            f'input data type is {y.dtype}, will convert data to int16 format before saving'
+        )
+        y_out = depth_convert(y, 'int16')
+    else:
+        y_out = y
+
+    wavfile.write(file, sr, y_out)
+
+
+def load(
+        file: str,
+        sr: Optional[int]=None,
+        mono: bool=True,
+        merge_type: str='average',  # ch0,ch1,random,average
+        normal: bool=True,
+        norm_type: str='linear',
+        norm_mul_factor: float=1.0,
+        offset: float=0.0,
+        duration: Optional[int]=None,
+        dtype: str='float32',
+        resample_mode: str='kaiser_fast') -> Tuple[array, int]:
+    """Load audio file from disk.
+    This function loads audio from disk using using audio beackend.
+
+    Parameters:
+
+    Notes:
+
+    """
+
+    y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
+
+    if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
+        raise ParameterError(f'audio file {file} looks empty')
+
+    if mono:
+        y = to_mono(y, merge_type)
+
+    if sr is not None and sr != r:
+        y = resample(y, r, sr, mode=resample_mode)
+        r = sr
+
+    if normal:
+        y = normalize(y, norm_type, norm_mul_factor)
+    elif dtype in ['int8', 'int16']:
+        # still need to do normalization, before depth convertion
+        y = normalize(y, 'linear', 1.0)
+
+    y = depth_convert(y, dtype)
+    return y, r
--- a/paddleaudio/paddleaudio/datasets/init.py
+++ b/paddleaudio/paddleaudio/datasets/init.py
@ -0,0 +1,34 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aishell import AISHELL1
+from .dcase import UrbanAcousticScenes
+from .dcase import UrbanAudioVisualScenes
+from .esc50 import ESC50
+from .gtzan import GTZAN
+from .librispeech import LIBRISPEECH
+from .ravdess import RAVDESS
+from .tess import TESS
+from .urban_sound import UrbanSound8K
+
+__all__ = [
+    'AISHELL1',
+    'LIBRISPEECH',
+    'ESC50',
+    'UrbanSound8K',
+    'GTZAN',
+    'UrbanAcousticScenes',
+    'UrbanAudioVisualScenes',
+    'RAVDESS',
+    'TESS',
+]
--- a/paddleaudio/paddleaudio/datasets/aishell.py
+++ b/paddleaudio/paddleaudio/datasets/aishell.py
@ -0,0 +1,154 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import codecs
+import collections
+import json
+import os
+from typing import Dict
+
+from paddle.io import Dataset
+from tqdm import tqdm
+
+from ..backends import load as load_audio
+from ..utils.download import decompress
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from ..utils.log import logger
+from .dataset import feat_funcs
+
+__all__ = ['AISHELL1']
+
+
+class AISHELL1(Dataset):
+    """
+    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
+    It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including
+    smart home, autonomous driving, and industrial production. The whole recording was
+    put in quiet indoor environment, using 3 different devices at the same time: high
+    fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit),
+    iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled
+    to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas
+    in China were invited to participate in the recording. The manual transcription
+    accuracy rate is above 95%, through professional speech annotation and strict
+    quality inspection. The corpus is divided into training, development and testing
+    sets.
+
+    Reference:
+        AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
+        https://arxiv.org/abs/1709.05522
+    """
+
+    archieves = [
+        {
+            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
+            'md5': '2f494334227864a8a8fec932999db9d8',
+        },
+    ]
+    text_meta = os.path.join('data_aishell', 'transcript',
+                             'aishell_transcript_v0.8.txt')
+    utt_info = collections.namedtuple('META_INFO',
+                                      ('file_path', 'utt_id', 'text'))
+    audio_path = os.path.join('data_aishell', 'wav')
+    manifest_path = os.path.join('data_aishell', 'manifest')
+    subset = ['train', 'dev', 'test']
+
+    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
+        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
+            self.subset, subset)
+        self.subset = subset
+        self.feat_type = feat_type
+        self.feat_config = kwargs
+        self._data = self._get_data()
+        super(AISHELL1, self).__init__()
+
+    def _get_text_info(self) -> Dict[str, str]:
+        ret = {}
+        with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                utt_id, text = map(str.strip, line.split(' ',
+                                                         1))  # utt_id, text
+                ret.update({utt_id: ''.join(text.split())})
+        return ret
+
+    def _get_data(self):
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+            # Extract *wav from *.tar.gz.
+            for root, _, files in os.walk(
+                    os.path.join(DATA_HOME, self.audio_path)):
+                for file in files:
+                    if file.endswith('.tar.gz'):
+                        decompress(os.path.join(root, file))
+                        os.remove(os.path.join(root, file))
+
+        text_info = self._get_text_info()
+
+        data = []
+        for root, _, files in os.walk(
+                os.path.join(DATA_HOME, self.audio_path, self.subset)):
+            for file in files:
+                if file.endswith('.wav'):
+                    utt_id = os.path.splitext(file)[0]
+                    if utt_id not in text_info:  # There are some utt_id that without label
+                        continue
+                    text = text_info[utt_id]
+                    file_path = os.path.join(root, file)
+                    data.append(self.utt_info(file_path, utt_id, text))
+
+        return data
+
+    def _convert_to_record(self, idx: int):
+        sample = self._data[idx]
+
+        record = {}
+        # To show all fields in a namedtuple: `type(sample)._fields`
+        for field in type(sample)._fields:
+            record[field] = getattr(sample, field)
+
+        waveform, sr = load_audio(
+            sample[0])  # The first element of sample is file path
+        feat_func = feat_funcs[self.feat_type]
+        feat = feat_func(
+            waveform, sample_rate=sr,
+            **self.feat_config) if feat_func else waveform
+        record.update({'feat': feat, 'duration': len(waveform) / sr})
+        return record
+
+    def create_manifest(self, prefix='manifest'):
+        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
+            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
+
+        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
+                                     f'{prefix}.{self.subset}')
+        with codecs.open(manifest_file, 'w', 'utf-8') as f:
+            for idx in tqdm(range(len(self))):
+                record = self._convert_to_record(idx)
+                record_line = json.dumps(
+                    {
+                        'utt': record['utt_id'],
+                        'feat': record['file_path'],
+                        'feat_shape': (record['duration'], ),
+                        'text': record['text']
+                    },
+                    ensure_ascii=False)
+                f.write(record_line + '\n')
+        logger.info(f'Manifest file {manifest_file} created.')
+
+    def __getitem__(self, idx):
+        record = self._convert_to_record(idx)
+        return tuple(record.values())
+
+    def __len__(self):
+        return len(self._data)
--- a/paddleaudio/paddleaudio/datasets/dataset.py
+++ b/paddleaudio/paddleaudio/datasets/dataset.py
@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+
+from ..backends import load as load_audio
+from ..features import melspectrogram
+from ..features import mfcc
+
+feat_funcs = {
+    'raw': None,
+    'melspectrogram': melspectrogram,
+    'mfcc': mfcc,
+}
+
+
+class AudioClassificationDataset(paddle.io.Dataset):
+    """
+    Base class of audio classification dataset.
+    """
+
+    def __init__(self,
+                 files: List[str],
+                 labels: List[int],
+                 feat_type: str='raw',
+                 **kwargs):
+        """
+        Ags:
+            files (:obj:`List[str]`): A list of absolute path of audio files.
+            labels (:obj:`List[int]`): Labels of audio files.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        super(AudioClassificationDataset, self).__init__()
+
+        if feat_type not in feat_funcs.keys():
+            raise RuntimeError(
+                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
+            )
+
+        self.files = files
+        self.labels = labels
+
+        self.feat_type = feat_type
+        self.feat_config = kwargs  # Pass keyword arguments to customize feature config
+
+    def _get_data(self, input_file: str):
+        raise NotImplementedError
+
+    def _convert_to_record(self, idx):
+        file, label = self.files[idx], self.labels[idx]
+
+        waveform, sample_rate = load_audio(file)
+        feat_func = feat_funcs[self.feat_type]
+
+        record = {}
+        record['feat'] = feat_func(
+            waveform, sample_rate,
+            **self.feat_config) if feat_func else waveform
+        record['label'] = label
+        return record
+
+    def __getitem__(self, idx):
+        record = self._convert_to_record(idx)
+        return np.array(record['feat']).transpose(), np.array(
+            record['label'], dtype=np.int64)
+
+    def __len__(self):
+        return len(self.files)
--- a/paddleaudio/paddleaudio/datasets/dcase.py
+++ b/paddleaudio/paddleaudio/datasets/dcase.py
@ -0,0 +1,298 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
+
+
+class UrbanAcousticScenes(AudioClassificationDataset):
+    """
+    TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
+    12 European cities in 10 different acoustic scenes using 4 different devices.
+    Additionally, synthetic data for 11 mobile devices was created based on the original
+    recordings. Of the 12 cities, two are present only in the evaluation set.
+
+    Reference:
+        A multi-device dataset for urban acoustic scene classification
+        https://arxiv.org/abs/1807.09840
+    """
+
+    source_url = 'https://zenodo.org/record/3819968/files/'
+    base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
+    archieves = [
+        {
+            'url': source_url + base_name + '.meta.zip',
+            'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
+        },
+        {
+            'url': source_url + base_name + '.audio.1.zip',
+            'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
+        },
+        {
+            'url': source_url + base_name + '.audio.2.zip',
+            'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
+        },
+        {
+            'url': source_url + base_name + '.audio.3.zip',
+            'md5': 'ed38956c4246abb56190c1e9b602b7b8',
+        },
+        {
+            'url': source_url + base_name + '.audio.4.zip',
+            'md5': '97ab8560056b6816808dedc044dcc023',
+        },
+        {
+            'url': source_url + base_name + '.audio.5.zip',
+            'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
+        },
+        {
+            'url': source_url + base_name + '.audio.6.zip',
+            'md5': 'fbf856a3a86fff7520549c899dc94372',
+        },
+        {
+            'url': source_url + base_name + '.audio.7.zip',
+            'md5': '0dbffe7b6e45564da649378723284062',
+        },
+        {
+            'url': source_url + base_name + '.audio.8.zip',
+            'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
+        },
+        {
+            'url': source_url + base_name + '.audio.9.zip',
+            'md5': 'a65596a5372eab10c78e08a0de797c9e',
+        },
+        {
+            'url': source_url + base_name + '.audio.10.zip',
+            'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
+        },
+        {
+            'url': source_url + base_name + '.audio.11.zip',
+            'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
+        },
+        {
+            'url': source_url + base_name + '.audio.12.zip',
+            'md5': 'e5f4400c6b9697295fab4cf507155a2f',
+        },
+        {
+            'url': source_url + base_name + '.audio.13.zip',
+            'md5': '8855ab9f9896422746ab4c5d89d8da2f',
+        },
+        {
+            'url': source_url + base_name + '.audio.14.zip',
+            'md5': '092ad744452cd3e7de78f988a3d13020',
+        },
+        {
+            'url': source_url + base_name + '.audio.15.zip',
+            'md5': '4b5eb85f6592aebf846088d9df76b420',
+        },
+        {
+            'url': source_url + base_name + '.audio.16.zip',
+            'md5': '2e0a89723e58a3836be019e6996ae460',
+        },
+    ]
+    label_list = [
+        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
+        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
+    ]
+
+    meta = os.path.join(base_name, 'meta.csv')
+    meta_info = collections.namedtuple('META_INFO', (
+        'filename', 'scene_label', 'identifier', 'source_label'))
+    subset_meta = {
+        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
+        'dev':
+        os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
+        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
+    }
+    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
+                                              ('filename', 'scene_label'))
+    audio_path = os.path.join(base_name, 'audio')
+
+    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        files, labels = self._get_data(mode)
+        super(UrbanAcousticScenes, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self, subset: str=None,
+                       skip_header: bool=True) -> List[collections.namedtuple]:
+        if subset is None:
+            meta_file = self.meta
+            meta_info = self.meta_info
+        else:
+            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
+            meta_file = self.subset_meta[subset]
+            meta_info = self.subset_meta_info
+
+        ret = []
+        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
+            lines = rf.readlines()[1:] if skip_header else rf.readlines()
+            for line in lines:
+                ret.append(meta_info(*line.strip().split('\t')))
+        return ret
+
+    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info(subset=mode, skip_header=True)
+
+        files = []
+        labels = []
+        for sample in meta_info:
+            filename, label = sample[:2]
+            filename = os.path.basename(filename)
+            target = self.label_list.index(label)
+
+            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
+            labels.append(int(target))
+
+        return files, labels
+
+
+class UrbanAudioVisualScenes(AudioClassificationDataset):
+    """
+    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
+    and video recordings from 12 European cities in 10 different scenes.
+    This dataset consists of 10-seconds audio and video segments from 10
+    acoustic scenes. The total amount of audio in the development set is 34 hours.
+
+    Reference:
+        A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
+        https://arxiv.org/abs/2011.00030
+    """
+
+    source_url = 'https://zenodo.org/record/4477542/files/'
+    base_name = 'TAU-urban-audio-visual-scenes-2021-development'
+
+    archieves = [
+        {
+            'url': source_url + base_name + '.meta.zip',
+            'md5': '76e3d7ed5291b118372e06379cb2b490',
+        },
+        {
+            'url': source_url + base_name + '.audio.1.zip',
+            'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
+        },
+        {
+            'url': source_url + base_name + '.audio.2.zip',
+            'md5': '7fd6bb63127f5785874a55aba4e77aa5',
+        },
+        {
+            'url': source_url + base_name + '.audio.3.zip',
+            'md5': '61396bede29d7c8c89729a01a6f6b2e2',
+        },
+        {
+            'url': source_url + base_name + '.audio.4.zip',
+            'md5': '6ddac89717fcf9c92c451868eed77fe1',
+        },
+        {
+            'url': source_url + base_name + '.audio.5.zip',
+            'md5': 'af4820756cdf1a7d4bd6037dc034d384',
+        },
+        {
+            'url': source_url + base_name + '.audio.6.zip',
+            'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
+        },
+        {
+            'url': source_url + base_name + '.audio.7.zip',
+            'md5': '2be39a76aeed704d5929d020a2909efd',
+        },
+        {
+            'url': source_url + base_name + '.audio.8.zip',
+            'md5': '972d8afe0874720fc2f28086e7cb22a9',
+        },
+    ]
+    label_list = [
+        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
+        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
+    ]
+
+    meta_base_path = os.path.join(base_name, base_name + '.meta')
+    meta = os.path.join(meta_base_path, 'meta.csv')
+    meta_info = collections.namedtuple('META_INFO', (
+        'filename_audio', 'filename_video', 'scene_label', 'identifier'))
+    subset_meta = {
+        'train':
+        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
+        'dev':
+        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
+        'test':
+        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
+    }
+    subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
+        'filename_audio', 'filename_video', 'scene_label'))
+    audio_path = os.path.join(base_name, 'audio')
+
+    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        files, labels = self._get_data(mode)
+        super(UrbanAudioVisualScenes, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self, subset: str=None,
+                       skip_header: bool=True) -> List[collections.namedtuple]:
+        if subset is None:
+            meta_file = self.meta
+            meta_info = self.meta_info
+        else:
+            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
+            meta_file = self.subset_meta[subset]
+            meta_info = self.subset_meta_info
+
+        ret = []
+        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
+            lines = rf.readlines()[1:] if skip_header else rf.readlines()
+            for line in lines:
+                ret.append(meta_info(*line.strip().split('\t')))
+        return ret
+
+    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves,
+                                    os.path.join(DATA_HOME, self.base_name))
+
+        meta_info = self._get_meta_info(subset=mode, skip_header=True)
+
+        files = []
+        labels = []
+        for sample in meta_info:
+            filename, _, label = sample[:3]
+            filename = os.path.basename(filename)
+            target = self.label_list.index(label)
+
+            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
+            labels.append(int(target))
+
+        return files, labels
--- a/paddleaudio/paddleaudio/datasets/esc50.py
+++ b/paddleaudio/paddleaudio/datasets/esc50.py
@ -0,0 +1,152 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['ESC50']
+
+
+class ESC50(AudioClassificationDataset):
+    """
+    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
+    suitable for benchmarking methods of environmental sound classification. The dataset
+    consists of 5-second-long recordings organized into 50 semantical classes (with
+    40 examples per class)
+
+    Reference:
+        ESC: Dataset for Environmental Sound Classification
+        http://dx.doi.org/10.1145/2733373.2806390
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
+            'md5': '7771e4b9d86d0945acce719c7a59305a',
+        },
+    ]
+    label_list = [
+        # Animals
+        'Dog',
+        'Rooster',
+        'Pig',
+        'Cow',
+        'Frog',
+        'Cat',
+        'Hen',
+        'Insects (flying)',
+        'Sheep',
+        'Crow',
+        # Natural soundscapes & water sounds
+        'Rain',
+        'Sea waves',
+        'Crackling fire',
+        'Crickets',
+        'Chirping birds',
+        'Water drops',
+        'Wind',
+        'Pouring water',
+        'Toilet flush',
+        'Thunderstorm',
+        # Human, non-speech sounds
+        'Crying baby',
+        'Sneezing',
+        'Clapping',
+        'Breathing',
+        'Coughing',
+        'Footsteps',
+        'Laughing',
+        'Brushing teeth',
+        'Snoring',
+        'Drinking, sipping',
+        # Interior/domestic sounds
+        'Door knock',
+        'Mouse click',
+        'Keyboard typing',
+        'Door, wood creaks',
+        'Can opening',
+        'Washing machine',
+        'Vacuum cleaner',
+        'Clock alarm',
+        'Clock tick',
+        'Glass breaking',
+        # Exterior/urban noises
+        'Helicopter',
+        'Chainsaw',
+        'Siren',
+        'Car horn',
+        'Engine',
+        'Train',
+        'Church bells',
+        'Airplane',
+        'Fireworks',
+        'Hand saw',
+    ]
+    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
+    meta_info = collections.namedtuple(
+        'META_INFO',
+        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
+    audio_path = os.path.join('ESC-50-master', 'audio')
+
+    def __init__(self,
+                 mode: str='train',
+                 split: int=1,
+                 feat_type: str='raw',
+                 **kwargs):
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specify the fold of dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        files, labels = self._get_data(mode, split)
+        super(ESC50, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self) -> List[collections.namedtuple]:
+        ret = []
+        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                ret.append(self.meta_info(*line.strip().split(',')))
+        return ret
+
+    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info()
+
+        files = []
+        labels = []
+        for sample in meta_info:
+            filename, fold, target, _, _, _, _ = sample
+            if mode == 'train' and int(fold) != split:
+                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
+                labels.append(int(target))
+
+            if mode != 'train' and int(fold) == split:
+                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
+                labels.append(int(target))
+
+        return files, labels
--- a/paddleaudio/paddleaudio/datasets/gtzan.py
+++ b/paddleaudio/paddleaudio/datasets/gtzan.py
@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+import random
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['GTZAN']
+
+
+class GTZAN(AudioClassificationDataset):
+    """
+    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
+    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
+    in machine listening research for music genre recognition (MGR).
+
+    Reference:
+        Musical genre classification of audio signals
+        https://ieeexplore.ieee.org/document/1021072/
+    """
+
+    archieves = [
+        {
+            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
+            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
+        },
+    ]
+    label_list = [
+        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
+        'pop', 'reggae', 'rock'
+    ]
+    meta = os.path.join('genres', 'input.mf')
+    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
+    audio_path = 'genres'
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            seed (:obj:`int`, `optional`, defaults to 0):
+                Set the random seed to shuffle samples.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specify the fold of dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
+        files, labels = self._get_data(mode, seed, n_folds, split)
+        super(GTZAN, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self) -> List[collections.namedtuple]:
+        ret = []
+        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
+            for line in rf.readlines():
+                ret.append(self.meta_info(*line.strip().split('\t')))
+        return ret
+
+    def _get_data(self, mode, seed, n_folds,
+                  split) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info()
+        random.seed(seed)  # shuffle samples to split data
+        random.shuffle(
+            meta_info
+        )  # make sure using the same seed to create train and dev dataset
+
+        files = []
+        labels = []
+        n_samples_per_fold = len(meta_info) // n_folds
+        for idx, sample in enumerate(meta_info):
+            file_path, label = sample
+            filename = os.path.basename(file_path)
+            target = self.label_list.index(label)
+            fold = idx // n_samples_per_fold + 1
+
+            if mode == 'train' and int(fold) != split:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, label, filename))
+                labels.append(target)
+
+            if mode != 'train' and int(fold) == split:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, label, filename))
+                labels.append(target)
+
+        return files, labels
--- a/paddleaudio/paddleaudio/datasets/librispeech.py
+++ b/paddleaudio/paddleaudio/datasets/librispeech.py
@ -0,0 +1,199 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import codecs
+import collections
+import json
+import os
+from typing import Dict
+
+from paddle.io import Dataset
+from tqdm import tqdm
+
+from ..backends import load as load_audio
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from ..utils.log import logger
+from .dataset import feat_funcs
+
+__all__ = ['LIBRISPEECH']
+
+
+class LIBRISPEECH(Dataset):
+    """
+    LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
+    prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
+    derived from read audiobooks from the LibriVox project, and has been carefully
+    segmented and aligned.
+
+    Reference:
+        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
+        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
+        https://arxiv.org/abs/1709.05522
+    """
+
+    source_url = 'http://www.openslr.org/resources/12/'
+    archieves = [
+        {
+            'url': source_url + 'train-clean-100.tar.gz',
+            'md5': '2a93770f6d5c6c964bc36631d331a522',
+        },
+        {
+            'url': source_url + 'train-clean-360.tar.gz',
+            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
+        },
+        {
+            'url': source_url + 'train-other-500.tar.gz',
+            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
+        },
+        {
+            'url': source_url + 'dev-clean.tar.gz',
+            'md5': '42e2234ba48799c1f50f24a7926300a1',
+        },
+        {
+            'url': source_url + 'dev-other.tar.gz',
+            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
+        },
+        {
+            'url': source_url + 'test-clean.tar.gz',
+            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
+        },
+        {
+            'url': source_url + 'test-other.tar.gz',
+            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
+        },
+    ]
+    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
+    utt_info = collections.namedtuple('META_INFO', (
+        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
+    audio_path = 'LibriSpeech'
+    manifest_path = os.path.join('LibriSpeech', 'manifest')
+    subset = [
+        'train-clean-100', 'train-clean-360', 'train-clean-500', 'dev-clean',
+        'dev-other', 'test-clean', 'test-other'
+    ]
+
+    def __init__(self,
+                 subset: str='train-clean-100',
+                 feat_type: str='raw',
+                 **kwargs):
+        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
+            self.subset, subset)
+        self.subset = subset
+        self.feat_type = feat_type
+        self.feat_config = kwargs
+        self._data = self._get_data()
+        super(LIBRISPEECH, self).__init__()
+
+    def _get_speaker_info(self) -> Dict[str, str]:
+        ret = {}
+        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
+            for line in rf.readlines():
+                if ';' in line:  # Skip dataset abstract
+                    continue
+                spk_id, gender = map(str.strip,
+                                     line.split('|')[:2])  # spk_id, gender
+                ret.update({spk_id: gender})
+        return ret
+
+    def _get_text_info(self, trans_file) -> Dict[str, str]:
+        ret = {}
+        with open(trans_file, 'r') as rf:
+            for line in rf.readlines():
+                utt_id, text = map(str.strip, line.split(' ',
+                                                         1))  # utt_id, text
+                ret.update({utt_id: text})
+        return ret
+
+    def _get_data(self):
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
+            download_and_decompress(self.archieves, DATA_HOME,
+                                    len(self.archieves))
+
+        # Speaker info
+        speaker_info = self._get_speaker_info()
+
+        # Text info
+        text_info = {}
+        for root, _, files in os.walk(
+                os.path.join(DATA_HOME, self.audio_path, self.subset)):
+            for file in files:
+                if file.endswith('.trans.txt'):
+                    text_info.update(
+                        self._get_text_info(os.path.join(root, file)))
+
+        data = []
+        for root, _, files in os.walk(
+                os.path.join(DATA_HOME, self.audio_path, self.subset)):
+            for file in files:
+                if file.endswith('.flac'):
+                    utt_id = os.path.splitext(file)[0]
+                    spk_id = utt_id.split('-')[0]
+                    if utt_id not in text_info \
+                        or spk_id not in speaker_info :  # Skip samples with incomplete data
+                        continue
+                    file_path = os.path.join(root, file)
+                    text = text_info[utt_id]
+                    spk_gender = speaker_info[spk_id]
+                    data.append(
+                        self.utt_info(file_path, utt_id, text, spk_id,
+                                      spk_gender))
+
+        return data
+
+    def _convert_to_record(self, idx: int):
+        sample = self._data[idx]
+
+        record = {}
+        # To show all fields in a namedtuple: `type(sample)._fields`
+        for field in type(sample)._fields:
+            record[field] = getattr(sample, field)
+
+        waveform, sr = load_audio(
+            sample[0])  # The first element of sample is file path
+        feat_func = feat_funcs[self.feat_type]
+        feat = feat_func(
+            waveform, sample_rate=sr,
+            **self.feat_config) if feat_func else waveform
+        record.update({'feat': feat, 'duration': len(waveform) / sr})
+        return record
+
+    def create_manifest(self, prefix='manifest'):
+        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
+            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
+
+        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
+                                     f'{prefix}.{self.subset}')
+        with codecs.open(manifest_file, 'w', 'utf-8') as f:
+            for idx in tqdm(range(len(self))):
+                record = self._convert_to_record(idx)
+                record_line = json.dumps(
+                    {
+                        'utt': record['utt_id'],
+                        'feat': record['file_path'],
+                        'feat_shape': (record['duration'], ),
+                        'text': record['text'],
+                        'spk': record['spk_id'],
+                        'gender': record['spk_gender'],
+                    },
+                    ensure_ascii=False)
+                f.write(record_line + '\n')
+        logger.info(f'Manifest file {manifest_file} created.')
+
+    def __getitem__(self, idx):
+        record = self._convert_to_record(idx)
+        return tuple(record.values())
+
+    def __len__(self):
+        return len(self._data)
--- a/paddleaudio/paddleaudio/datasets/ravdess.py
+++ b/paddleaudio/paddleaudio/datasets/ravdess.py
@ -0,0 +1,136 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+import random
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['RAVDESS']
+
+
+class RAVDESS(AudioClassificationDataset):
+    """
+    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
+    lexically-matched statements in a neutral North American accent. Speech emotions
+    includes calm, happy, sad, angry, fearful, surprise, and disgust expressions.
+    Each expression is produced at two levels of emotional intensity (normal, strong),
+    with an additional neutral expression.
+
+    Reference:
+        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
+        A dynamic, multimodal set of facial and vocal expressions in North American English
+        https://doi.org/10.1371/journal.pone.0196391
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
+            'md5':
+            '5411230427d67a21e18aa4d466e6d1b9',
+        },
+        {
+            'url':
+            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
+            'md5':
+            'bc696df654c87fed845eb13823edef8a',
+        },
+    ]
+    label_list = [
+        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
+        'surprised'
+    ]
+    meta_info = collections.namedtuple(
+        'META_INFO', ('modality', 'vocal_channel', 'emotion',
+                      'emotion_intensity', 'statement', 'repitition', 'actor'))
+    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
+    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            seed (:obj:`int`, `optional`, defaults to 0):
+                Set the random seed to shuffle samples.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specify the fold of dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
+        files, labels = self._get_data(mode, seed, n_folds, split)
+        super(RAVDESS, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self, files) -> List[collections.namedtuple]:
+        ret = []
+        for file in files:
+            basename_without_extend = os.path.basename(file)[:-4]
+            ret.append(self.meta_info(*basename_without_extend.split('-')))
+        return ret
+
+    def _get_data(self, mode, seed, n_folds,
+                  split) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(self.speech_path) and not os.path.isdir(
+                self.song_path):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        wav_files = []
+        for root, _, files in os.walk(self.speech_path):
+            for file in files:
+                if file.endswith('.wav'):
+                    wav_files.append(os.path.join(root, file))
+
+        for root, _, files in os.walk(self.song_path):
+            for file in files:
+                if file.endswith('.wav'):
+                    wav_files.append(os.path.join(root, file))
+
+        random.seed(seed)  # shuffle samples to split data
+        random.shuffle(
+            wav_files
+        )  # make sure using the same seed to create train and dev dataset
+        meta_info = self._get_meta_info(wav_files)
+
+        files = []
+        labels = []
+        n_samples_per_fold = len(meta_info) // n_folds
+        for idx, sample in enumerate(meta_info):
+            _, _, emotion, _, _, _, _ = sample
+            target = int(emotion) - 1
+            fold = idx // n_samples_per_fold + 1
+
+            if mode == 'train' and int(fold) != split:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+            if mode != 'train' and int(fold) == split:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+        return files, labels
--- a/paddleaudio/paddleaudio/datasets/tess.py
+++ b/paddleaudio/paddleaudio/datasets/tess.py
@ -0,0 +1,126 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+import random
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['TESS']
+
+
+class TESS(AudioClassificationDataset):
+    """
+    TESS is a set of 200 target words were spoken in the carrier phrase
+    "Say the word _____' by two actresses (aged 26 and 64 years) and
+    recordings were made of the set portraying each of seven emotions(anger,
+    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
+    There are 2800 stimuli in total.
+
+    Reference:
+        Toronto emotional speech set (TESS)
+        https://doi.org/10.5683/SP2/E8H2MF
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
+            'md5':
+            '1465311b24d1de704c4c63e4ccc470c7',
+        },
+    ]
+    label_list = [
+        'angry',
+        'disgust',
+        'fear',
+        'happy',
+        'neutral',
+        'ps',  # pleasant surprise
+        'sad',
+    ]
+    meta_info = collections.namedtuple('META_INFO',
+                                       ('speaker', 'word', 'emotion'))
+    audio_path = 'TESS_Toronto_emotional_speech_set'
+
+    def __init__(self,
+                 mode='train',
+                 seed=0,
+                 n_folds=5,
+                 split=1,
+                 feat_type='raw',
+                 **kwargs):
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            seed (:obj:`int`, `optional`, defaults to 0):
+                Set the random seed to shuffle samples.
+            n_folds (:obj:`int`, `optional`, defaults to 5):
+                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specify the fold of dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
+        files, labels = self._get_data(mode, seed, n_folds, split)
+        super(TESS, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+
+    def _get_meta_info(self, files) -> List[collections.namedtuple]:
+        ret = []
+        for file in files:
+            basename_without_extend = os.path.basename(file)[:-4]
+            ret.append(self.meta_info(*basename_without_extend.split('_')))
+        return ret
+
+    def _get_data(self, mode, seed, n_folds,
+                  split) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        wav_files = []
+        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
+            for file in files:
+                if file.endswith('.wav'):
+                    wav_files.append(os.path.join(root, file))
+
+        random.seed(seed)  # shuffle samples to split data
+        random.shuffle(
+            wav_files
+        )  # make sure using the same seed to create train and dev dataset
+        meta_info = self._get_meta_info(wav_files)
+
+        files = []
+        labels = []
+        n_samples_per_fold = len(meta_info) // n_folds
+        for idx, sample in enumerate(meta_info):
+            _, _, emotion = sample
+            target = self.label_list.index(emotion)
+            fold = idx // n_samples_per_fold + 1
+
+            if mode == 'train' and int(fold) != split:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+            if mode != 'train' and int(fold) == split:
+                files.append(wav_files[idx])
+                labels.append(target)
+
+        return files, labels
--- a/paddleaudio/paddleaudio/datasets/urban_sound.py
+++ b/paddleaudio/paddleaudio/datasets/urban_sound.py
@ -0,0 +1,104 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import os
+from typing import List
+from typing import Tuple
+
+from ..utils.download import download_and_decompress
+from ..utils.env import DATA_HOME
+from .dataset import AudioClassificationDataset
+
+__all__ = ['UrbanSound8K']
+
+
+class UrbanSound8K(AudioClassificationDataset):
+    """
+    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
+    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
+    drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The
+    classes are drawn from the urban sound taxonomy.
+
+    Reference:
+        A Dataset and Taxonomy for Urban Sound Research
+        https://dl.acm.org/doi/10.1145/2647868.2655045
+    """
+
+    archieves = [
+        {
+            'url':
+            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
+            'md5': '9aa69802bbf37fb986f71ec1483a196e',
+        },
+    ]
+    label_list = [
+        "air_conditioner", "car_horn", "children_playing", "dog_bark",
+        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
+        "street_music"
+    ]
+    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
+    meta_info = collections.namedtuple(
+        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
+                      'class_id', 'label'))
+    audio_path = os.path.join('UrbanSound8K', 'audio')
+
+    def __init__(self,
+                 mode: str='train',
+                 split: int=1,
+                 feat_type: str='raw',
+                 **kwargs):
+        files, labels = self._get_data(mode, split)
+        super(UrbanSound8K, self).__init__(
+            files=files, labels=labels, feat_type=feat_type, **kwargs)
+        """
+        Ags:
+            mode (:obj:`str`, `optional`, defaults to `train`):
+                It identifies the dataset mode (train or dev).
+            split (:obj:`int`, `optional`, defaults to 1):
+                It specify the fold of dev dataset.
+            feat_type (:obj:`str`, `optional`, defaults to `raw`):
+                It identifies the feature type that user wants to extrace of an audio file.
+        """
+
+    def _get_meta_info(self):
+        ret = []
+        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
+            for line in rf.readlines()[1:]:
+                ret.append(self.meta_info(*line.strip().split(',')))
+        return ret
+
+    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
+            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
+            download_and_decompress(self.archieves, DATA_HOME)
+
+        meta_info = self._get_meta_info()
+
+        files = []
+        labels = []
+        for sample in meta_info:
+            filename, _, _, _, _, fold, target, _ = sample
+            if mode == 'train' and int(fold) != split:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
+                                 filename))
+                labels.append(int(target))
+
+            if mode != 'train' and int(fold) == split:
+                files.append(
+                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
+                                 filename))
+                labels.append(int(target))
+
+        return files, labels
--- a/paddleaudio/paddleaudio/features/init.py
+++ b/paddleaudio/paddleaudio/features/init.py
@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .augment import *
+from .core import *
--- a/paddleaudio/paddleaudio/features/augment.py
+++ b/paddleaudio/paddleaudio/features/augment.py
@ -0,0 +1,170 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+from numpy import ndarray as array
+
+from paddleaudio.backends import depth_convert
+from paddleaudio.utils import ParameterError
+
+__all__ = [
+    'depth_augment',
+    'spect_augment',
+    'random_crop1d',
+    'random_crop2d',
+    'adaptive_spect_augment',
+]
+
+
+def randint(high: int) -> int:
+    """Generate one random integer in range [0 high)
+
+     This is a helper function for random data augmentaiton
+    """
+    return int(np.random.randint(0, high=high))
+
+
+def rand() -> float:
+    """Generate one floating-point number in range [0 1)
+
+    This is a helper function for random data augmentaiton
+    """
+    return float(np.random.rand(1))
+
+
+def depth_augment(y: array,
+                  choices: List=['int8', 'int16'],
+                  probs: List[float]=[0.5, 0.5]) -> array:
+    """ Audio depth augmentation
+
+    Do audio depth augmentation to simulate the distortion brought by quantization.
+    """
+    assert len(probs) == len(
+        choices
+    ), 'number of choices {} must be equal to size of probs {}'.format(
+        len(choices), len(probs))
+    depth = np.random.choice(choices, p=probs)
+    src_depth = y.dtype
+    y1 = depth_convert(y, depth)
+    y2 = depth_convert(y1, src_depth)
+
+    return y2
+
+
+def adaptive_spect_augment(spect: array, tempo_axis: int=0,
+                           level: float=0.1) -> array:
+    """Do adpative spectrogram augmentation
+
+    The level of the augmentation is gowern by the paramter level,
+    ranging from 0 to 1, with 0 represents no augmentation。
+
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    time_mask_width = int(nt * level * 0.5)
+    freq_mask_width = int(nf * level * 0.5)
+
+    num_time_mask = int(10 * level)
+    num_freq_mask = int(10 * level)
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
+
+
+def spect_augment(spect: array,
+                  tempo_axis: int=0,
+                  max_time_mask: int=3,
+                  max_freq_mask: int=3,
+                  max_time_mask_width: int=30,
+                  max_freq_mask_width: int=20) -> array:
+    """Do spectrogram augmentation in both time and freq axis
+
+    Reference:
+
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    num_time_mask = randint(max_time_mask)
+    num_freq_mask = randint(max_freq_mask)
+
+    time_mask_width = randint(max_time_mask_width)
+    freq_mask_width = randint(max_freq_mask_width)
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
+
+
+def random_crop1d(y: array, crop_len: int) -> array:
+    """ Do random cropping on 1d input signal
+
+    The input is a 1d signal, typically a sound waveform
+    """
+    if y.ndim != 1:
+        'only accept 1d tensor or numpy array'
+    n = len(y)
+    idx = randint(n - crop_len)
+    return y[idx:idx + crop_len]
+
+
+def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
+    """ Do random cropping for 2D array, typically a spectrogram.
+
+    The cropping is done in temporal direction on the time-freq input signal.
+    """
+    if tempo_axis >= s.ndim:
+        raise ParameterError('axis out of range')
+
+    n = s.shape[tempo_axis]
+    idx = randint(high=n - crop_len)
+    sli = [slice(None) for i in range(s.ndim)]
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    out = s[tuple(sli)]
+    return out
--- a/paddleaudio/paddleaudio/features/core.py
+++ b/paddleaudio/paddleaudio/features/core.py
@ -0,0 +1,576 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import scipy
+from numpy import ndarray as array
+from numpy.lib.stride_tricks import as_strided
+from scipy.signal import get_window
+
+from paddleaudio.utils import ParameterError
+
+__all__ = [
+    'stft',
+    'mfcc',
+    'hz_to_mel',
+    'mel_to_hz',
+    'split_frames',
+    'mel_frequencies',
+    'power_to_db',
+    'compute_fbank_matrix',
+    'melspectrogram',
+    'spectrogram',
+    'mu_encode',
+    'mu_decode',
+]
+
+
+def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
+    """Pad an array to a target length along a target axis.
+
+    This differs from `np.pad` by centering the data prior to padding,
+    analogous to `str.center`
+    """
+
+    kwargs.setdefault("mode", "constant")
+    n = data.shape[axis]
+    lpad = int((size - n) // 2)
+    lengths = [(0, 0)] * data.ndim
+    lengths[axis] = (lpad, int(size - n - lpad))
+
+    if lpad < 0:
+        raise ParameterError(("Target size ({size:d}) must be "
+                              "at least input size ({n:d})"))
+
+    return np.pad(data, lengths, **kwargs)
+
+
+def split_frames(x: array, frame_length: int, hop_length: int,
+                 axis: int=-1) -> array:
+    """Slice a data array into (overlapping) frames.
+
+    This function is aligned with librosa.frame
+    """
+
+    if not isinstance(x, np.ndarray):
+        raise ParameterError(
+            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
+
+    if x.shape[axis] < frame_length:
+        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
+                             f" for frame_length={frame_length:d}")
+
+    if hop_length < 1:
+        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
+
+    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.asfortranarray(x)
+    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.ascontiguousarray(x)
+
+    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
+    strides = np.asarray(x.strides)
+
+    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
+
+    if axis == -1:
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * new_stride]
+
+    elif axis == 0:
+        shape = [n_frames, frame_length] + list(x.shape)[1:]
+        strides = [hop_length * new_stride] + list(strides)
+
+    else:
+        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
+
+    return as_strided(x, shape=shape, strides=strides)
+
+
+def _check_audio(y, mono=True) -> bool:
+    """Determine whether a variable contains valid audio data.
+
+    The audio y must be a np.ndarray, ether 1-channel or two channel
+    """
+    if not isinstance(y, np.ndarray):
+        raise ParameterError("Audio data must be of type numpy.ndarray")
+    if y.ndim > 2:
+        raise ParameterError(
+            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if mono and y.ndim == 2:
+        raise ParameterError(
+            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
+        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
+
+    if not np.issubdtype(y.dtype, np.floating):
+        raise ParameterError("Audio data must be floating-point")
+
+    if not np.isfinite(y).all():
+        raise ParameterError("Audio buffer is not finite everywhere")
+
+    return True
+
+
+def hz_to_mel(frequencies: Union[float, List[float], array],
+              htk: bool=False) -> array:
+    """Convert Hz to Mels
+
+    This function is aligned with librosa.
+    """
+    freq = np.asanyarray(frequencies)
+
+    if htk:
+        return 2595.0 * np.log10(1.0 + freq / 700.0)
+
+    # Fill in the linear part
+    f_min = 0.0
+    f_sp = 200.0 / 3
+
+    mels = (freq - f_min) / f_sp
+
+    # Fill in the log-scale part
+
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if freq.ndim:
+        # If we have array data, vectorize
+        log_t = freq >= min_log_hz
+        mels[log_t] = min_log_mel + \
+            np.log(freq[log_t] / min_log_hz) / logstep
+    elif freq >= min_log_hz:
+        # If we have scalar data, heck directly
+        mels = min_log_mel + np.log(freq / min_log_hz) / logstep
+
+    return mels
+
+
+def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
+    """Convert mel bin numbers to frequencies.
+
+    This function is aligned with librosa.
+    """
+    mel_array = np.asanyarray(mels)
+
+    if htk:
+        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
+
+    # Fill in the linear scale
+    f_min = 0.0
+    f_sp = 200.0 / 3
+    freqs = f_min + f_sp * mel_array
+
+    # And now the nonlinear scale
+    min_log_hz = 1000.0  # beginning of log region (Hz)
+    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
+    logstep = np.log(6.4) / 27.0  # step size for log region
+
+    if mel_array.ndim:
+        # If we have vector data, vectorize
+        log_t = mel_array >= min_log_mel
+        freqs[log_t] = min_log_hz * \
+            np.exp(logstep * (mel_array[log_t] - min_log_mel))
+    elif mel_array >= min_log_mel:
+        # If we have scalar data, check directly
+        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
+
+    return freqs
+
+
+def mel_frequencies(n_mels: int=128,
+                    fmin: float=0.0,
+                    fmax: float=11025.0,
+                    htk: bool=False) -> array:
+    """Compute mel frequencies
+
+    This function is aligned with librosa.
+    """
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    min_mel = hz_to_mel(fmin, htk=htk)
+    max_mel = hz_to_mel(fmax, htk=htk)
+
+    mels = np.linspace(min_mel, max_mel, n_mels)
+
+    return mel_to_hz(mels, htk=htk)
+
+
+def fft_frequencies(sr: int, n_fft: int) -> array:
+    """Compute fourier frequencies.
+
+    This function is aligned with librosa.
+    """
+    return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
+
+
+def compute_fbank_matrix(sr: int,
+                         n_fft: int,
+                         n_mels: int=128,
+                         fmin: float=0.0,
+                         fmax: Optional[float]=None,
+                         htk: bool=False,
+                         norm: str="slaney",
+                         dtype: type=np.float32):
+    """Compute fbank matrix.
+
+    This funciton is aligned with librosa.
+    """
+    if norm != "slaney":
+        raise ParameterError('norm must set to slaney')
+
+    if fmax is None:
+        fmax = float(sr) / 2
+
+    # Initialize the weights
+    n_mels = int(n_mels)
+    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+    # Center freqs of each FFT bin
+    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
+
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
+
+    fdiff = np.diff(mel_f)
+    ramps = np.subtract.outer(mel_f, fftfreqs)
+
+    for i in range(n_mels):
+        # lower and upper slopes for all bins
+        lower = -ramps[i] / fdiff[i]
+        upper = ramps[i + 2] / fdiff[i + 1]
+
+        # .. then intersect them with each other and zero
+        weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+    if norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
+        weights *= enorm[:, np.newaxis]
+
+    # Only check weights if f_mel[0] is positive
+    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
+        # This means we have an empty channel somewhere
+        warnings.warn("Empty filters detected in mel frequency basis. "
+                      "Some channels will produce empty responses. "
+                      "Try increasing your sampling rate (and fmax) or "
+                      "reducing n_mels.")
+
+    return weights
+
+
+def stft(x: array,
+         n_fft: int=2048,
+         hop_length: Optional[int]=None,
+         win_length: Optional[int]=None,
+         window: str="hann",
+         center: bool=True,
+         dtype: type=np.complex64,
+         pad_mode: str="reflect") -> array:
+    """Short-time Fourier transform (STFT).
+
+    This function is aligned with librosa.
+    """
+    _check_audio(x)
+    # By default, use the entire frame
+    if win_length is None:
+        win_length = n_fft
+
+    # Set the default hop, if it's not already specified
+    if hop_length is None:
+        hop_length = int(win_length // 4)
+
+    fft_window = get_window(window, win_length, fftbins=True)
+
+    # Pad the window out to n_fft size
+    fft_window = pad_center(fft_window, n_fft)
+
+    # Reshape so that the window can be broadcast
+    fft_window = fft_window.reshape((-1, 1))
+
+    # Pad the time series so that frames are centered
+    if center:
+        if n_fft > x.shape[-1]:
+            warnings.warn(
+                f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
+            )
+        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
+
+    elif n_fft > x.shape[-1]:
+        raise ParameterError(
+            f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
+        )
+
+    # Window the time series.
+    x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
+    # Pre-allocate the STFT matrix
+    stft_matrix = np.empty(
+        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
+    fft = np.fft  # use numpy fft as default
+    # Constrain STFT block sizes to 256 KB
+    MAX_MEM_BLOCK = 2**8 * 2**10
+    # how many columns can we fit within MAX_MEM_BLOCK?
+    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
+    n_columns = max(n_columns, 1)
+
+    for bl_s in range(0, stft_matrix.shape[1], n_columns):
+        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
+        stft_matrix[:, bl_s:bl_t] = fft.rfft(
+            fft_window * x_frames[:, bl_s:bl_t], axis=0)
+
+    return stft_matrix
+
+
+def power_to_db(spect: array,
+                ref: float=1.0,
+                amin: float=1e-10,
+                top_db: Optional[float]=80.0) -> array:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units
+
+    This computes the scaling ``10 * log10(spect / ref)`` in a numerically
+    stable way.
+
+    This function is aligned with librosa.
+    """
+    spect = np.asarray(spect)
+
+    if amin <= 0:
+        raise ParameterError("amin must be strictly positive")
+
+    if np.issubdtype(spect.dtype, np.complexfloating):
+        warnings.warn(
+            "power_to_db was called on complex input so phase "
+            "information will be discarded. To suppress this warning, "
+            "call power_to_db(np.abs(D)**2) instead.")
+        magnitude = np.abs(spect)
+    else:
+        magnitude = spect
+
+    if callable(ref):
+        # User supplied a function to calculate reference power
+        ref_value = ref(magnitude)
+    else:
+        ref_value = np.abs(ref)
+
+    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
+    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
+
+    if top_db is not None:
+        if top_db < 0:
+            raise ParameterError("top_db must be non-negative")
+        log_spec = np.maximum(log_spec, log_spec.max() - top_db)
+
+    return log_spec
+
+
+def mfcc(x,
+         sr: int=16000,
+         spect: Optional[array]=None,
+         n_mfcc: int=20,
+         dct_type: int=2,
+         norm: str="ortho",
+         lifter: int=0,
+         **kwargs) -> array:
+    """Mel-frequency cepstral coefficients (MFCCs)
+
+    This function is NOT strictly aligned with librosa. The following example shows how to get the
+    same result with librosa:
+
+    # paddleaudioe mfcc:
+     kwargs = {
+        'window_size':512,
+        'hop_length':320,
+        'mel_bins':64,
+        'fmin':50,
+         'to_db':False}
+    a = mfcc(x,
+        spect=None,
+        n_mfcc=20,
+        dct_type=2,
+        norm='ortho',
+        lifter=0,
+        **kwargs)
+
+    # librosa mfcc:
+    spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
+                                              win_length=512,
+                                              hop_length=320,
+                                              n_mels=64, fmin=50)
+    b = librosa.feature.mfcc(x,
+        sr=16000,
+        S=spect,
+        n_mfcc=20,
+        dct_type=2,
+        norm='ortho',
+        lifter=0)
+
+    assert np.mean( (a-b)**2) < 1e-8
+
+    """
+    if spect is None:
+        spect = melspectrogram(x, sr=sr, **kwargs)
+
+    M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
+
+    if lifter > 0:
+        factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
+                        lifter)
+        return M * factor[:, np.newaxis]
+    elif lifter == 0:
+        return M
+    else:
+        raise ParameterError(
+            f"MFCC lifter={lifter} must be a non-negative number")
+
+
+def melspectrogram(x: array,
+                   sr: int=16000,
+                   window_size: int=512,
+                   hop_length: int=320,
+                   n_mels: int=64,
+                   fmin: int=50,
+                   fmax: Optional[float]=None,
+                   window: str='hann',
+                   center: bool=True,
+                   pad_mode: str='reflect',
+                   power: float=2.0,
+                   to_db: bool=True,
+                   ref: float=1.0,
+                   amin: float=1e-10,
+                   top_db: Optional[float]=None) -> array:
+    """Compute mel-spectrogram.
+
+    Parameters:
+        x: numpy.ndarray
+        The input wavform is a numpy array [shape=(n,)]
+
+        window_size: int, typically 512, 1024, 2048, etc.
+        The window size for framing, also used as n_fft for stft
+
+
+    Returns:
+        The mel-spectrogram in power scale or db scale(default)
+
+
+    Notes:
+    1. sr is default to 16000, which is commonly used in speech/speaker processing.
+    2. when fmax is None, it is set to sr//2.
+    3. this function will convert mel spectgrum to db scale by default. This is different
+    that of librosa.
+
+    """
+    _check_audio(x, mono=True)
+    if len(x) <= 0:
+        raise ParameterError('The input waveform is empty')
+
+    if fmax is None:
+        fmax = sr // 2
+    if fmin < 0 or fmin >= fmax:
+        raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    spect_power = np.abs(s)**power
+    fb_matrix = compute_fbank_matrix(
+        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
+    mel_spect = np.matmul(fb_matrix, spect_power)
+    if to_db:
+        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
+    else:
+        return mel_spect
+
+
+def spectrogram(x: array,
+                sr: int=16000,
+                window_size: int=512,
+                hop_length: int=320,
+                window: str='hann',
+                center: bool=True,
+                pad_mode: str='reflect',
+                power: float=2.0) -> array:
+    """Compute spectrogram from an input waveform.
+
+    This function is a wrapper for librosa.feature.stft, with addition step to
+    compute the magnitude of the complex spectrogram.
+    """
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    return np.abs(s)**power
+
+
+def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
+    """Mu-law encoding.
+
+    Compute the mu-law decoding given an input code.
+    When quantized is True, the result will be converted to
+    integer in range [0,mu-1]. Otherwise, the resulting signal
+    is in range [-1,1]
+
+
+    Reference:
+        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+
+    """
+    mu = 255
+    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
+    if quantized:
+        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0 , mu-1]
+    return y
+
+
+def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
+    """Mu-law decoding.
+
+    Compute the mu-law decoding given an input code.
+
+    it assumes that the input y is in
+    range [0,mu-1] when quantize is True and [-1,1] otherwise
+
+    Reference:
+        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+
+    """
+    if mu < 1:
+        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
+
+    mu = mu - 1
+    if quantized:  # undo the quantization
+        y = y * 2 / mu - 1
+    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
+    return x
--- a/paddleaudio/paddleaudio/models/init.py
+++ b/paddleaudio/paddleaudio/models/init.py
@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/paddleaudio/paddleaudio/models/panns.py
+++ b/paddleaudio/paddleaudio/models/panns.py
@ -0,0 +1,309 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ..utils.download import load_state_dict_from_url
+from ..utils.env import MODEL_HOME
+
+__all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6']
+
+pretrained_model_urls = {
+    'cnn14': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams',
+    'cnn10': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams',
+    'cnn6': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams',
+}
+
+
+class ConvBlock(nn.Layer):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock, self).__init__()
+
+        self.conv1 = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias_attr=False)
+        self.conv2 = nn.Conv2D(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=(3, 3),
+            stride=(1, 1),
+            padding=(1, 1),
+            bias_attr=False)
+        self.bn1 = nn.BatchNorm2D(out_channels)
+        self.bn2 = nn.BatchNorm2D(out_channels)
+
+    def forward(self, x, pool_size=(2, 2), pool_type='avg'):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = F.relu(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = F.relu(x)
+
+        if pool_type == 'max':
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg':
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg+max':
+            x = F.avg_pool2d(
+                x, kernel_size=pool_size) + F.max_pool2d(
+                    x, kernel_size=pool_size)
+        else:
+            raise Exception(
+                f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".'
+            )
+        return x
+
+
+class ConvBlock5x5(nn.Layer):
+    def __init__(self, in_channels, out_channels):
+        super(ConvBlock5x5, self).__init__()
+
+        self.conv1 = nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=(5, 5),
+            stride=(1, 1),
+            padding=(2, 2),
+            bias_attr=False)
+        self.bn1 = nn.BatchNorm2D(out_channels)
+
+    def forward(self, x, pool_size=(2, 2), pool_type='avg'):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = F.relu(x)
+
+        if pool_type == 'max':
+            x = F.max_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg':
+            x = F.avg_pool2d(x, kernel_size=pool_size)
+        elif pool_type == 'avg+max':
+            x = F.avg_pool2d(
+                x, kernel_size=pool_size) + F.max_pool2d(
+                    x, kernel_size=pool_size)
+        else:
+            raise Exception(
+                f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".'
+            )
+        return x
+
+
+class CNN14(nn.Layer):
+    """
+    The CNN14(14-layer CNNs) mainly consist of 6 convolutional blocks while each convolutional
+    block consists of 2 convolutional layers with a kernel size of 3 × 3.
+
+    Reference:
+        PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
+        https://arxiv.org/pdf/1912.10211.pdf
+    """
+    emb_size = 2048
+
+    def __init__(self, extract_embedding: bool=True):
+
+        super(CNN14, self).__init__()
+        self.bn0 = nn.BatchNorm2D(64)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+        self.fc1 = nn.Linear(2048, self.emb_size)
+        self.fc_audioset = nn.Linear(self.emb_size, 527)
+        self.extract_embedding = extract_embedding
+
+    def forward(self, x):
+        x.stop_gradient = False
+        x = x.transpose([0, 3, 2, 1])
+        x = self.bn0(x)
+        x = x.transpose([0, 3, 2, 1])
+
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = x.mean(axis=3)
+        x = x.max(axis=2) + x.mean(axis=2)
+
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu(self.fc1(x))
+
+        if self.extract_embedding:
+            output = F.dropout(x, p=0.5, training=self.training)
+        else:
+            output = F.sigmoid(self.fc_audioset(x))
+        return output
+
+
+class CNN10(nn.Layer):
+    """
+    The CNN10(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional
+    block consists of 2 convolutional layers with a kernel size of 3 × 3.
+
+    Reference:
+        PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
+        https://arxiv.org/pdf/1912.10211.pdf
+    """
+    emb_size = 512
+
+    def __init__(self, extract_embedding: bool=True):
+
+        super(CNN10, self).__init__()
+        self.bn0 = nn.BatchNorm2D(64)
+        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+
+        self.fc1 = nn.Linear(512, self.emb_size)
+        self.fc_audioset = nn.Linear(self.emb_size, 527)
+        self.extract_embedding = extract_embedding
+
+    def forward(self, x):
+        x.stop_gradient = False
+        x = x.transpose([0, 3, 2, 1])
+        x = self.bn0(x)
+        x = x.transpose([0, 3, 2, 1])
+
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = x.mean(axis=3)
+        x = x.max(axis=2) + x.mean(axis=2)
+
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu(self.fc1(x))
+
+        if self.extract_embedding:
+            output = F.dropout(x, p=0.5, training=self.training)
+        else:
+            output = F.sigmoid(self.fc_audioset(x))
+        return output
+
+
+class CNN6(nn.Layer):
+    """
+    The CNN14(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional
+    block consists of 1 convolutional layers with a kernel size of 5 × 5.
+
+    Reference:
+        PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
+        https://arxiv.org/pdf/1912.10211.pdf
+    """
+    emb_size = 512
+
+    def __init__(self, extract_embedding: bool=True):
+
+        super(CNN6, self).__init__()
+        self.bn0 = nn.BatchNorm2D(64)
+        self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64)
+        self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128)
+        self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256)
+        self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512)
+
+        self.fc1 = nn.Linear(512, self.emb_size)
+        self.fc_audioset = nn.Linear(self.emb_size, 527)
+        self.extract_embedding = extract_embedding
+
+    def forward(self, x):
+        x.stop_gradient = False
+        x = x.transpose([0, 3, 2, 1])
+        x = self.bn0(x)
+        x = x.transpose([0, 3, 2, 1])
+
+        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+        x = F.dropout(x, p=0.2, training=self.training)
+
+        x = x.mean(axis=3)
+        x = x.max(axis=2) + x.mean(axis=2)
+
+        x = F.dropout(x, p=0.5, training=self.training)
+        x = F.relu(self.fc1(x))
+
+        if self.extract_embedding:
+            output = F.dropout(x, p=0.5, training=self.training)
+        else:
+            output = F.sigmoid(self.fc_audioset(x))
+        return output
+
+
+def cnn14(pretrained: bool=False, extract_embedding: bool=True) -> CNN14:
+    model = CNN14(extract_embedding=extract_embedding)
+    if pretrained:
+        state_dict = load_state_dict_from_url(
+            url=pretrained_model_urls['cnn14'],
+            path=os.path.join(MODEL_HOME, 'panns'))
+        model.set_state_dict(state_dict)
+    return model
+
+
+def cnn10(pretrained: bool=False, extract_embedding: bool=True) -> CNN10:
+    model = CNN10(extract_embedding=extract_embedding)
+    if pretrained:
+        state_dict = load_state_dict_from_url(
+            url=pretrained_model_urls['cnn10'],
+            path=os.path.join(MODEL_HOME, 'panns'))
+        model.set_state_dict(state_dict)
+    return model
+
+
+def cnn6(pretrained: bool=False, extract_embedding: bool=True) -> CNN6:
+    model = CNN6(extract_embedding=extract_embedding)
+    if pretrained:
+        state_dict = load_state_dict_from_url(
+            url=pretrained_model_urls['cnn6'],
+            path=os.path.join(MODEL_HOME, 'panns'))
+        model.set_state_dict(state_dict)
+    return model
--- a/paddleaudio/paddleaudio/utils/init.py
+++ b/paddleaudio/paddleaudio/utils/init.py
@ -0,0 +1,18 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .download import *
+from .env import *
+from .error import *
+from .log import *
+from .time import *
--- a/paddleaudio/paddleaudio/utils/download.py
+++ b/paddleaudio/paddleaudio/utils/download.py
@ -0,0 +1,66 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Dict
+from typing import List
+
+from paddle.framework import load as load_state_dict
+from paddle.utils import download
+from pathos.multiprocessing import ProcessPool
+
+from .log import logger
+
+download.logger = logger
+
+
+def decompress(file: str):
+    """
+    Extracts all files from a compressed file.
+    """
+    assert os.path.isfile(file), "File: {} not exists.".format(file)
+    download._decompress(file)
+
+
+def download_and_decompress(archives: List[Dict[str, str]],
+                            path: str,
+                            n_workers: int=0):
+    """
+    Download archieves and decompress to specific path.
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    if n_workers <= 0:
+        for archive in archives:
+            assert 'url' in archive and 'md5' in archive, \
+                'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
+
+            download.get_path_from_url(archive['url'], path, archive['md5'])
+    else:
+        pool = ProcessPool(nodes=n_workers)
+        pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
+                  [path] * len(archives), [_['md5'] for _ in archives])
+        pool.close()
+        pool.join()
+
+
+def load_state_dict_from_url(url: str, path: str, md5: str=None):
+    """
+    Download and load a state dict from url
+    """
+    if not os.path.isdir(path):
+        os.makedirs(path)
+
+    download.get_path_from_url(url, path, md5)
+    return load_state_dict(os.path.join(path, os.path.basename(url)))
--- a/paddleaudio/paddleaudio/utils/env.py
+++ b/paddleaudio/paddleaudio/utils/env.py
@ -0,0 +1,53 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''
+This module is used to store environmental variables in PaddleAudio.
+PPAUDIO_HOME     -->  the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
+├                            default value through the PPAUDIO_HOME environment variable.
+├─ MODEL_HOME    -->  Store model files.
+└─ DATA_HOME     -->  Store automatically downloaded datasets.
+'''
+import os
+
+
+def _get_user_home():
+    return os.path.expanduser('~')
+
+
+def _get_ppaudio_home():
+    if 'PPAUDIO_HOME' in os.environ:
+        home_path = os.environ['PPAUDIO_HOME']
+        if os.path.exists(home_path):
+            if os.path.isdir(home_path):
+                return home_path
+            else:
+                raise RuntimeError(
+                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
+                    format(home_path))
+        else:
+            return home_path
+    return os.path.join(_get_user_home(), '.paddleaudio')
+
+
+def _get_sub_home(directory):
+    home = os.path.join(_get_ppaudio_home(), directory)
+    if not os.path.exists(home):
+        os.makedirs(home)
+    return home
+
+
+USER_HOME = _get_user_home()
+PPAUDIO_HOME = _get_ppaudio_home()
+MODEL_HOME = _get_sub_home('models')
+DATA_HOME = _get_sub_home('datasets')
--- a/paddleaudio/paddleaudio/utils/error.py
+++ b/paddleaudio/paddleaudio/utils/error.py
@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['ParameterError']
+
+
+class ParameterError(Exception):
+    """Exception class for Parameter checking"""
+    pass
--- a/paddleaudio/paddleaudio/utils/log.py
+++ b/paddleaudio/paddleaudio/utils/log.py
@ -0,0 +1,136 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import functools
+import logging
+import threading
+import time
+
+import colorlog
+
+loggers = {}
+
+log_config = {
+    'DEBUG': {
+        'level': 10,
+        'color': 'purple'
+    },
+    'INFO': {
+        'level': 20,
+        'color': 'green'
+    },
+    'TRAIN': {
+        'level': 21,
+        'color': 'cyan'
+    },
+    'EVAL': {
+        'level': 22,
+        'color': 'blue'
+    },
+    'WARNING': {
+        'level': 30,
+        'color': 'yellow'
+    },
+    'ERROR': {
+        'level': 40,
+        'color': 'red'
+    },
+    'CRITICAL': {
+        'level': 50,
+        'color': 'bold_red'
+    }
+}
+
+
+class Logger(object):
+    '''
+    Deafult logger in PaddleAudio
+    Args:
+        name(str) : Logger name, default is 'PaddleAudio'
+    '''
+
+    def __init__(self, name: str=None):
+        name = 'PaddleAudio' if not name else name
+        self.logger = logging.getLogger(name)
+
+        for key, conf in log_config.items():
+            logging.addLevelName(conf['level'], key)
+            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
+            self.__dict__[key.lower()] = functools.partial(self.__call__,
+                                                           conf['level'])
+
+        self.format = colorlog.ColoredFormatter(
+            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
+            log_colors={key: conf['color']
+                        for key, conf in log_config.items()})
+
+        self.handler = logging.StreamHandler()
+        self.handler.setFormatter(self.format)
+
+        self.logger.addHandler(self.handler)
+        self.logLevel = 'DEBUG'
+        self.logger.setLevel(logging.DEBUG)
+        self.logger.propagate = False
+        self._is_enable = True
+
+    def disable(self):
+        self._is_enable = False
+
+    def enable(self):
+        self._is_enable = True
+
+    @property
+    def is_enable(self) -> bool:
+        return self._is_enable
+
+    def __call__(self, log_level: str, msg: str):
+        if not self.is_enable:
+            return
+
+        self.logger.log(log_level, msg)
+
+    @contextlib.contextmanager
+    def use_terminator(self, terminator: str):
+        old_terminator = self.handler.terminator
+        self.handler.terminator = terminator
+        yield
+        self.handler.terminator = old_terminator
+
+    @contextlib.contextmanager
+    def processing(self, msg: str, interval: float=0.1):
+        '''
+        Continuously print a progress bar with rotating special effects.
+        Args:
+            msg(str): Message to be printed.
+            interval(float): Rotation interval. Default to 0.1.
+        '''
+        end = False
+
+        def _printer():
+            index = 0
+            flags = ['\\', '|', '/', '-']
+            while not end:
+                flag = flags[index % len(flags)]
+                with self.use_terminator('\r'):
+                    self.info('{}: {}'.format(msg, flag))
+                time.sleep(interval)
+                index += 1
+
+        t = threading.Thread(target=_printer)
+        t.start()
+        yield
+        end = True
+
+
+logger = Logger()
--- a/paddleaudio/paddleaudio/utils/time.py
+++ b/paddleaudio/paddleaudio/utils/time.py
@ -0,0 +1,67 @@
+# Copyright (c) 2021  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import time
+
+
+class Timer(object):
+    '''Calculate runing speed and estimated time of arrival(ETA)'''
+
+    def __init__(self, total_step: int):
+        self.total_step = total_step
+        self.last_start_step = 0
+        self.current_step = 0
+        self._is_running = True
+
+    def start(self):
+        self.last_time = time.time()
+        self.start_time = time.time()
+
+    def stop(self):
+        self._is_running = False
+        self.end_time = time.time()
+
+    def count(self) -> int:
+        if not self.current_step >= self.total_step:
+            self.current_step += 1
+        return self.current_step
+
+    @property
+    def timing(self) -> float:
+        run_steps = self.current_step - self.last_start_step
+        self.last_start_step = self.current_step
+        time_used = time.time() - self.last_time
+        self.last_time = time.time()
+        return run_steps / time_used
+
+    @property
+    def is_running(self) -> bool:
+        return self._is_running
+
+    @property
+    def eta(self) -> str:
+        if not self.is_running:
+            return '00:00:00'
+        scale = self.total_step / self.current_step
+        remaining_time = (time.time() - self.start_time) * scale
+        return seconds_to_hms(remaining_time)
+
+
+def seconds_to_hms(seconds: int) -> str:
+    '''Convert the number of seconds to hh:mm:ss'''
+    h = math.floor(seconds / 3600)
+    m = math.floor((seconds - h * 3600) / 60)
+    s = int(seconds - h * 3600 - m * 60)
+    hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
+    return hms_str
--- a/paddleaudio/requirements.txt
+++ b/paddleaudio/requirements.txt
@ -0,0 +1,4 @@
+numpy >= 1.15.0
+resampy >= 0.2.2
+scipy >= 1.0.0
+soundfile >= 0.9.0
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import setuptools
+
+# set the version here
+version = '0.1.0a'
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+setuptools.setup(
+    name="paddleaudio",
+    version=version,
+    author="",
+    author_email="",
+    description="PaddleAudio, in development",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="",
+    packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.6',
+    install_requires=[
+        'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
+        'soundfile >= 0.9.0'
+    ],
+    extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
+                    }  # for dev only, install: pip install -e .[dev]
+)
--- a/paddleaudio/test/README.md
+++ b/paddleaudio/test/README.md
@ -0,0 +1,41 @@
+# PaddleAudio Testing Guide
+
+
+
+
+# Testing
+First clone a version of the project by
+```
+git clone https://github.com/PaddlePaddle/models.git
+
+```
+Then install the project in your virtual environment.
+```
+cd models/PaddleAudio
+python setup.py bdist_wheel
+pip install -e .[dev]
+```
+The requirements for testing will be installed along with PaddleAudio.  
+
+Now run
+```
+pytest test
+```
+
+If it goes well, you will see outputs like these:
+```
+platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
+rootdir: ./models/PaddleAudio
+plugins: hydra-core-1.0.6
+collected 16 items  
+
+test/unit_test/test_backend.py ...........                                                                         [ 68%]
+test/unit_test/test_features.py .....                                                                              [100%]
+
+==================================================== warnings summary ====================================================
+.
+.
+.
+-- Docs: https://docs.pytest.org/en/stable/warnings.html
+============================================ 16 passed, 11 warnings in 6.76s =============================================
+```
--- a/paddleaudio/test/unit_test/test_backend.py
+++ b/paddleaudio/test/unit_test/test_backend.py
@ -0,0 +1,114 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import librosa
+import numpy as np
+import pytest
+
+import paddleaudio
+
+TEST_FILE = './test/data/test_audio.wav'
+
+
+def relative_err(a, b, real=True):
+    """compute relative error of two matrices or vectors"""
+    if real:
+        return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
+    else:
+        err = np.sum((a.real - b.real)**2) / \
+            (EPS + np.sum(a.real**2) + np.sum(b.real**2))
+        err += np.sum((a.imag - b.imag)**2) / \
+            (EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
+
+        return err
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def load_audio():
+    x, r = librosa.load(TEST_FILE, sr=16000)
+    print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
+    return x, r
+
+
+# start testing
+x, r = load_audio()
+EPS = 1e-8
+
+
+def test_load():
+    s, r = paddleaudio.load(TEST_FILE, sr=16000)
+    assert r == 16000
+    assert s.dtype == 'float32'
+
+    s, r = paddleaudio.load(
+        TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
+    assert len(s) / r == 2.0
+    assert r == 16000
+    assert s.dtype == 'int16'
+
+
+def test_depth_convert():
+    y = paddleaudio.depth_convert(x, 'int16')
+    assert len(y) == len(x)
+    assert y.dtype == 'int16'
+    assert np.max(y) <= 32767
+    assert np.min(y) >= -32768
+    assert np.std(y) > EPS
+
+    y = paddleaudio.depth_convert(x, 'int8')
+    assert len(y) == len(x)
+    assert y.dtype == 'int8'
+    assert np.max(y) <= 127
+    assert np.min(y) >= -128
+    assert np.std(y) > EPS
+
+
+# test case for resample
+rs_test_data = [
+    (32000, 'kaiser_fast'),
+    (16000, 'kaiser_fast'),
+    (8000, 'kaiser_fast'),
+    (32000, 'kaiser_best'),
+    (16000, 'kaiser_best'),
+    (8000, 'kaiser_best'),
+    (22050, 'kaiser_best'),
+    (44100, 'kaiser_best'),
+]
+
+
+@pytest.mark.parametrize('sr,mode', rs_test_data)
+def test_resample(sr, mode):
+    y = paddleaudio.resample(x, 16000, sr, mode=mode)
+    factor = sr / 16000
+    err = relative_err(len(y), len(x) * factor)
+    print('err:', err)
+    assert err < EPS
+
+
+def test_normalize():
+    y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
+    assert np.max(y) < 0.5 + EPS
+
+    y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
+    assert np.max(y) <= 2.0 + EPS
+
+    y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
+    print('np.std(y):', np.std(y))
+    assert np.abs(np.std(y) - 1.0) < EPS
+
+
+if __name__ == '__main__':
+    test_load()
+    test_depth_convert()
+    test_resample(22050, 'kaiser_fast')
+    test_normalize()
--- a/paddleaudio/test/unit_test/test_features.py
+++ b/paddleaudio/test/unit_test/test_features.py
@ -0,0 +1,144 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import librosa
+import numpy as np
+import pytest
+
+import paddleaudio as pa
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def load_audio():
+    x, r = librosa.load('./test/data/test_audio.wav')
+    #x,r = librosa.load('../data/test_audio.wav',sr=16000)
+    return x, r
+
+
+## start testing
+x, r = load_audio()
+EPS = 1e-8
+
+
+def relative_err(a, b, real=True):
+    """compute relative error of two matrices or vectors"""
+    if real:
+        return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
+    else:
+        err = np.sum((a.real - b.real)**2) / (
+            EPS + np.sum(a.real**2) + np.sum(b.real**2))
+        err += np.sum((a.imag - b.imag)**2) / (
+            EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
+
+        return err
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_melspectrogram():
+    a = pa.melspectrogram(
+        x,
+        window_size=512,
+        sr=16000,
+        hop_length=320,
+        n_mels=64,
+        fmin=50,
+        to_db=False, )
+    b = librosa.feature.melspectrogram(
+        x,
+        sr=16000,
+        n_fft=512,
+        win_length=512,
+        hop_length=320,
+        n_mels=64,
+        fmin=50)
+    assert relative_err(a, b) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_melspectrogram_db():
+
+    a = pa.melspectrogram(
+        x,
+        window_size=512,
+        sr=16000,
+        hop_length=320,
+        n_mels=64,
+        fmin=50,
+        to_db=True,
+        ref=1.0,
+        amin=1e-10,
+        top_db=None)
+    b = librosa.feature.melspectrogram(
+        x,
+        sr=16000,
+        n_fft=512,
+        win_length=512,
+        hop_length=320,
+        n_mels=64,
+        fmin=50)
+    b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
+    assert relative_err(a, b) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_stft():
+    a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
+    b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
+    assert a.shape == b.shape
+    assert relative_err(a, b, real=False) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_split_frames():
+    a = librosa.util.frame(x, frame_length=512, hop_length=320)
+    b = pa.split_frames(x, frame_length=512, hop_length=320)
+    assert relative_err(a, b) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_mfcc():
+    kwargs = {
+        'window_size': 512,
+        'hop_length': 320,
+        'n_mels': 64,
+        'fmin': 50,
+        'to_db': False
+    }
+    a = pa.mfcc(
+        x,
+        #sample_rate=16000,
+        spect=None,
+        n_mfcc=20,
+        dct_type=2,
+        norm='ortho',
+        lifter=0,
+        **kwargs)
+    S = librosa.feature.melspectrogram(
+        x,
+        sr=16000,
+        n_fft=512,
+        win_length=512,
+        hop_length=320,
+        n_mels=64,
+        fmin=50)
+    b = librosa.feature.mfcc(
+        x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
+    assert relative_err(a, b) < EPS
+
+
+if __name__ == '__main__':
+    test_melspectrogram()
+    test_melspectrogram_db()
+    test_stft()
+    test_split_frames()
+    test_mfcc()