Merge pull request #938 from KPatr1ck/develop
[audio] Merge PaddleAudio into PaddleSpeech.pull/942/head
commit
b3ebf5d4c4
@ -0,0 +1,7 @@
|
|||||||
|
.ipynb_checkpoints/**
|
||||||
|
*.ipynb
|
||||||
|
nohup.out
|
||||||
|
__pycache__/
|
||||||
|
*.wav
|
||||||
|
*.m4a
|
||||||
|
obsolete/**
|
@ -0,0 +1,45 @@
|
|||||||
|
repos:
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: yapf
|
||||||
|
name: yapf
|
||||||
|
entry: yapf
|
||||||
|
language: system
|
||||||
|
args: [-i, --style .style.yapf]
|
||||||
|
files: \.py$
|
||||||
|
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: a11d9314b22d8f8c7556443875b731ef05965464
|
||||||
|
hooks:
|
||||||
|
- id: check-merge-conflict
|
||||||
|
- id: check-symlinks
|
||||||
|
- id: end-of-file-fixer
|
||||||
|
- id: trailing-whitespace
|
||||||
|
- id: detect-private-key
|
||||||
|
- id: check-symlinks
|
||||||
|
- id: check-added-large-files
|
||||||
|
|
||||||
|
- repo: https://github.com/pycqa/isort
|
||||||
|
rev: 5.8.0
|
||||||
|
hooks:
|
||||||
|
- id: isort
|
||||||
|
name: isort (python)
|
||||||
|
- id: isort
|
||||||
|
name: isort (cython)
|
||||||
|
types: [cython]
|
||||||
|
- id: isort
|
||||||
|
name: isort (pyi)
|
||||||
|
types: [pyi]
|
||||||
|
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: flake8
|
||||||
|
name: flake8
|
||||||
|
entry: flake8
|
||||||
|
language: system
|
||||||
|
args:
|
||||||
|
- --count
|
||||||
|
- --select=E9,F63,F7,F82
|
||||||
|
- --show-source
|
||||||
|
- --statistics
|
||||||
|
files: \.py$
|
@ -0,0 +1,3 @@
|
|||||||
|
[style]
|
||||||
|
based_on_style = pep8
|
||||||
|
column_limit = 80
|
@ -0,0 +1,201 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
@ -0,0 +1,37 @@
|
|||||||
|
# PaddleAudio: The audio library for PaddlePaddle
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, state-of-the-art pre-trained models in sound tagging/classification and anomaly sound detection. More models and features are on the roadmap.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Features
|
||||||
|
- Spectrogram and related features are compatible with librosa.
|
||||||
|
- State-of-the-art models in sound tagging on Audioset, sound classification on esc50, and more to come.
|
||||||
|
- Ready-to-use audio embedding with a line of code, includes sound embedding and more on the roadmap.
|
||||||
|
- Data loading supports for common open source audio in multiple languages including English, Mandarin and so on.
|
||||||
|
|
||||||
|
|
||||||
|
## Install
|
||||||
|
```
|
||||||
|
git clone https://github.com/PaddlePaddle/models
|
||||||
|
cd models/PaddleAudio
|
||||||
|
pip install .
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
### Audio loading and feature extraction
|
||||||
|
```
|
||||||
|
import paddleaudio as pa
|
||||||
|
s,r = pa.load(f)
|
||||||
|
mel_spect = pa.melspectrogram(s,sr=r)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
We provide a set of examples to help you get started in using PaddleAudio quickly.
|
||||||
|
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
|
||||||
|
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
|
||||||
|
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
|
||||||
|
|
||||||
|
Please refer to [example directory](./examples) for more details.
|
@ -0,0 +1,527 @@
|
|||||||
|
Speech
|
||||||
|
Male speech, man speaking
|
||||||
|
Female speech, woman speaking
|
||||||
|
Child speech, kid speaking
|
||||||
|
Conversation
|
||||||
|
Narration, monologue
|
||||||
|
Babbling
|
||||||
|
Speech synthesizer
|
||||||
|
Shout
|
||||||
|
Bellow
|
||||||
|
Whoop
|
||||||
|
Yell
|
||||||
|
Battle cry
|
||||||
|
Children shouting
|
||||||
|
Screaming
|
||||||
|
Whispering
|
||||||
|
Laughter
|
||||||
|
Baby laughter
|
||||||
|
Giggle
|
||||||
|
Snicker
|
||||||
|
Belly laugh
|
||||||
|
Chuckle, chortle
|
||||||
|
Crying, sobbing
|
||||||
|
Baby cry, infant cry
|
||||||
|
Whimper
|
||||||
|
Wail, moan
|
||||||
|
Sigh
|
||||||
|
Singing
|
||||||
|
Choir
|
||||||
|
Yodeling
|
||||||
|
Chant
|
||||||
|
Mantra
|
||||||
|
Male singing
|
||||||
|
Female singing
|
||||||
|
Child singing
|
||||||
|
Synthetic singing
|
||||||
|
Rapping
|
||||||
|
Humming
|
||||||
|
Groan
|
||||||
|
Grunt
|
||||||
|
Whistling
|
||||||
|
Breathing
|
||||||
|
Wheeze
|
||||||
|
Snoring
|
||||||
|
Gasp
|
||||||
|
Pant
|
||||||
|
Snort
|
||||||
|
Cough
|
||||||
|
Throat clearing
|
||||||
|
Sneeze
|
||||||
|
Sniff
|
||||||
|
Run
|
||||||
|
Shuffle
|
||||||
|
Walk, footsteps
|
||||||
|
Chewing, mastication
|
||||||
|
Biting
|
||||||
|
Gargling
|
||||||
|
Stomach rumble
|
||||||
|
Burping, eructation
|
||||||
|
Hiccup
|
||||||
|
Fart
|
||||||
|
Hands
|
||||||
|
Finger snapping
|
||||||
|
Clapping
|
||||||
|
Heart sounds, heartbeat
|
||||||
|
Heart murmur
|
||||||
|
Cheering
|
||||||
|
Applause
|
||||||
|
Chatter
|
||||||
|
Crowd
|
||||||
|
Hubbub, speech noise, speech babble
|
||||||
|
Children playing
|
||||||
|
Animal
|
||||||
|
Domestic animals, pets
|
||||||
|
Dog
|
||||||
|
Bark
|
||||||
|
Yip
|
||||||
|
Howl
|
||||||
|
Bow-wow
|
||||||
|
Growling
|
||||||
|
Whimper (dog)
|
||||||
|
Cat
|
||||||
|
Purr
|
||||||
|
Meow
|
||||||
|
Hiss
|
||||||
|
Caterwaul
|
||||||
|
Livestock, farm animals, working animals
|
||||||
|
Horse
|
||||||
|
Clip-clop
|
||||||
|
Neigh, whinny
|
||||||
|
Cattle, bovinae
|
||||||
|
Moo
|
||||||
|
Cowbell
|
||||||
|
Pig
|
||||||
|
Oink
|
||||||
|
Goat
|
||||||
|
Bleat
|
||||||
|
Sheep
|
||||||
|
Fowl
|
||||||
|
Chicken, rooster
|
||||||
|
Cluck
|
||||||
|
Crowing, cock-a-doodle-doo
|
||||||
|
Turkey
|
||||||
|
Gobble
|
||||||
|
Duck
|
||||||
|
Quack
|
||||||
|
Goose
|
||||||
|
Honk
|
||||||
|
Wild animals
|
||||||
|
Roaring cats (lions, tigers)
|
||||||
|
Roar
|
||||||
|
Bird
|
||||||
|
Bird vocalization, bird call, bird song
|
||||||
|
Chirp, tweet
|
||||||
|
Squawk
|
||||||
|
Pigeon, dove
|
||||||
|
Coo
|
||||||
|
Crow
|
||||||
|
Caw
|
||||||
|
Owl
|
||||||
|
Hoot
|
||||||
|
Bird flight, flapping wings
|
||||||
|
Canidae, dogs, wolves
|
||||||
|
Rodents, rats, mice
|
||||||
|
Mouse
|
||||||
|
Patter
|
||||||
|
Insect
|
||||||
|
Cricket
|
||||||
|
Mosquito
|
||||||
|
Fly, housefly
|
||||||
|
Buzz
|
||||||
|
Bee, wasp, etc.
|
||||||
|
Frog
|
||||||
|
Croak
|
||||||
|
Snake
|
||||||
|
Rattle
|
||||||
|
Whale vocalization
|
||||||
|
Music
|
||||||
|
Musical instrument
|
||||||
|
Plucked string instrument
|
||||||
|
Guitar
|
||||||
|
Electric guitar
|
||||||
|
Bass guitar
|
||||||
|
Acoustic guitar
|
||||||
|
Steel guitar, slide guitar
|
||||||
|
Tapping (guitar technique)
|
||||||
|
Strum
|
||||||
|
Banjo
|
||||||
|
Sitar
|
||||||
|
Mandolin
|
||||||
|
Zither
|
||||||
|
Ukulele
|
||||||
|
Keyboard (musical)
|
||||||
|
Piano
|
||||||
|
Electric piano
|
||||||
|
Organ
|
||||||
|
Electronic organ
|
||||||
|
Hammond organ
|
||||||
|
Synthesizer
|
||||||
|
Sampler
|
||||||
|
Harpsichord
|
||||||
|
Percussion
|
||||||
|
Drum kit
|
||||||
|
Drum machine
|
||||||
|
Drum
|
||||||
|
Snare drum
|
||||||
|
Rimshot
|
||||||
|
Drum roll
|
||||||
|
Bass drum
|
||||||
|
Timpani
|
||||||
|
Tabla
|
||||||
|
Cymbal
|
||||||
|
Hi-hat
|
||||||
|
Wood block
|
||||||
|
Tambourine
|
||||||
|
Rattle (instrument)
|
||||||
|
Maraca
|
||||||
|
Gong
|
||||||
|
Tubular bells
|
||||||
|
Mallet percussion
|
||||||
|
Marimba, xylophone
|
||||||
|
Glockenspiel
|
||||||
|
Vibraphone
|
||||||
|
Steelpan
|
||||||
|
Orchestra
|
||||||
|
Brass instrument
|
||||||
|
French horn
|
||||||
|
Trumpet
|
||||||
|
Trombone
|
||||||
|
Bowed string instrument
|
||||||
|
String section
|
||||||
|
Violin, fiddle
|
||||||
|
Pizzicato
|
||||||
|
Cello
|
||||||
|
Double bass
|
||||||
|
Wind instrument, woodwind instrument
|
||||||
|
Flute
|
||||||
|
Saxophone
|
||||||
|
Clarinet
|
||||||
|
Harp
|
||||||
|
Bell
|
||||||
|
Church bell
|
||||||
|
Jingle bell
|
||||||
|
Bicycle bell
|
||||||
|
Tuning fork
|
||||||
|
Chime
|
||||||
|
Wind chime
|
||||||
|
Change ringing (campanology)
|
||||||
|
Harmonica
|
||||||
|
Accordion
|
||||||
|
Bagpipes
|
||||||
|
Didgeridoo
|
||||||
|
Shofar
|
||||||
|
Theremin
|
||||||
|
Singing bowl
|
||||||
|
Scratching (performance technique)
|
||||||
|
Pop music
|
||||||
|
Hip hop music
|
||||||
|
Beatboxing
|
||||||
|
Rock music
|
||||||
|
Heavy metal
|
||||||
|
Punk rock
|
||||||
|
Grunge
|
||||||
|
Progressive rock
|
||||||
|
Rock and roll
|
||||||
|
Psychedelic rock
|
||||||
|
Rhythm and blues
|
||||||
|
Soul music
|
||||||
|
Reggae
|
||||||
|
Country
|
||||||
|
Swing music
|
||||||
|
Bluegrass
|
||||||
|
Funk
|
||||||
|
Folk music
|
||||||
|
Middle Eastern music
|
||||||
|
Jazz
|
||||||
|
Disco
|
||||||
|
Classical music
|
||||||
|
Opera
|
||||||
|
Electronic music
|
||||||
|
House music
|
||||||
|
Techno
|
||||||
|
Dubstep
|
||||||
|
Drum and bass
|
||||||
|
Electronica
|
||||||
|
Electronic dance music
|
||||||
|
Ambient music
|
||||||
|
Trance music
|
||||||
|
Music of Latin America
|
||||||
|
Salsa music
|
||||||
|
Flamenco
|
||||||
|
Blues
|
||||||
|
Music for children
|
||||||
|
New-age music
|
||||||
|
Vocal music
|
||||||
|
A capella
|
||||||
|
Music of Africa
|
||||||
|
Afrobeat
|
||||||
|
Christian music
|
||||||
|
Gospel music
|
||||||
|
Music of Asia
|
||||||
|
Carnatic music
|
||||||
|
Music of Bollywood
|
||||||
|
Ska
|
||||||
|
Traditional music
|
||||||
|
Independent music
|
||||||
|
Song
|
||||||
|
Background music
|
||||||
|
Theme music
|
||||||
|
Jingle (music)
|
||||||
|
Soundtrack music
|
||||||
|
Lullaby
|
||||||
|
Video game music
|
||||||
|
Christmas music
|
||||||
|
Dance music
|
||||||
|
Wedding music
|
||||||
|
Happy music
|
||||||
|
Funny music
|
||||||
|
Sad music
|
||||||
|
Tender music
|
||||||
|
Exciting music
|
||||||
|
Angry music
|
||||||
|
Scary music
|
||||||
|
Wind
|
||||||
|
Rustling leaves
|
||||||
|
Wind noise (microphone)
|
||||||
|
Thunderstorm
|
||||||
|
Thunder
|
||||||
|
Water
|
||||||
|
Rain
|
||||||
|
Raindrop
|
||||||
|
Rain on surface
|
||||||
|
Stream
|
||||||
|
Waterfall
|
||||||
|
Ocean
|
||||||
|
Waves, surf
|
||||||
|
Steam
|
||||||
|
Gurgling
|
||||||
|
Fire
|
||||||
|
Crackle
|
||||||
|
Vehicle
|
||||||
|
Boat, Water vehicle
|
||||||
|
Sailboat, sailing ship
|
||||||
|
Rowboat, canoe, kayak
|
||||||
|
Motorboat, speedboat
|
||||||
|
Ship
|
||||||
|
Motor vehicle (road)
|
||||||
|
Car
|
||||||
|
Vehicle horn, car horn, honking
|
||||||
|
Toot
|
||||||
|
Car alarm
|
||||||
|
Power windows, electric windows
|
||||||
|
Skidding
|
||||||
|
Tire squeal
|
||||||
|
Car passing by
|
||||||
|
Race car, auto racing
|
||||||
|
Truck
|
||||||
|
Air brake
|
||||||
|
Air horn, truck horn
|
||||||
|
Reversing beeps
|
||||||
|
Ice cream truck, ice cream van
|
||||||
|
Bus
|
||||||
|
Emergency vehicle
|
||||||
|
Police car (siren)
|
||||||
|
Ambulance (siren)
|
||||||
|
Fire engine, fire truck (siren)
|
||||||
|
Motorcycle
|
||||||
|
Traffic noise, roadway noise
|
||||||
|
Rail transport
|
||||||
|
Train
|
||||||
|
Train whistle
|
||||||
|
Train horn
|
||||||
|
Railroad car, train wagon
|
||||||
|
Train wheels squealing
|
||||||
|
Subway, metro, underground
|
||||||
|
Aircraft
|
||||||
|
Aircraft engine
|
||||||
|
Jet engine
|
||||||
|
Propeller, airscrew
|
||||||
|
Helicopter
|
||||||
|
Fixed-wing aircraft, airplane
|
||||||
|
Bicycle
|
||||||
|
Skateboard
|
||||||
|
Engine
|
||||||
|
Light engine (high frequency)
|
||||||
|
Dental drill, dentist's drill
|
||||||
|
Lawn mower
|
||||||
|
Chainsaw
|
||||||
|
Medium engine (mid frequency)
|
||||||
|
Heavy engine (low frequency)
|
||||||
|
Engine knocking
|
||||||
|
Engine starting
|
||||||
|
Idling
|
||||||
|
Accelerating, revving, vroom
|
||||||
|
Door
|
||||||
|
Doorbell
|
||||||
|
Ding-dong
|
||||||
|
Sliding door
|
||||||
|
Slam
|
||||||
|
Knock
|
||||||
|
Tap
|
||||||
|
Squeak
|
||||||
|
Cupboard open or close
|
||||||
|
Drawer open or close
|
||||||
|
Dishes, pots, and pans
|
||||||
|
Cutlery, silverware
|
||||||
|
Chopping (food)
|
||||||
|
Frying (food)
|
||||||
|
Microwave oven
|
||||||
|
Blender
|
||||||
|
Water tap, faucet
|
||||||
|
Sink (filling or washing)
|
||||||
|
Bathtub (filling or washing)
|
||||||
|
Hair dryer
|
||||||
|
Toilet flush
|
||||||
|
Toothbrush
|
||||||
|
Electric toothbrush
|
||||||
|
Vacuum cleaner
|
||||||
|
Zipper (clothing)
|
||||||
|
Keys jangling
|
||||||
|
Coin (dropping)
|
||||||
|
Scissors
|
||||||
|
Electric shaver, electric razor
|
||||||
|
Shuffling cards
|
||||||
|
Typing
|
||||||
|
Typewriter
|
||||||
|
Computer keyboard
|
||||||
|
Writing
|
||||||
|
Alarm
|
||||||
|
Telephone
|
||||||
|
Telephone bell ringing
|
||||||
|
Ringtone
|
||||||
|
Telephone dialing, DTMF
|
||||||
|
Dial tone
|
||||||
|
Busy signal
|
||||||
|
Alarm clock
|
||||||
|
Siren
|
||||||
|
Civil defense siren
|
||||||
|
Buzzer
|
||||||
|
Smoke detector, smoke alarm
|
||||||
|
Fire alarm
|
||||||
|
Foghorn
|
||||||
|
Whistle
|
||||||
|
Steam whistle
|
||||||
|
Mechanisms
|
||||||
|
Ratchet, pawl
|
||||||
|
Clock
|
||||||
|
Tick
|
||||||
|
Tick-tock
|
||||||
|
Gears
|
||||||
|
Pulleys
|
||||||
|
Sewing machine
|
||||||
|
Mechanical fan
|
||||||
|
Air conditioning
|
||||||
|
Cash register
|
||||||
|
Printer
|
||||||
|
Camera
|
||||||
|
Single-lens reflex camera
|
||||||
|
Tools
|
||||||
|
Hammer
|
||||||
|
Jackhammer
|
||||||
|
Sawing
|
||||||
|
Filing (rasp)
|
||||||
|
Sanding
|
||||||
|
Power tool
|
||||||
|
Drill
|
||||||
|
Explosion
|
||||||
|
Gunshot, gunfire
|
||||||
|
Machine gun
|
||||||
|
Fusillade
|
||||||
|
Artillery fire
|
||||||
|
Cap gun
|
||||||
|
Fireworks
|
||||||
|
Firecracker
|
||||||
|
Burst, pop
|
||||||
|
Eruption
|
||||||
|
Boom
|
||||||
|
Wood
|
||||||
|
Chop
|
||||||
|
Splinter
|
||||||
|
Crack
|
||||||
|
Glass
|
||||||
|
Chink, clink
|
||||||
|
Shatter
|
||||||
|
Liquid
|
||||||
|
Splash, splatter
|
||||||
|
Slosh
|
||||||
|
Squish
|
||||||
|
Drip
|
||||||
|
Pour
|
||||||
|
Trickle, dribble
|
||||||
|
Gush
|
||||||
|
Fill (with liquid)
|
||||||
|
Spray
|
||||||
|
Pump (liquid)
|
||||||
|
Stir
|
||||||
|
Boiling
|
||||||
|
Sonar
|
||||||
|
Arrow
|
||||||
|
Whoosh, swoosh, swish
|
||||||
|
Thump, thud
|
||||||
|
Thunk
|
||||||
|
Electronic tuner
|
||||||
|
Effects unit
|
||||||
|
Chorus effect
|
||||||
|
Basketball bounce
|
||||||
|
Bang
|
||||||
|
Slap, smack
|
||||||
|
Whack, thwack
|
||||||
|
Smash, crash
|
||||||
|
Breaking
|
||||||
|
Bouncing
|
||||||
|
Whip
|
||||||
|
Flap
|
||||||
|
Scratch
|
||||||
|
Scrape
|
||||||
|
Rub
|
||||||
|
Roll
|
||||||
|
Crushing
|
||||||
|
Crumpling, crinkling
|
||||||
|
Tearing
|
||||||
|
Beep, bleep
|
||||||
|
Ping
|
||||||
|
Ding
|
||||||
|
Clang
|
||||||
|
Squeal
|
||||||
|
Creak
|
||||||
|
Rustle
|
||||||
|
Whir
|
||||||
|
Clatter
|
||||||
|
Sizzle
|
||||||
|
Clicking
|
||||||
|
Clickety-clack
|
||||||
|
Rumble
|
||||||
|
Plop
|
||||||
|
Jingle, tinkle
|
||||||
|
Hum
|
||||||
|
Zing
|
||||||
|
Boing
|
||||||
|
Crunch
|
||||||
|
Silence
|
||||||
|
Sine wave
|
||||||
|
Harmonic
|
||||||
|
Chirp tone
|
||||||
|
Sound effect
|
||||||
|
Pulse
|
||||||
|
Inside, small room
|
||||||
|
Inside, large room or hall
|
||||||
|
Inside, public space
|
||||||
|
Outside, urban or manmade
|
||||||
|
Outside, rural or natural
|
||||||
|
Reverberation
|
||||||
|
Echo
|
||||||
|
Noise
|
||||||
|
Environmental noise
|
||||||
|
Static
|
||||||
|
Mains hum
|
||||||
|
Distortion
|
||||||
|
Sidetone
|
||||||
|
Cacophony
|
||||||
|
White noise
|
||||||
|
Pink noise
|
||||||
|
Throbbing
|
||||||
|
Vibration
|
||||||
|
Television
|
||||||
|
Radio
|
||||||
|
Field recording
|
@ -0,0 +1,112 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddleaudio.backends import load as load_audio
|
||||||
|
from paddleaudio.features import melspectrogram
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
from paddleaudio.utils import logger
|
||||||
|
|
||||||
|
# yapf: disable
# Command-line interface for the audio-tagging demo.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
    '--device',
    choices=['cpu', 'gpu'],
    default='gpu',
    help='Select which device to predict, defaults to gpu.')
parser.add_argument(
    '--wav',
    type=str,
    required=True,
    help='Audio file to infer.')
parser.add_argument(
    '--sample_duration',
    type=float,
    default=2.0,
    help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument(
    '--hop_duration',
    type=float,
    default=0.3,
    help='Duration(in seconds) between two samples.')
parser.add_argument(
    '--output_dir',
    type=str,
    default='./output_dir',
    help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def split(waveform: np.ndarray, win_size: int, hop_size: int):
    """Split a waveform into fixed-size windows.

    Windows start every ``hop_size`` samples and are ``win_size`` samples
    long; the final window is zero-padded on the right when the waveform
    runs out. The number of windows is therefore decided by ``win_size``
    and ``hop_size``.

    Args:
        waveform: 1-D array of audio samples.
        win_size: Length (in samples) of each window.
        hop_size: Step (in samples) between consecutive window starts.

    Returns:
        A pair ``(offsets, segments)`` where ``offsets[k]`` is the start of
        segment ``k`` as a fraction of the full waveform length and
        ``segments[k]`` is the (possibly padded) window itself.
    """
    assert isinstance(waveform, np.ndarray)
    total_len = len(waveform)
    offsets = []
    segments = []
    for start in range(0, total_len, hop_size):
        window = waveform[start:start + win_size]
        if len(window) < win_size:
            # Zero-pad the tail so every segment has exactly win_size samples.
            window = np.pad(window, (0, win_size - len(window)))
        segments.append(window)
        offsets.append(start / total_len)
    return offsets, segments
|
||||||
|
|
||||||
|
|
||||||
|
def batchify(data: List[List[float]],
             sample_rate: int,
             batch_size: int,
             **kwargs):
    """Extract features from waveforms and create batches.

    Each waveform is converted to a (num_frames, num_melbins) mel-spectrogram
    (extra keyword arguments are forwarded to ``melspectrogram``), then the
    examples are yielded in groups of ``batch_size``; the final group may be
    smaller.
    """
    examples = [
        melspectrogram(waveform, sample_rate, **kwargs).transpose()
        for waveform in data
    ]

    # Separate the examples into batches of batch_size.
    batch = []
    for feats in examples:
        batch.append(feats)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Flush the remainder, if any.
    if batch:
        yield batch
|
||||||
|
|
||||||
|
|
||||||
|
def predict(model, data: List[List[float]], sample_rate: int,
            batch_size: int=1):
    """Use pretrained model to make predictions.

    Args:
        model: Classifier in eval mode; called as ``model(feats)``.
        data: List of fixed-length waveforms (see ``split()``).
        sample_rate: Sample rate used for feature extraction.
        batch_size: Number of samples per forward pass.

    Returns:
        np.ndarray with one score row per input waveform, concatenated
        across batches.
    """
    batches = batchify(data, sample_rate, batch_size)
    results = None
    model.eval()
    for batch in batches:
        # FIX: the comment used to hang off a backslash line continuation,
        # which is fragile (a comment on a continuation line); keep it above.
        # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
        feats = paddle.to_tensor(batch).unsqueeze(1)
        audioset_scores = model(feats)
        if results is None:
            results = audioset_scores.numpy()
        else:
            results = np.concatenate((results, audioset_scores.numpy()))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    paddle.set_device(args.device)
    # Pretrained AudioSet tagger (cnn14); classification head included.
    model = cnn14(pretrained=True, extract_embedding=False)
    # Load at native sample rate (sr=None means no resampling).
    waveform, sr = load_audio(args.wav, sr=None)
    # Slice the waveform into overlapping tagging samples.
    time, data = split(waveform,
                       int(args.sample_duration * sr),
                       int(args.hop_duration * sr))
    results = predict(model, data, sr, batch_size=8)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Relative start position of each sample within the waveform.
    # NOTE(review): this recomputes what split() already returned in `time`;
    # the two should agree -- confirm.
    time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
    np.savez(output_file, time=time, scores=results)
    logger.info(f'Saved tagging results to {output_file}')
|
@ -0,0 +1,84 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import ast
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from paddleaudio.utils import logger
|
||||||
|
|
||||||
|
# Command-line interface for the top-k label parsing script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
# NOTE(review): help text is empty for the required --tagging_file flag.
parser.add_argument('--tagging_file', type=str, required=True, help='')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def smooth(results: np.ndarray, win_size: int):
    """Apply a trailing moving average to ``results`` in place.

    Row ``i`` is replaced by the mean of rows ``max(0, i+1-win_size)..i``.
    Iterating from the last row backwards guarantees that every window
    reads original (not yet smoothed) values.
    """
    for idx in reversed(range(len(results))):
        lo = max(0, idx + 1 - win_size)
        results[idx] = np.sum(results[lo:idx + 1], axis=0) / (idx - lo + 1)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
    """Format the k highest-scoring labels, one ``label: score`` per line."""
    scores = np.asarray(result)
    top_indices = np.argsort(-scores)[:k]

    lines = [f'{label_map[i]}: {scores[i]}\n' for i in top_indices]
    return ''.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Map line index -> label text read from the AudioSet label file.
    label_map = {}
    with open(args.label_file, 'r') as f:
        for i, l in enumerate(f.readlines()):
            label_map[i] = l.strip()

    # Load the .npz produced by the tagging script: per-sample times + scores.
    results = np.load(args.tagging_file, allow_pickle=True)
    times, scores = results['time'], results['scores']

    if args.smooth:
        logger.info('Posterior smoothing...')
        # In-place trailing moving average over the score rows.
        smooth(scores, win_size=args.smooth_size)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Output file shares the tagging file's base name, with a .txt suffix.
    output_file = os.path.join(
        args.output_dir,
        os.path.basename(args.tagging_file).split('.')[0] + '.txt')
    with open(output_file, 'w') as f:
        # One block per sample: the timestamp line, then top-k labels.
        for time, score in zip(times, scores):
            f.write(f'{time}\n')
            f.write(generate_topk_label(args.top_k, label_map, score) + '\n')

    logger.info(f'Saved tagging labels to {output_file}')
|
@ -0,0 +1,147 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from paddle import inference
|
||||||
|
from scipy.special import softmax
|
||||||
|
|
||||||
|
from paddleaudio.backends import load as load_audio
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.features import melspectrogram
|
||||||
|
|
||||||
|
# Command-line interface for the static-graph inference demo.
# NOTE(review): type=eval executes arbitrary command-line input; consider
# ast.literal_eval instead.
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(files, **kwargs):
    """Load audio files and return a batch of mel-spectrogram features.

    Args:
        files: Iterable of audio file paths (the original annotation said
            ``str``, but the function iterates over multiple paths).
        **kwargs: Forwarded to melspectrogram().

    Returns:
        np.ndarray of shape (len(files), num_frames, num_melbins); shorter
        waveforms are zero-padded on the right to the longest one.
    """
    waveforms = []
    srs = []
    max_length = 0  # length (in samples) of the longest waveform
    for file in files:
        waveform, sr = load_audio(file, sr=None)
        max_length = max(max_length, len(waveform))
        waveforms.append(waveform)
        srs.append(sr)

    feats = []
    for i in range(len(waveforms)):
        # Zero-pad on the right so every feature has the same num_frames.
        if len(waveforms[i]) < max_length:
            pad_width = max_length - len(waveforms[i])
            waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))

        # BUG FIX: use each file's own sample rate (srs[i]); the original
        # reused the leaked loop variable `sr` of the LAST file loaded,
        # producing wrong features when inputs have different sample rates.
        feat = melspectrogram(waveforms[i], srs[i], **kwargs).transpose()
        feats.append(feat)

    return np.stack(feats, axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
class Predictor(object):
    """Paddle-Inference wrapper around the exported sound classifier.

    Builds an inference.Config for the requested device (gpu / cpu / xpu)
    and exposes predict() for classifying a list of wave files.
    """

    def __init__(self,
                 model_dir,
                 device="gpu",
                 batch_size=1,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        """Create the inference predictor.

        Args:
            model_dir: Directory holding inference.pdmodel / .pdiparams.
            device: One of 'gpu', 'cpu', 'xpu'.
            batch_size: Max batch size (also passed to TensorRT).
            use_tensorrt: Enable the TensorRT engine (GPU only).
            precision: 'fp32' or 'fp16' (TensorRT precision).
            cpu_threads: Math-library threads when running on CPU.
            enable_mkldnn: Enable MKL-DNN acceleration on CPU.
        """
        self.batch_size = batch_size

        model_file = os.path.join(model_dir, "inference.pdmodel")
        params_file = os.path.join(model_dir, "inference.pdiparams")

        assert os.path.isfile(model_file) and os.path.isfile(
            params_file), 'Please check model and parameter files.'

        config = inference.Config(model_file, params_file)
        if device == "gpu":
            # set GPU configs accordingly,
            # such as initialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        # Zero-copy input/output handles instead of feed/fetch ops.
        config.switch_use_feed_fetch_ops(False)
        self.predictor = inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        # Only the first output (logits) is consumed.
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    def predict(self, wavs):
        """Classify wave files; returns predicted class indices (np.ndarray)."""
        feats = extract_features(wavs)

        self.input_handles[0].copy_from_cpu(feats)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        # softmax then argmax over the class axis.
        probs = softmax(logits, axis=1)
        indices = np.argmax(probs, axis=1)

        return indices
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Define predictor to do prediction.
    predictor = Predictor(args.model_dir, args.device, args.batch_size,
                          args.use_tensorrt, args.precision, args.cpu_threads,
                          args.enable_mkldnn)

    # Demo inputs; expanded to absolute paths below.
    wavs = [
        '~/audio_demo_resource/cat.wav',
        '~/audio_demo_resource/dog.wav',
    ]

    for i in range(len(wavs)):
        wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
        assert os.path.isfile(
            wavs[i]), f'Please check input wave file: {wavs[i]}'

    results = predictor.predict(wavs)
    for idx, wav in enumerate(wavs):
        # Map each predicted class index to its ESC50 label name.
        print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')
|
@ -0,0 +1,45 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from model import SoundClassifier
|
||||||
|
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
|
||||||
|
# Command-line interface for the static-model export script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Rebuild the fine-tuned classifier and load trained weights.
    model = SoundClassifier(
        backbone=cnn14(pretrained=False, extract_embedding=True),
        num_class=len(ESC50.label_list))
    model.set_state_dict(paddle.load(args.checkpoint))
    model.eval()

    # Trace to a static graph. The input is (batch, num_frames, 64) mel
    # features with dynamic batch and frame dimensions.
    model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None, 64], dtype=paddle.float32)
        ])

    # Save in static graph model.
    paddle.jit.save(model, os.path.join(args.output_dir, "inference"))
|
@ -0,0 +1,36 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import paddle.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
class SoundClassifier(nn.Layer):
    """Sound-classification head on top of a pretrained PANNs backbone.

    The backbone turns single-channel mel features into an embedding; a
    dropout layer plus one linear layer maps the embedding to per-class
    logits.
    """

    def __init__(self, backbone, num_class, dropout=0.1):
        super(SoundClassifier, self).__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.backbone.emb_size, num_class)

    def forward(self, x):
        """Map (batch, num_frames, num_melbins) features to class logits."""
        # Insert the channel axis expected by the backbone:
        # (batch, num_frames, num_melbins) -> (batch, 1, num_frames, num_melbins)
        embedding = self.backbone(x.unsqueeze(1))
        return self.fc(self.dropout(embedding))
|
@ -0,0 +1,61 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import paddle.nn.functional as F
|
||||||
|
from model import SoundClassifier
|
||||||
|
|
||||||
|
from paddleaudio.backends import load as load_audio
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.features import melspectrogram
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
|
||||||
|
# Command-line interface for the single-file prediction script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(file: str, **kwargs):
    """Load one audio file and return its (num_frames, num_melbins) mel feature."""
    samples, sample_rate = load_audio(file, sr=None)
    return melspectrogram(samples, sample_rate, **kwargs).transpose()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    paddle.set_device(args.device)

    # Rebuild the classifier and load fine-tuned weights from the checkpoint.
    model = SoundClassifier(
        backbone=cnn14(pretrained=False, extract_embedding=True),
        num_class=len(ESC50.label_list))
    model.set_state_dict(paddle.load(args.checkpoint))
    model.eval()

    # Add a leading batch axis: (num_frames, num_melbins) -> (1, ...).
    feat = np.expand_dims(extract_features(args.wav), 0)
    feat = paddle.to_tensor(feat)
    logits = model(feat)
    probs = F.softmax(logits, axis=1).numpy()

    # Class indices sorted by descending probability.
    sorted_indices = (-probs[0]).argsort()

    msg = f'[{args.wav}]\n'
    for idx in sorted_indices[:args.top_k]:
        msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
    print(msg)
|
@ -0,0 +1,149 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from model import SoundClassifier
|
||||||
|
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
from paddleaudio.utils import logger
|
||||||
|
from paddleaudio.utils import Timer
|
||||||
|
|
||||||
|
# Command-line interface for the ESC50 fine-tuning script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.")
parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    paddle.set_device(args.device)
    nranks = paddle.distributed.get_world_size()
    # Multi-GPU: initialize the parallel environment once per process.
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    local_rank = paddle.distributed.get_rank()

    # Pretrained PANNs backbone + fresh ESC50 classification head.
    backbone = cnn14(pretrained=True, extract_embedding=True)
    model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
    model = paddle.DataParallel(model)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.learning_rate, parameters=model.parameters())
    criterion = paddle.nn.loss.CrossEntropyLoss()

    train_ds = ESC50(mode='train', feat_type='melspectrogram')
    dev_ds = ESC50(mode='dev', feat_type='melspectrogram')

    # Distributed sampler so each rank sees a distinct shard per epoch.
    train_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
    train_loader = paddle.io.DataLoader(
        train_ds,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        return_list=True,
        use_buffer_reader=True, )

    steps_per_epoch = len(train_sampler)
    timer = Timer(steps_per_epoch * args.epochs)
    timer.start()

    for epoch in range(1, args.epochs + 1):
        model.train()

        # Running statistics, reset after every log_freq steps.
        avg_loss = 0
        num_corrects = 0
        num_samples = 0
        for batch_idx, batch in enumerate(train_loader):
            feats, labels = batch
            logits = model(feats)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            # Step the LR scheduler only when one is actually configured.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            optimizer.clear_grad()

            # Calculate loss
            avg_loss += loss.numpy()[0]

            # Calculate metrics
            preds = paddle.argmax(logits, axis=1)
            num_corrects += (preds == labels).numpy().sum()
            num_samples += feats.shape[0]

            timer.count()

            # Periodic training log on rank 0 only.
            if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0:
                lr = optimizer.get_lr()
                avg_loss /= args.log_freq
                avg_acc = num_corrects / num_samples

                print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                    epoch, args.epochs, batch_idx + 1, steps_per_epoch)
                print_msg += ' loss={:.4f}'.format(avg_loss)
                print_msg += ' acc={:.4f}'.format(avg_acc)
                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
                    lr, timer.timing, timer.eta)
                logger.train(print_msg)

                avg_loss = 0
                num_corrects = 0
                num_samples = 0

            # Evaluate + checkpoint at the LAST step of every save_freq-th
            # epoch, on rank 0 only.
            if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
                dev_sampler = paddle.io.BatchSampler(
                    dev_ds,
                    batch_size=args.batch_size,
                    shuffle=False,
                    drop_last=False)
                dev_loader = paddle.io.DataLoader(
                    dev_ds,
                    batch_sampler=dev_sampler,
                    num_workers=args.num_workers,
                    return_list=True, )

                model.eval()
                num_corrects = 0
                num_samples = 0
                with logger.processing('Evaluation on validation dataset'):
                    for batch_idx, batch in enumerate(dev_loader):
                        feats, labels = batch
                        logits = model(feats)

                        preds = paddle.argmax(logits, axis=1)
                        num_corrects += (preds == labels).numpy().sum()
                        num_samples += feats.shape[0]

                print_msg = '[Evaluation result]'
                print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)

                logger.eval(print_msg)

                # Save model
                save_dir = os.path.join(args.checkpoint_dir,
                                        'epoch_{}'.format(epoch))
                logger.info('Saving model checkpoint to {}'.format(save_dir))
                paddle.save(model.state_dict(),
                            os.path.join(save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(save_dir, 'model.pdopt'))
|
@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .backends import *
|
||||||
|
from .features import *
|
@ -0,0 +1,14 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .audio import *
|
@ -0,0 +1,303 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import warnings
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import resampy
|
||||||
|
import soundfile as sf
|
||||||
|
from numpy import ndarray as array
|
||||||
|
from scipy.io import wavfile
|
||||||
|
|
||||||
|
from ..utils import ParameterError
|
||||||
|
|
||||||
|
# Public API of this audio-backend module.
__all__ = [
    'resample',
    'to_mono',
    'depth_convert',
    'normalize',
    'save_wav',
    'load',
]
# Supported normalization strategies.
# NOTE(review): the name carries a historical typo ("NORMALMIZE").
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Strategies for merging stereo channels into mono (see to_mono()).
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Resampy filter names accepted by resample().
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small constant guarding divisions and exact full-scale scaling.
EPS = 1e-8
|
||||||
|
|
||||||
|
|
||||||
|
def resample(y: array, src_sr: int, target_sr: int,
             mode: str='kaiser_fast') -> array:
    """ Audio resampling

    This function is the same as using resampy.resample().

    Args:
        y: Input waveform as a numpy array.
        src_sr: Source sample rate.
        target_sr: Target sample rate.
        mode: Resampy filter; one of RESAMPLE_MODES.

    Notes:
        The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_best'

    Raises:
        ParameterError: If y is not a numpy array or mode is unsupported.
    """

    if mode == 'kaiser_best':
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
we recommend the mode kaiser_fast in large scale audio trainning')

    if not isinstance(y, np.ndarray):
        # BUG FIX: the message was missing the f-prefix, so "{type(y)}" was
        # emitted literally instead of the offending type.
        raise ParameterError(
            f'Only support numpy array, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
|
||||||
|
|
||||||
|
|
||||||
|
def to_mono(y: array, merge_type: str='average') -> array:
    """Merge a stereo waveform into mono.

    Args:
        y: Mono (1-D) or stereo (2-D, channel-first) waveform.
        merge_type: One of MERGE_TYPES -- 'ch0'/'ch1' pick one channel,
            'random' picks a channel at random, 'average' mixes both.

    Returns:
        1-D waveform with the same dtype as the input.

    Raises:
        ParameterError: On unknown merge_type, y.ndim > 2, or (for
            'average') an unsupported dtype.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # Averaging must respect the dtype to avoid integer overflow.
    if y.dtype in ('float32', 'float64'):
        # GENERALIZED: float64 input is now averaged the same way as
        # float32 (previously it fell through to the error branch).
        y_out = (y[0] + y[1]) * 0.5
    elif y.dtype == 'int16':
        # Widen to int32 for the sum, then clip back to int16.
        y_out = y.astype('int32')
        y_out = (y_out[0] + y_out[1]) // 2
        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
                        np.iinfo(y.dtype).max).astype(y.dtype)

    elif y.dtype == 'int8':
        # Widen to int16 for the sum, then clip back to int8.
        y_out = y.astype('int16')
        y_out = (y_out[0] + y_out[1]) // 2
        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
                        np.iinfo(y.dtype).max).astype(y.dtype)
    else:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')
    return y_out
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_cast(y: array, dtype: Union[type, str]) -> array:
|
||||||
|
""" data type casting in a safe way, i.e., prevent overflow or underflow
|
||||||
|
|
||||||
|
This function is used internally.
|
||||||
|
"""
|
||||||
|
return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_convert(y: array, dtype: Union[type, str],
                  dithering: bool=True) -> array:
    """Convert audio array to target dtype safely

    This function convert audio waveform to a target dtype, with addition steps of
    preventing overflow/underflow and preserving audio range.

    NOTE(review): the `dithering` flag is accepted but never used by this
    implementation -- confirm whether dithering was intended.
    """

    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype == y.dtype:
        return y

    # float <-> float: clip-and-cast only.
    # NOTE(review): _safe_cast() as written calls np.iinfo, which raises
    # ValueError for float dtypes -- these two branches look broken; confirm.
    if dtype == 'float64' and y.dtype == 'float32':
        return _safe_cast(y, dtype)
    if dtype == 'float32' and y.dtype == 'float64':
        return _safe_cast(y, dtype)

    if dtype == 'int16' or dtype == 'int8':
        if y.dtype in ['float64', 'float32']:
            # float -> int: scale [-1, 1] up to integer full scale, clip, cast.
            factor = np.iinfo(dtype).max
            y = np.clip(y * factor, np.iinfo(dtype).min,
                        np.iinfo(dtype).max).astype(dtype)
            y = y.astype(dtype)
        else:
            if dtype == 'int16' and y.dtype == 'int8':
                # int8 -> int16: scale by the ratio of full-scale maxima
                # (minus EPS so full-scale input does not overflow).
                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
                y = y.astype('float32') * factor
                y = y.astype('int16')

            else:  # dtype == 'int8' and y.dtype=='int16':
                # int16 -> int8: scale down via int32 to avoid overflow.
                y = y.astype('int32') * np.iinfo('int8').max / \
                    np.iinfo('int16').max
                y = y.astype('int8')

    if dtype in ['float32', 'float64']:
        # int -> float: normalize by the source integer full-scale maximum.
        org_dtype = y.dtype
        y = y.astype(dtype) / np.iinfo(org_dtype).max
    return y
|
||||||
|
|
||||||
|
|
||||||
|
def sound_file_load(file: str,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[array, int]:
    """Load audio from disk with the soundfile (libsndfile) backend.

    Seeks to `offset` seconds when given and reads `duration` seconds
    (the whole remainder when None), returning the transposed samples
    together with the file's sample rate.

    Reference:
        http://www.mega-nerd.com/libsndfile/#Features
    """
    with sf.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            sf_desc.seek(int(offset * sr_native))
        frame_duration = -1 if duration is None else int(duration * sr_native)
        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T

    return y, sf_desc.samplerate
|
||||||
|
|
||||||
|
|
||||||
|
def audio_file_load():
    """Load audio using the audiofile library.

    Placeholder for an audiofile-based backend; not implemented yet.

    Reference:
        https://audiofile.68k.org/
    """
    raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
def sox_file_load():
    """Load audio using the sox library.

    Placeholder for a sox-based backend; not implemented yet.

    Reference:
        http://sox.sourceforge.net/
    """
    raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(y: array, norm_type: str='linear',
              mul_factor: float=1.0) -> array:
    """Normalize an audio waveform and apply an extra multiplier.

    'linear' scales by the peak absolute amplitude; 'gaussian' standardizes
    to zero mean and unit standard deviation. EPS guards against division
    by zero in both modes.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor
    if norm_type == 'gaussian':
        centered = y - np.mean(y)
        spread = max(np.std(y), EPS)
        return mul_factor * centered / spread
    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
|
||||||
|
|
||||||
|
|
||||||
|
def save_wav(y: array, sr: int, file: str) -> None:
    """Save audio file to disk.

    This function saves audio to disk using scipy.io.wavfile, with an
    additional step to convert the input waveform to int16 unless it
    already is an integer type.

    Parameters:
        y: Waveform to write; non-integer input is converted to int16.
        sr: Sample rate in Hz; must be positive.
        file: Destination path; must end with '.wav'.

    Raises:
        ParameterError: If the file name is not a .wav path or sr <= 0.

    Notes:
        It only supports raw wav format.
    """
    if not file.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        # Fixed typo in the error message: "recieved" -> "received".
        raise ParameterError(
            f'Sample rate should be larger than 0, received sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    wavfile.write(file, sr, y_out)
|
||||||
|
|
||||||
|
|
||||||
|
def load(
        file: str,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[array, int]:
    """Load audio file from disk.

    This function loads audio from disk using an audio backend
    (currently sound_file_load / libsndfile).

    Parameters:
        file: Path of the audio file to read.
        sr: Target sample rate; when given and different from the file's
            native rate, the signal is resampled.
        mono: If True, merge multi-channel audio into a single channel.
        merge_type: Channel-merge strategy forwarded to to_mono
            (ch0, ch1, random, average).
        normal: If True, normalize the waveform via normalize().
        norm_type: Normalization method forwarded to normalize().
        norm_mul_factor: Extra multiplier forwarded to normalize().
        offset: Start position in seconds.
        duration: Amount of audio to read in seconds; None reads to the end.
        dtype: Output dtype; integer targets are normalized before the
            depth conversion.
        resample_mode: Filter mode forwarded to resample().

    Returns:
        Tuple of (waveform, sample_rate).

    Raises:
        ParameterError: If the loaded audio looks empty.
    """

    y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)

    # Reject empty signals: 1-D mono with no samples, or 2-D
    # multi-channel whose first channel has no samples.
    if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        y = to_mono(y, merge_type)

    # Resample only when a different target rate was requested.
    if sr is not None and sr != r:
        y = resample(y, r, sr, mode=resample_mode)
        r = sr

    if normal:
        y = normalize(y, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # Integer targets still require normalization into [-1, 1]
        # before depth conversion, even when `normal` is False.
        y = normalize(y, 'linear', 1.0)

    y = depth_convert(y, dtype)
    return y, r
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .aishell import AISHELL1
|
||||||
|
from .dcase import UrbanAcousticScenes
|
||||||
|
from .dcase import UrbanAudioVisualScenes
|
||||||
|
from .esc50 import ESC50
|
||||||
|
from .gtzan import GTZAN
|
||||||
|
from .librispeech import LIBRISPEECH
|
||||||
|
from .ravdess import RAVDESS
|
||||||
|
from .tess import TESS
|
||||||
|
from .urban_sound import UrbanSound8K
|
||||||
|
|
||||||
|
# Public dataset classes re-exported at the package level.
__all__ = [
    'AISHELL1',
    'LIBRISPEECH',
    'ESC50',
    'UrbanSound8K',
    'GTZAN',
    'UrbanAcousticScenes',
    'UrbanAudioVisualScenes',
    'RAVDESS',
    'TESS',
]
|
@ -0,0 +1,154 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import codecs
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..backends import load as load_audio
|
||||||
|
from ..utils.download import decompress
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from ..utils.log import logger
|
||||||
|
from .dataset import feat_funcs
|
||||||
|
|
||||||
|
__all__ = ['AISHELL1']
|
||||||
|
|
||||||
|
|
||||||
|
class AISHELL1(Dataset):
    """
    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
    It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including
    smart home, autonomous driving, and industrial production. The whole recording was
    put in quiet indoor environment, using 3 different devices at the same time: high
    fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit),
    iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled
    to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas
    in China were invited to participate in the recording. The manual transcription
    accuracy rate is above 95%, through professional speech annotation and strict
    quality inspection. The corpus is divided into training, development and testing
    sets.

    Reference:
        AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
        https://arxiv.org/abs/1709.05522
    """

    # Download sources. NOTE: 'archieves' is a historical misspelling kept
    # because the download utilities use this attribute name.
    archieves = [
        {
            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
            'md5': '2f494334227864a8a8fec932999db9d8',
        },
    ]
    # Paths below are relative to DATA_HOME.
    text_meta = os.path.join('data_aishell', 'transcript',
                             'aishell_transcript_v0.8.txt')
    # One record per utterance: audio path, utterance id, transcript text.
    utt_info = collections.namedtuple('META_INFO',
                                      ('file_path', 'utt_id', 'text'))
    audio_path = os.path.join('data_aishell', 'wav')
    manifest_path = os.path.join('data_aishell', 'manifest')
    subset = ['train', 'dev', 'test']

    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
        """Create the dataset for one subset ('train', 'dev' or 'test').

        feat_type selects the extractor from feat_funcs ('raw' keeps the
        waveform); extra keyword arguments are forwarded to it.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(AISHELL1, self).__init__()

    def _get_text_info(self) -> Dict[str, str]:
        # Build a mapping of utt_id -> transcript with all whitespace
        # removed from the transcript text.
        ret = {}
        with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
            # NOTE(review): [1:] skips the first transcript line as if it
            # were a header — confirm the transcript file actually has one.
            for line in rf.readlines()[1:]:
                utt_id, text = map(str.strip, line.split(' ',
                                                         1))  # utt_id, text
                ret.update({utt_id: ''.join(text.split())})
        return ret

    def _get_data(self):
        # Download and decompress on first use, then pair each .wav file
        # of the selected subset with its transcript.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
            download_and_decompress(self.archieves, DATA_HOME)
            # Extract *wav from *.tar.gz.
            for root, _, files in os.walk(
                    os.path.join(DATA_HOME, self.audio_path)):
                for file in files:
                    if file.endswith('.tar.gz'):
                        decompress(os.path.join(root, file))
                        os.remove(os.path.join(root, file))

        text_info = self._get_text_info()

        data = []
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.wav'):
                    utt_id = os.path.splitext(file)[0]
                    if utt_id not in text_info:  # There are some utt_id that without label
                        continue
                    text = text_info[utt_id]
                    file_path = os.path.join(root, file)
                    data.append(self.utt_info(file_path, utt_id, text))

        return data

    def _convert_to_record(self, idx: int):
        # Expand one namedtuple sample into a dict and attach the loaded
        # feature plus the clip duration in seconds.
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(
            sample[0])  # The first element of sample is file path
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sample_rate=sr,
            **self.feat_config) if feat_func else waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """Write one JSON line per utterance to '<prefix>.<subset>'."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))

        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
                                     f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text']
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        # Returns the record values in insertion order:
        # (file_path, utt_id, text, feat, duration).
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        return len(self._data)
|
@ -0,0 +1,82 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from ..backends import load as load_audio
|
||||||
|
from ..features import melspectrogram
|
||||||
|
from ..features import mfcc
|
||||||
|
|
||||||
|
# Maps a feat_type name to its feature-extraction callable; 'raw' means
# no extraction (the waveform itself is used as the feature).
feat_funcs = {
    'raw': None,
    'melspectrogram': melspectrogram,
    'mfcc': mfcc,
}
|
||||||
|
|
||||||
|
|
||||||
|
class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Subclasses provide the (files, labels) pairs; this class handles audio
    loading and per-sample feature extraction.
    """

    def __init__(self,
                 files: List[str],
                 labels: List[int],
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.

        Raises:
            RuntimeError: If feat_type is not a key of feat_funcs.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in feat_funcs.keys():
            raise RuntimeError(
                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
            )

        self.files = files
        self.labels = labels

        self.feat_type = feat_type
        self.feat_config = kwargs  # Pass keyword arguments to customize feature config

    def _get_data(self, input_file: str):
        # Subclasses must build the (files, labels) lists from their metadata.
        raise NotImplementedError

    def _convert_to_record(self, idx):
        # Load one audio file and compute its feature ('raw' keeps the
        # waveform unchanged).
        file, label = self.files[idx], self.labels[idx]

        waveform, sample_rate = load_audio(file)
        feat_func = feat_funcs[self.feat_type]

        record = {}
        record['feat'] = feat_func(
            waveform, sample_rate,
            **self.feat_config) if feat_func else waveform
        record['label'] = label
        return record

    def __getitem__(self, idx):
        # Returns (feature, label) as numpy arrays; the feature is
        # transposed — NOTE(review): the resulting axis order depends on
        # what feat_func emits; confirm against the chosen feature.
        record = self._convert_to_record(idx)
        return np.array(record['feat']).transpose(), np.array(
            record['label'], dtype=np.int64)

    def __len__(self):
        return len(self.files)
|
@ -0,0 +1,298 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanAcousticScenes(AudioClassificationDataset):
    """
    TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
    12 European cities in 10 different acoustic scenes using 4 different devices.
    Additionally, synthetic data for 11 mobile devices was created based on the original
    recordings. Of the 12 cities, two are present only in the evaluation set.

    Reference:
        A multi-device dataset for urban acoustic scene classification
        https://arxiv.org/abs/1807.09840
    """

    source_url = 'https://zenodo.org/record/3819968/files/'
    base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
    # Download sources (meta + 16 audio archives). NOTE: 'archieves' is a
    # historical misspelling kept for the download utilities.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': 'ed38956c4246abb56190c1e9b602b7b8',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '97ab8560056b6816808dedc044dcc023',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'fbf856a3a86fff7520549c899dc94372',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '0dbffe7b6e45564da649378723284062',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
        },
        {
            'url': source_url + base_name + '.audio.9.zip',
            'md5': 'a65596a5372eab10c78e08a0de797c9e',
        },
        {
            'url': source_url + base_name + '.audio.10.zip',
            'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
        },
        {
            'url': source_url + base_name + '.audio.11.zip',
            'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
        },
        {
            'url': source_url + base_name + '.audio.12.zip',
            'md5': 'e5f4400c6b9697295fab4cf507155a2f',
        },
        {
            'url': source_url + base_name + '.audio.13.zip',
            'md5': '8855ab9f9896422746ab4c5d89d8da2f',
        },
        {
            'url': source_url + base_name + '.audio.14.zip',
            'md5': '092ad744452cd3e7de78f988a3d13020',
        },
        {
            'url': source_url + base_name + '.audio.15.zip',
            'md5': '4b5eb85f6592aebf846088d9df76b420',
        },
        {
            'url': source_url + base_name + '.audio.16.zip',
            'md5': '2e0a89723e58a3836be019e6996ae460',
        },
    ]
    # Scene classes; a sample's label is its index in this list.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]

    # Metadata files (relative to DATA_HOME) and their row layouts.
    meta = os.path.join(base_name, 'meta.csv')
    meta_info = collections.namedtuple('META_INFO', (
        'filename', 'scene_label', 'identifier', 'source_label'))
    subset_meta = {
        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
        'dev':
        os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
                                              ('filename', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode)
        super(UrbanAcousticScenes, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        # Parse a tab-separated metadata file into namedtuples; with no
        # subset given, the full meta.csv (4 columns) is used.
        if subset is None:
            meta_file = self.meta
            meta_info = self.meta_info
        else:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            meta_info = self.subset_meta_info

        ret = []
        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()[1:] if skip_header else rf.readlines()
            for line in lines:
                ret.append(meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        # Download on first use, then map each metadata row to an absolute
        # audio path and the index of its scene label.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info(subset=mode, skip_header=True)

        files = []
        labels = []
        for sample in meta_info:
            filename, label = sample[:2]
            filename = os.path.basename(filename)
            target = self.label_list.index(label)

            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
            labels.append(int(target))

        return files, labels
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanAudioVisualScenes(AudioClassificationDataset):
    """
    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
    and video recordings from 12 European cities in 10 different scenes.
    This dataset consists of 10-seconds audio and video segments from 10
    acoustic scenes. The total amount of audio in the development set is 34 hours.

    Reference:
        A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
        https://arxiv.org/abs/2011.00030
    """

    source_url = 'https://zenodo.org/record/4477542/files/'
    base_name = 'TAU-urban-audio-visual-scenes-2021-development'

    # Download sources (meta + 8 audio archives). NOTE: 'archieves' is a
    # historical misspelling kept for the download utilities.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '76e3d7ed5291b118372e06379cb2b490',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '7fd6bb63127f5785874a55aba4e77aa5',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': '61396bede29d7c8c89729a01a6f6b2e2',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '6ddac89717fcf9c92c451868eed77fe1',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'af4820756cdf1a7d4bd6037dc034d384',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '2be39a76aeed704d5929d020a2909efd',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': '972d8afe0874720fc2f28086e7cb22a9',
        },
    ]
    # Scene classes; a sample's label is its index in this list.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]

    # Metadata files (relative to DATA_HOME) and their row layouts; rows
    # carry both audio and video filenames.
    meta_base_path = os.path.join(base_name, base_name + '.meta')
    meta = os.path.join(meta_base_path, 'meta.csv')
    meta_info = collections.namedtuple('META_INFO', (
        'filename_audio', 'filename_video', 'scene_label', 'identifier'))
    subset_meta = {
        'train':
        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
        'dev':
        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
        'test':
        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
        'filename_audio', 'filename_video', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode)
        super(UrbanAudioVisualScenes, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        # Parse a tab-separated metadata file into namedtuples; with no
        # subset given, the full meta.csv (4 columns) is used.
        if subset is None:
            meta_file = self.meta
            meta_info = self.meta_info
        else:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            meta_info = self.subset_meta_info

        ret = []
        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()[1:] if skip_header else rf.readlines()
            for line in lines:
                ret.append(meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        # Download on first use (note: into a base_name subdirectory, unlike
        # the acoustic-scenes dataset), then map each metadata row to an
        # absolute audio path and the index of its scene label.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves,
                                    os.path.join(DATA_HOME, self.base_name))

        meta_info = self._get_meta_info(subset=mode, skip_header=True)

        files = []
        labels = []
        for sample in meta_info:
            filename, _, label = sample[:3]  # Video filename is ignored.
            filename = os.path.basename(filename)
            target = self.label_list.index(label)

            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
            labels.append(int(target))

        return files, labels
|
@ -0,0 +1,152 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['ESC50']
|
||||||
|
|
||||||
|
|
||||||
|
class ESC50(AudioClassificationDataset):
    """
    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
    suitable for benchmarking methods of environmental sound classification. The dataset
    consists of 5-second-long recordings organized into 50 semantical classes (with
    40 examples per class)

    Reference:
        ESC: Dataset for Environmental Sound Classification
        http://dx.doi.org/10.1145/2733373.2806390
    """

    # Download source. NOTE: 'archieves' is a historical misspelling kept
    # for the download utilities.
    archieves = [
        {
            'url':
            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
            'md5': '7771e4b9d86d0945acce719c7a59305a',
        },
    ]
    # The 50 classes; a sample's label is its index in this list, which
    # matches the 'target' column of esc50.csv.
    label_list = [
        # Animals
        'Dog',
        'Rooster',
        'Pig',
        'Cow',
        'Frog',
        'Cat',
        'Hen',
        'Insects (flying)',
        'Sheep',
        'Crow',
        # Natural soundscapes & water sounds
        'Rain',
        'Sea waves',
        'Crackling fire',
        'Crickets',
        'Chirping birds',
        'Water drops',
        'Wind',
        'Pouring water',
        'Toilet flush',
        'Thunderstorm',
        # Human, non-speech sounds
        'Crying baby',
        'Sneezing',
        'Clapping',
        'Breathing',
        'Coughing',
        'Footsteps',
        'Laughing',
        'Brushing teeth',
        'Snoring',
        'Drinking, sipping',
        # Interior/domestic sounds
        'Door knock',
        'Mouse click',
        'Keyboard typing',
        'Door, wood creaks',
        'Can opening',
        'Washing machine',
        'Vacuum cleaner',
        'Clock alarm',
        'Clock tick',
        'Glass breaking',
        # Exterior/urban noises
        'Helicopter',
        'Chainsaw',
        'Siren',
        'Car horn',
        'Engine',
        'Train',
        'Church bells',
        'Airplane',
        'Fireworks',
        'Hand saw',
    ]
    # Metadata file (relative to DATA_HOME) and its comma-separated row layout.
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold used as the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        # Parse esc50.csv (skipping the header row) into namedtuples.
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines()[1:]:
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        # Download on first use, then do a cross-validation split by fold:
        # train = every fold except `split`; dev/test = fold == `split`.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()

        files = []
        labels = []
        for sample in meta_info:
            filename, fold, target, _, _, _, _ = sample
            if mode == 'train' and int(fold) != split:
                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
                labels.append(int(target))

            if mode != 'train' and int(fold) == split:
                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
                labels.append(int(target))

        return files, labels
|
@ -0,0 +1,115 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['GTZAN']
|
||||||
|
|
||||||
|
|
||||||
|
class GTZAN(AudioClassificationDataset):
    """
    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
    in machine listening research for music genre recognition (MGR).

    Reference:
        Musical genre classification of audio signals
        https://ieeexplore.ieee.org/document/1021072/
    """

    archieves = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = [
        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock'
    ]
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the GTZAN meta file (tab-separated) into META_INFO records."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines():
                ret.append(self.meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect audio file paths and integer labels for the mode/split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()
        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            meta_info
        )  # make sure using the same seed to create train and dev dataset

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            file_path, label = sample
            filename = os.path.basename(file_path)
            target = self.label_list.index(label)
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

        return files, labels
|
@ -0,0 +1,199 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import codecs
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..backends import load as load_audio
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from ..utils.log import logger
|
||||||
|
from .dataset import feat_funcs
|
||||||
|
|
||||||
|
__all__ = ['LIBRISPEECH']
|
||||||
|
|
||||||
|
|
||||||
|
class LIBRISPEECH(Dataset):
    """
    LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
    prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
    derived from read audiobooks from the LibriVox project, and has been carefully
    segmented and aligned.

    Reference:
        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
        https://arxiv.org/abs/1709.05522
    """

    source_url = 'http://www.openslr.org/resources/12/'
    archieves = [
        {
            'url': source_url + 'train-clean-100.tar.gz',
            'md5': '2a93770f6d5c6c964bc36631d331a522',
        },
        {
            'url': source_url + 'train-clean-360.tar.gz',
            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
        },
        {
            'url': source_url + 'train-other-500.tar.gz',
            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
        },
        {
            'url': source_url + 'dev-clean.tar.gz',
            'md5': '42e2234ba48799c1f50f24a7926300a1',
        },
        {
            'url': source_url + 'dev-other.tar.gz',
            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
        },
        {
            'url': source_url + 'test-clean.tar.gz',
            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
        },
        {
            'url': source_url + 'test-other.tar.gz',
            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
        },
    ]
    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
    utt_info = collections.namedtuple('META_INFO', (
        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
    audio_path = 'LibriSpeech'
    manifest_path = os.path.join('LibriSpeech', 'manifest')
    # Valid subsets; names must match the archive names above.
    # 'train-other-500' was previously misspelled as 'train-clean-500',
    # which made that subset impossible to select.
    subset = [
        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
        'dev-other', 'test-clean', 'test-other'
    ]

    def __init__(self,
                 subset: str='train-clean-100',
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            subset (:obj:`str`, `optional`, defaults to `train-clean-100`):
                Which LibriSpeech subset to load; must be one of ``self.subset``.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(LIBRISPEECH, self).__init__()

    def _get_speaker_info(self) -> Dict[str, str]:
        """Parse SPEAKERS.TXT into a mapping of speaker id -> gender."""
        ret = {}
        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
            for line in rf.readlines():
                if ';' in line:  # Skip dataset abstract
                    continue
                spk_id, gender = map(str.strip,
                                     line.split('|')[:2])  # spk_id, gender
                ret.update({spk_id: gender})
        return ret

    def _get_text_info(self, trans_file) -> Dict[str, str]:
        """Parse one *.trans.txt file into a mapping of utterance id -> text."""
        ret = {}
        with open(trans_file, 'r') as rf:
            for line in rf.readlines():
                utt_id, text = map(str.strip, line.split(' ',
                                                         1))  # utt_id, text
                ret.update({utt_id: text})
        return ret

    def _get_data(self):
        """Walk the subset directory and build the list of utt_info records."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
            download_and_decompress(self.archieves, DATA_HOME,
                                    len(self.archieves))

        # Speaker info
        speaker_info = self._get_speaker_info()

        # Text info
        text_info = {}
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.trans.txt'):
                    text_info.update(
                        self._get_text_info(os.path.join(root, file)))

        data = []
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.flac'):
                    utt_id = os.path.splitext(file)[0]
                    spk_id = utt_id.split('-')[0]
                    if utt_id not in text_info \
                            or spk_id not in speaker_info:  # Skip samples with incomplete data
                        continue
                    file_path = os.path.join(root, file)
                    text = text_info[utt_id]
                    spk_gender = speaker_info[spk_id]
                    data.append(
                        self.utt_info(file_path, utt_id, text, spk_id,
                                      spk_gender))

        return data

    def _convert_to_record(self, idx: int):
        """Load the idx-th sample's audio and return it as a feature record dict."""
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(
            sample[0])  # The first element of sample is file path
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sample_rate=sr,
            **self.feat_config) if feat_func else waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """Write one JSON line per sample to `<manifest_path>/<prefix>.<subset>`."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))

        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
                                     f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text'],
                        'spk': record['spk_id'],
                        'gender': record['spk_gender'],
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        return len(self._data)
|
@ -0,0 +1,136 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['RAVDESS']
|
||||||
|
|
||||||
|
|
||||||
|
class RAVDESS(AudioClassificationDataset):
    """
    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
    lexically-matched statements in a neutral North American accent. Speech emotions
    includes calm, happy, sad, angry, fearful, surprise, and disgust expressions.
    Each expression is produced at two levels of emotional intensity (normal, strong),
    with an additional neutral expression.

    Reference:
        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
        A dynamic, multimodal set of facial and vocal expressions in North American English
        https://doi.org/10.1371/journal.pone.0196391
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
            'md5':
            '5411230427d67a21e18aa4d466e6d1b9',
        },
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
            'md5':
            'bc696df654c87fed845eb13823edef8a',
        },
    ]
    label_list = [
        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
        'surprised'
    ]
    # NOTE: field name 'repitition' (sic) is kept as-is for backward
    # compatibility with callers accessing namedtuple fields by name.
    meta_info = collections.namedtuple(
        'META_INFO', ('modality', 'vocal_channel', 'emotion',
                      'emotion_intensity', 'statement', 'repitition', 'actor'))
    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(RAVDESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Decode the '-'-separated metadata encoded in each wav filename."""
        ret = []
        for file in files:
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('-')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect wav file paths and emotion labels for the mode/split."""
        if not os.path.isdir(self.speech_path) and not os.path.isdir(
                self.song_path):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(self.speech_path):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        for root, _, files in os.walk(self.song_path):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            _, _, emotion, _, _, _, _ = sample
            target = int(emotion) - 1  # emotion codes are 1-based
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(wav_files[idx])
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
@ -0,0 +1,126 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['TESS']
|
||||||
|
|
||||||
|
|
||||||
|
class TESS(AudioClassificationDataset):
    """
    TESS is a set of 200 target words were spoken in the carrier phrase
    "Say the word _____' by two actresses (aged 26 and 64 years) and
    recordings were made of the set portraying each of seven emotions(anger,
    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
    There are 2800 stimuli in total.

    Reference:
        Toronto emotional speech set (TESS)
        https://doi.org/10.5683/SP2/E8H2MF
    """

    archieves = [
        {
            'url':
            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
            'md5':
            '1465311b24d1de704c4c63e4ccc470c7',
        },
    ]
    label_list = [
        'angry',
        'disgust',
        'fear',
        'happy',
        'neutral',
        'ps',  # pleasant surprise
        'sad',
    ]
    meta_info = collections.namedtuple('META_INFO',
                                       ('speaker', 'word', 'emotion'))
    audio_path = 'TESS_Toronto_emotional_speech_set'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(TESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Decode the '_'-separated metadata encoded in each wav filename."""
        ret = []
        for file in files:
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('_')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect wav file paths and emotion labels for the mode/split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            _, _, emotion = sample
            target = self.label_list.index(emotion)
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(wav_files[idx])
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
@ -0,0 +1,104 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['UrbanSound8K']
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanSound8K(AudioClassificationDataset):
    """
    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
    drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The
    classes are drawn from the urban sound taxonomy.

    Reference:
        A Dataset and Taxonomy for Urban Sound Research
        https://dl.acm.org/doi/10.1145/2647868.2655045
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
            'md5': '9aa69802bbf37fb986f71ec1483a196e',
        },
    ]
    label_list = [
        "air_conditioner", "car_horn", "children_playing", "dog_bark",
        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
        "street_music"
    ]
    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
    meta_info = collections.namedtuple(
        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
                      'class_id', 'label'))
    audio_path = os.path.join('UrbanSound8K', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        # NOTE: the docstring above was previously placed AFTER the super()
        # call, making it a dead string expression instead of a docstring.
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self):
        """Parse the UrbanSound8K meta csv into a list of META_INFO records."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines()[1:]:  # skip the csv header row
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Collect audio file paths and integer labels for the mode/split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()

        files = []
        labels = []
        for sample in meta_info:
            filename, _, _, _, _, fold, target, _ = sample
            if mode == 'train' and int(fold) != split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

            if mode != 'train' and int(fold) == split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

        return files, labels
|
@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .augment import *
|
||||||
|
from .core import *
|
@ -0,0 +1,170 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy import ndarray as array
|
||||||
|
|
||||||
|
from paddleaudio.backends import depth_convert
|
||||||
|
from paddleaudio.utils import ParameterError
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'depth_augment',
|
||||||
|
'spect_augment',
|
||||||
|
'random_crop1d',
|
||||||
|
'random_crop2d',
|
||||||
|
'adaptive_spect_augment',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def randint(high: int) -> int:
    """Return a uniformly sampled integer from the interval [0, high).

    Helper used by the random data-augmentation routines in this module.
    """
    return int(np.random.randint(high))
|
||||||
|
|
||||||
|
|
||||||
|
def rand() -> float:
    """Return one floating-point number sampled uniformly from [0, 1).

    Helper for random data augmentation.
    """
    # Fix: np.random.rand() returns a scalar directly; the previous
    # float(np.random.rand(1)) converted a size-1 array, which is
    # deprecated in recent NumPy versions.
    return float(np.random.rand())
|
||||||
|
|
||||||
|
|
||||||
|
def depth_augment(y: array,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> array:
    """Audio bit-depth augmentation.

    Simulates quantization distortion by converting the signal to a
    randomly chosen sample depth and then back to its original dtype.

    Parameters:
        y: input waveform array.
        choices: candidate dtypes to quantize to.
        probs: sampling probability for each entry of ``choices``.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))
    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype
    # Round-trip through the lower depth to introduce quantization noise.
    quantized = depth_convert(y, target_depth)
    restored = depth_convert(quantized, original_depth)
    return restored
|
||||||
|
|
||||||
|
|
||||||
|
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                           level: float=0.1) -> array:
    """Do adaptive spectrogram augmentation (SpecAugment-style masking).

    The strength of the augmentation is governed by the parameter
    ``level``, ranging from 0 to 1, with 0 representing no augmentation.
    Mask widths and mask counts are derived from ``level`` and the
    spectrogram size, so larger inputs get proportionally wider masks.

    Parameters:
        spect: 2D numpy array, the spectrogram to augment.
        tempo_axis: which axis (0 or 1) is the time axis.
        level: augmentation strength in [0, 1].

    Returns:
        The augmented spectrogram. NOTE: ``spect`` is modified in place;
        the returned array is the same object as the input.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    # Mask geometry scales with both the level and the spectrogram size.
    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)

    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0

    return spect
|
||||||
|
|
||||||
|
|
||||||
|
def spect_augment(spect: array,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> array:
    """Do spectrogram augmentation in both time and freq axis.

    The number of masks and each mask's width are drawn uniformly at
    random, bounded (exclusively) by the ``max_*`` parameters.

    Parameters:
        spect: 2D numpy array, the spectrogram to augment.
        tempo_axis: which axis (0 or 1) is the time axis.
        max_time_mask / max_freq_mask: exclusive upper bounds on how many
            masks are applied along each axis.
        max_time_mask_width / max_freq_mask_width: exclusive upper bounds
            on each mask's width.

    Returns:
        The augmented spectrogram. NOTE: ``spect`` is modified in place;
        the returned array is the same object as the input.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)

    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)

    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0

    return spect
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop1d(y: array, crop_len: int) -> array:
    """Randomly crop a contiguous segment from a 1d input signal.

    The input is a 1d signal, typically a sound waveform.

    Parameters:
        y: 1d numpy array of samples.
        crop_len: length of the cropped segment; must not exceed len(y).

    Returns:
        A view of ``y`` of length ``crop_len`` starting at a random offset.

    Raises:
        ParameterError: if ``y`` is not 1-dimensional.
    """
    if y.ndim != 1:
        # Bug fix: the original branch contained only a bare string
        # literal (a no-op); the intent was clearly to reject the input.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    idx = randint(n - crop_len)
    return y[idx:idx + crop_len]
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
    """Randomly crop a 2D array, typically a spectrogram.

    The crop is taken along the temporal axis of the time-freq input.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    total_len = s.shape[tempo_axis]
    start = randint(high=total_len - crop_len)
    # Build a full-slice selector, then narrow only the temporal axis.
    selector = [slice(None)] * s.ndim
    selector[tempo_axis] = slice(start, start + crop_len)
    return s[tuple(selector)]
|
@ -0,0 +1,576 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import warnings
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import scipy
|
||||||
|
from numpy import ndarray as array
|
||||||
|
from numpy.lib.stride_tricks import as_strided
|
||||||
|
from scipy.signal import get_window
|
||||||
|
|
||||||
|
from paddleaudio.utils import ParameterError
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'stft',
|
||||||
|
'mfcc',
|
||||||
|
'hz_to_mel',
|
||||||
|
'mel_to_hz',
|
||||||
|
'split_frames',
|
||||||
|
'mel_frequencies',
|
||||||
|
'power_to_db',
|
||||||
|
'compute_fbank_matrix',
|
||||||
|
'melspectrogram',
|
||||||
|
'spectrogram',
|
||||||
|
'mu_encode',
|
||||||
|
'mu_decode',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
    """Pad an array to a target length along a target axis.

    This differs from `np.pad` by centering the data prior to padding,
    analogous to `str.center`.

    Parameters:
        data: array to pad.
        size: target length along ``axis``; must be >= data.shape[axis].
        axis: axis along which to pad.
        **kwargs: forwarded to ``np.pad``; ``mode`` defaults to "constant".

    Raises:
        ParameterError: if ``size`` is smaller than the input length.
    """

    kwargs.setdefault("mode", "constant")
    n = data.shape[axis]
    lpad = int((size - n) // 2)
    lengths = [(0, 0)] * data.ndim
    lengths[axis] = (lpad, int(size - n - lpad))

    if lpad < 0:
        # Bug fix: the original message lacked the f-prefix, so the
        # {size:d} / {n:d} placeholders were emitted literally.
        raise ParameterError(f"Target size ({size:d}) must be "
                             f"at least input size ({n:d})")

    return np.pad(data, lengths, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def split_frames(x: array, frame_length: int, hop_length: int,
                 axis: int=-1) -> array:
    """Slice a data array into (overlapping) frames.

    This function is aligned with librosa.frame.

    Parameters:
        x: input ndarray; frames are taken along ``axis``.
        frame_length: length of each frame.
        hop_length: number of samples advanced between frames.
        axis: -1 to frame the last axis, 0 to frame the first axis.

    Returns:
        A strided (zero-copy, unless a contiguity copy was needed) view
        of ``x`` containing the frames.

    Raises:
        ParameterError: on non-ndarray input, too-short input, invalid
            hop_length, or an axis other than 0 / -1.
    """

    if not isinstance(x, np.ndarray):
        raise ParameterError(
            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")

    if x.shape[axis] < frame_length:
        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
                             f" for frame_length={frame_length:d}")

    if hop_length < 1:
        raise ParameterError(f"Invalid hop_length: {hop_length:d}")

    # as_strided below needs the framed axis to be memory-contiguous;
    # fall back to a copy (with a warning) when it is not.
    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
        warnings.warn(f"librosa.util.frame called with axis={axis} "
                      "on a non-contiguous input. This will result in a copy.")
        x = np.asfortranarray(x)
    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
        warnings.warn(f"librosa.util.frame called with axis={axis} "
                      "on a non-contiguous input. This will result in a copy.")
        x = np.ascontiguousarray(x)

    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
    strides = np.asarray(x.strides)

    # Stride (in bytes) of a single sample along the framed axis.
    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize

    if axis == -1:
        shape = list(x.shape)[:-1] + [frame_length, n_frames]
        strides = list(strides) + [hop_length * new_stride]

    elif axis == 0:
        shape = [n_frames, frame_length] + list(x.shape)[1:]
        strides = [hop_length * new_stride] + list(strides)

    else:
        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")

    return as_strided(x, shape=shape, strides=strides)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_audio(y, mono=True) -> bool:
|
||||||
|
"""Determine whether a variable contains valid audio data.
|
||||||
|
|
||||||
|
The audio y must be a np.ndarray, ether 1-channel or two channel
|
||||||
|
"""
|
||||||
|
if not isinstance(y, np.ndarray):
|
||||||
|
raise ParameterError("Audio data must be of type numpy.ndarray")
|
||||||
|
if y.ndim > 2:
|
||||||
|
raise ParameterError(
|
||||||
|
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if mono and y.ndim == 2:
|
||||||
|
raise ParameterError(
|
||||||
|
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
|
||||||
|
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if not np.issubdtype(y.dtype, np.floating):
|
||||||
|
raise ParameterError("Audio data must be floating-point")
|
||||||
|
|
||||||
|
if not np.isfinite(y).all():
|
||||||
|
raise ParameterError("Audio buffer is not finite everywhere")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def hz_to_mel(frequencies: Union[float, List[float], array],
              htk: bool=False) -> array:
    """Convert frequencies in Hz to the mel scale.

    Aligned with librosa: uses either the HTK formula or the Slaney
    formula (linear below 1 kHz, logarithmic above).
    """
    freq = np.asanyarray(frequencies)

    if htk:
        # HTK mel scale.
        return 2595.0 * np.log10(1.0 + freq / 700.0)

    # Slaney scale: linear region below min_log_hz.
    f_min = 0.0
    f_sp = 200.0 / 3
    mels = (freq - f_min) / f_sp

    # Parameters of the logarithmic region.
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same point, in mels
    logstep = np.log(6.4) / 27.0  # step size within the log region

    if freq.ndim:
        # Vectorized: overwrite entries that fall in the log region.
        in_log = freq >= min_log_hz
        mels[in_log] = min_log_mel + np.log(freq[in_log] / min_log_hz) / logstep
    elif freq >= min_log_hz:
        # Scalar input located in the log region: check directly.
        mels = min_log_mel + np.log(freq / min_log_hz) / logstep

    return mels
|
||||||
|
|
||||||
|
|
||||||
|
def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
    """Convert mel bin numbers back to frequencies in Hz.

    Inverse of ``hz_to_mel``; aligned with librosa.
    """
    mel_array = np.asanyarray(mels)

    if htk:
        # Inverse of the HTK formula.
        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)

    # Linear region below min_log_mel.
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel_array

    # Parameters of the logarithmic region.
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same point, in mels
    logstep = np.log(6.4) / 27.0  # step size within the log region

    if mel_array.ndim:
        # Vectorized: overwrite entries that fall in the log region.
        in_log = mel_array >= min_log_mel
        freqs[in_log] = min_log_hz * np.exp(logstep *
                                            (mel_array[in_log] - min_log_mel))
    elif mel_array >= min_log_mel:
        # Scalar input located in the log region: check directly.
        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))

    return freqs
|
||||||
|
|
||||||
|
|
||||||
|
def mel_frequencies(n_mels: int=128,
                    fmin: float=0.0,
                    fmax: float=11025.0,
                    htk: bool=False) -> array:
    """Compute ``n_mels`` center frequencies, uniformly spaced on the mel scale.

    This function is aligned with librosa; both endpoints are included.
    """
    # Convert the Hz limits to mels, sample uniformly, convert back.
    lower = hz_to_mel(fmin, htk=htk)
    upper = hz_to_mel(fmax, htk=htk)
    return mel_to_hz(np.linspace(lower, upper, n_mels), htk=htk)
|
||||||
|
|
||||||
|
|
||||||
|
def fft_frequencies(sr: int, n_fft: int) -> array:
    """Return the center frequencies (Hz) of the rfft bins for ``n_fft``.

    This function is aligned with librosa: ``1 + n_fft // 2`` points
    from 0 up to (and including) the Nyquist frequency.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, nyquist, n_bins, endpoint=True)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=128,
                         fmin: float=0.0,
                         fmax: Optional[float]=None,
                         htk: bool=False,
                         norm: str="slaney",
                         dtype: type=np.float32):
    """Compute a mel filter-bank matrix of shape (n_mels, 1 + n_fft // 2).

    This function is aligned with librosa.filters.mel; only the
    "slaney" normalization is supported.

    Parameters:
        sr: sample rate of the signal the filter bank will be applied to.
        n_fft: FFT size; determines the number of frequency bins.
        n_mels: number of mel bands.
        fmin / fmax: frequency range of the filter bank; fmax defaults
            to sr / 2 (Nyquist).
        htk: use the HTK mel scale instead of Slaney's.
        norm: must be "slaney".
        dtype: dtype of the returned weight matrix.

    Raises:
        ParameterError: if ``norm`` is not "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    if fmax is None:
        fmax = float(sr) / 2

    # Initialize the weights
    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]

        # .. then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper))

    if norm == "slaney":
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm[:, np.newaxis]

    # Only check weights if f_mel[0] is positive
    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
        # This means we have an empty channel somewhere
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return weights
|
||||||
|
|
||||||
|
|
||||||
|
def stft(x: array,
         n_fft: int=2048,
         hop_length: Optional[int]=None,
         win_length: Optional[int]=None,
         window: str="hann",
         center: bool=True,
         dtype: type=np.complex64,
         pad_mode: str="reflect") -> array:
    """Short-time Fourier transform (STFT).

    This function is aligned with librosa.stft.

    Parameters:
        x: mono input waveform (1d float ndarray).
        n_fft: FFT size; also the frame length.
        hop_length: hop between frames; defaults to win_length // 4.
        win_length: window length; defaults to n_fft.
        window: window name understood by scipy.signal.get_window.
        center: if True, pad ``x`` by n_fft // 2 on both sides so frames
            are centered on their timestamps.
        dtype: complex dtype of the output matrix.
        pad_mode: numpy pad mode used when ``center`` is True.

    Returns:
        Complex STFT matrix of shape (1 + n_fft // 2, n_frames).

    Raises:
        ParameterError: on invalid audio, or (when center=False) when
            the signal is shorter than n_fft.
    """
    _check_audio(x)
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        if n_fft > x.shape[-1]:
            warnings.warn(
                f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
            )
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)

    elif n_fft > x.shape[-1]:
        raise ParameterError(
            f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
    fft = np.fft  # use numpy fft as default
    # Constrain STFT block sizes to 256 KB
    MAX_MEM_BLOCK = 2**8 * 2**10
    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Process the frames block-by-block to bound peak memory use.
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.rfft(
            fft_window * x_frames[:, bl_s:bl_t], axis=0)

    return stft_matrix
|
||||||
|
|
||||||
|
|
||||||
|
def power_to_db(spect: array,
                ref: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=80.0) -> array:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    Computes the scaling ``10 * log10(spect / ref)`` in a numerically
    stable way. Aligned with librosa.

    Parameters:
        spect: input power spectrogram.
        ref: reference power, or a callable deriving it from the data.
        amin: floor applied before taking the log; must be > 0.
        top_db: if not None, clip the output to ``max - top_db``.
    """
    spect = np.asarray(spect)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    if np.issubdtype(spect.dtype, np.complexfloating):
        warnings.warn("power_to_db was called on complex input so phase "
                      "information will be discarded. To suppress this warning, "
                      "call power_to_db(np.abs(D)**2) instead.")
        magnitude = np.abs(spect)
    else:
        magnitude = spect

    # ``ref`` may be a callable that computes the reference from the data.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is not None:
        if top_db < 0:
            raise ParameterError("top_db must be non-negative")
        log_spec = np.maximum(log_spec, log_spec.max() - top_db)

    return log_spec
|
||||||
|
|
||||||
|
|
||||||
|
def mfcc(x,
         sr: int=16000,
         spect: Optional[array]=None,
         n_mfcc: int=20,
         dct_type: int=2,
         norm: str="ortho",
         lifter: int=0,
         **kwargs) -> array:
    """Mel-frequency cepstral coefficients (MFCCs).

    This function is NOT strictly aligned with librosa: the spectrogram
    is obtained from this module's ``melspectrogram`` (pass
    ``to_db=False`` in ``kwargs`` to match librosa's mel spectrogram),
    after which the DCT step agrees with ``librosa.feature.mfcc`` to
    within numerical precision.

    Parameters:
        x: input waveform; ignored when ``spect`` is given.
        sr: sample rate, forwarded to ``melspectrogram``.
        spect: optional precomputed (mel-)spectrogram.
        n_mfcc: number of cepstral coefficients to keep.
        dct_type / norm: DCT variant and normalization
            (see scipy.fftpack.dct).
        lifter: sinusoidal liftering strength; 0 disables liftering.
        **kwargs: forwarded to ``melspectrogram`` when ``spect`` is None.

    Raises:
        ParameterError: if ``lifter`` is negative.
    """
    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    coeffs = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]

    if lifter > 0:
        # Sinusoidal liftering re-weights the cepstral coefficients.
        lift = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=coeffs.dtype) /
                      lifter)
        return coeffs * lift[:, np.newaxis]
    if lifter == 0:
        return coeffs
    raise ParameterError(f"MFCC lifter={lifter} must be a non-negative number")
|
||||||
|
|
||||||
|
|
||||||
|
def melspectrogram(x: array,
                   sr: int=16000,
                   window_size: int=512,
                   hop_length: int=320,
                   n_mels: int=64,
                   fmin: int=50,
                   fmax: Optional[float]=None,
                   window: str='hann',
                   center: bool=True,
                   pad_mode: str='reflect',
                   power: float=2.0,
                   to_db: bool=True,
                   ref: float=1.0,
                   amin: float=1e-10,
                   top_db: Optional[float]=None) -> array:
    """Compute mel-spectrogram.

    Parameters:
        x: numpy.ndarray
            The input waveform is a numpy array [shape=(n,)]

        window_size: int, typically 512, 1024, 2048, etc.
            The window size for framing, also used as n_fft for stft

        hop_length, window, center, pad_mode, power: forwarded to stft /
            the magnitude computation; n_mels, fmin, fmax configure the
            mel filter bank; ref, amin, top_db are forwarded to
            power_to_db when to_db is True.

    Returns:
        The mel-spectrogram in power scale or db scale (default)

    Raises:
        ParameterError: on invalid/empty audio, or when fmin/fmax are
            inconsistent (the check enforces 0 <= fmin < fmax).

    Notes:
        1. sr defaults to 16000, which is commonly used in speech/speaker processing.
        2. when fmax is None, it is set to sr // 2.
        3. this function converts the mel spectrogram to db scale by
           default, which differs from librosa.
    """
    _check_audio(x, mono=True)
    if len(x) <= 0:
        raise ParameterError('The input waveform is empty')

    if fmax is None:
        fmax = sr // 2
    if fmin < 0 or fmin >= fmax:
        raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')

    s = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)

    spect_power = np.abs(s)**power
    fb_matrix = compute_fbank_matrix(
        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
    mel_spect = np.matmul(fb_matrix, spect_power)
    if to_db:
        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
    else:
        return mel_spect
|
||||||
|
|
||||||
|
|
||||||
|
def spectrogram(x: array,
                sr: int=16000,
                window_size: int=512,
                hop_length: int=320,
                window: str='hann',
                center: bool=True,
                pad_mode: str='reflect',
                power: float=2.0) -> array:
    """Compute a (power) spectrogram from an input waveform.

    Thin wrapper around ``stft`` that additionally takes the magnitude
    of the complex spectrogram, raised to ``power``.

    NOTE(review): ``sr`` is accepted for API symmetry with the other
    feature functions but is not used here.
    """
    complex_spect = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)
    return np.abs(complex_spect)**power
|
||||||
|
|
||||||
|
|
||||||
|
def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
    """Mu-law encode a signal in range [-1, 1].

    When ``quantized`` is True, the result is quantized to integer
    values in [0, mu]. Otherwise, the companded signal stays in [-1, 1].

    Reference:
        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
    """
    # Bug fix: the previous implementation re-assigned ``mu = 255``,
    # silently ignoring the caller-supplied value.
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    if quantized:
        y = np.floor((y + 1) / 2 * mu + 0.5)  # map [-1, 1] -> [0, mu]
    return y
|
||||||
|
|
||||||
|
|
||||||
|
def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
    """Mu-law decoding, the inverse of ``mu_encode``.

    Assumes the input ``y`` is in range [0, mu] when ``quantized`` is
    True and in [-1, 1] otherwise.

    Reference:
        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm

    Raises:
        ParameterError: if ``mu`` < 1.
    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    # Bug fix: the previous implementation decremented ``mu`` before
    # decoding, which did not invert ``mu_encode`` (that uses ``mu``
    # as-is), introducing a systematic round-trip error.
    if quantized:  # undo the quantization: map [0, mu] back to [-1, 1]
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,18 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .download import *
|
||||||
|
from .env import *
|
||||||
|
from .error import *
|
||||||
|
from .log import *
|
||||||
|
from .time import *
|
@ -0,0 +1,66 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from paddle.framework import load as load_state_dict
|
||||||
|
from paddle.utils import download
|
||||||
|
from pathos.multiprocessing import ProcessPool
|
||||||
|
|
||||||
|
from .log import logger
|
||||||
|
|
||||||
|
download.logger = logger
|
||||||
|
|
||||||
|
|
||||||
|
def decompress(file: str):
    """
    Extracts all files from a compressed file.

    ``file`` must be a path to an existing archive; the actual
    extraction is delegated to paddle's ``download._decompress`` helper.
    """
    assert os.path.isfile(file), "File: {} not exists.".format(file)
    download._decompress(file)
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_decompress(archives: List[Dict[str, str]],
                            path: str,
                            n_workers: int=0):
    """
    Download archives and decompress them into a specific path.

    Parameters:
        archives: list of dicts, each carrying an 'url' and an 'md5' key.
        path: destination directory; created when missing.
        n_workers: if > 0, download in parallel with this many processes;
            otherwise download sequentially.
    """
    if not os.path.isdir(path):
        os.makedirs(path)

    if n_workers <= 0:
        for archive in archives:
            # Bug fix: the message was not an f-string (the placeholder
            # was emitted literally) and referenced a misspelled
            # variable name ('archieve').
            assert 'url' in archive and 'md5' in archive, \
                f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'

            download.get_path_from_url(archive['url'], path, archive['md5'])
    else:
        pool = ProcessPool(nodes=n_workers)
        pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
                  [path] * len(archives), [_['md5'] for _ in archives])
        pool.close()
        pool.join()
|
||||||
|
|
||||||
|
|
||||||
|
def load_state_dict_from_url(url: str, path: str, md5: str=None):
    """
    Download a checkpoint from ``url`` into ``path`` (verifying ``md5``
    when given) and load it as a paddle state dict.
    """
    if not os.path.isdir(path):
        os.makedirs(path)

    download.get_path_from_url(url, path, md5)
    local_file = os.path.join(path, os.path.basename(url))
    return load_state_dict(local_file)
|
@ -0,0 +1,53 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
'''
|
||||||
|
This module is used to store environmental variables in PaddleAudio.
|
||||||
|
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
|
||||||
|
├ default value through the PPAUDIO_HOME environment variable.
|
||||||
|
├─ MODEL_HOME --> Store model files.
|
||||||
|
└─ DATA_HOME --> Store automatically downloaded datasets.
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def _get_user_home():
|
||||||
|
return os.path.expanduser('~')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ppaudio_home():
|
||||||
|
if 'PPAUDIO_HOME' in os.environ:
|
||||||
|
home_path = os.environ['PPAUDIO_HOME']
|
||||||
|
if os.path.exists(home_path):
|
||||||
|
if os.path.isdir(home_path):
|
||||||
|
return home_path
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
'The environment variable PPAUDIO_HOME {} is not a directory.'.
|
||||||
|
format(home_path))
|
||||||
|
else:
|
||||||
|
return home_path
|
||||||
|
return os.path.join(_get_user_home(), '.paddleaudio')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_sub_home(directory):
    """Return (creating it if needed) a subdirectory of the PaddleAudio home.

    Args:
        directory(str): Name of the subdirectory (e.g. 'models').

    Returns:
        str: Absolute path of the subdirectory.
    """
    home = os.path.join(_get_ppaudio_home(), directory)
    # exist_ok avoids the check-then-create race of the previous
    # `if not os.path.exists(home): os.makedirs(home)` pattern.
    os.makedirs(home, exist_ok=True)
    return home
|
||||||
|
|
||||||
|
|
||||||
|
# Resolve commonly used locations once at import time so other modules
# can simply import these constants. Note the directories for models
# and datasets are created here as a side effect of _get_sub_home.
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
__all__ = ['ParameterError']


class ParameterError(Exception):
    """Exception raised when an argument fails parameter validation."""
    pass
|
@ -0,0 +1,136 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import contextlib
|
||||||
|
import functools
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
import colorlog
|
||||||
|
|
||||||
|
# Registry placeholder for created loggers (not populated in this module).
loggers = {}

# Level name -> numeric `logging` level and colorlog display color.
# TRAIN (21) and EVAL (22) are custom levels slotted between the standard
# INFO (20) and WARNING (30) levels.
log_config = {
    'DEBUG': {
        'level': 10,
        'color': 'purple'
    },
    'INFO': {
        'level': 20,
        'color': 'green'
    },
    'TRAIN': {
        'level': 21,
        'color': 'cyan'
    },
    'EVAL': {
        'level': 22,
        'color': 'blue'
    },
    'WARNING': {
        'level': 30,
        'color': 'yellow'
    },
    'ERROR': {
        'level': 40,
        'color': 'red'
    },
    'CRITICAL': {
        'level': 50,
        'color': 'bold_red'
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
class Logger(object):
    '''
    Default logger in PaddleAudio.

    One logging method is exposed per entry in `log_config`, in both
    upper- and lower-case spellings (e.g. `logger.INFO(...)` and
    `logger.info(...)`, including the custom TRAIN/EVAL levels).

    Args:
        name(str) : Logger name, default is 'PaddleAudio'
    '''

    def __init__(self, name: str=None):
        name = 'PaddleAudio' if not name else name
        self.logger = logging.getLogger(name)

        # Register every configured level with the logging module and
        # bind a partial of __call__ per level so `self.INFO(msg)` /
        # `self.info(msg)` both work.
        for key, conf in log_config.items():
            logging.addLevelName(conf['level'], key)
            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
            self.__dict__[key.lower()] = functools.partial(self.__call__,
                                                           conf['level'])

        # Colorized output; per-level colors come from `log_config`.
        self.format = colorlog.ColoredFormatter(
            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
            log_colors={key: conf['color']
                        for key, conf in log_config.items()})

        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)

        self.logger.addHandler(self.handler)
        self.logLevel = 'DEBUG'
        self.logger.setLevel(logging.DEBUG)
        # Avoid duplicate output through ancestor loggers.
        self.logger.propagate = False
        self._is_enable = True

    def disable(self):
        # Mute all output until `enable` is called.
        self._is_enable = False

    def enable(self):
        self._is_enable = True

    @property
    def is_enable(self) -> bool:
        # Whether messages are currently emitted.
        return self._is_enable

    def __call__(self, log_level: str, msg: str):
        # Drop the message silently when logging is disabled.
        if not self.is_enable:
            return

        self.logger.log(log_level, msg)

    @contextlib.contextmanager
    def use_terminator(self, terminator: str):
        # Temporarily replace the handler's line terminator (e.g. '\r'
        # for in-place progress updates), restoring it on exit.
        old_terminator = self.handler.terminator
        self.handler.terminator = terminator
        yield
        self.handler.terminator = old_terminator

    @contextlib.contextmanager
    def processing(self, msg: str, interval: float=0.1):
        '''
        Continuously print a progress bar with rotating special effects.

        Args:
            msg(str): Message to be printed.
            interval(float): Rotation interval. Default to 0.1.
        '''
        end = False

        def _printer():
            # Spinner loop running in a background thread; `end` is read
            # from the enclosing scope and flipped after the `with` body
            # finishes, which stops the loop. NOTE(review): the thread is
            # never joined — confirm this is intentional.
            index = 0
            flags = ['\\', '|', '/', '-']
            while not end:
                flag = flags[index % len(flags)]
                with self.use_terminator('\r'):
                    self.info('{}: {}'.format(msg, flag))
                time.sleep(interval)
                index += 1

        t = threading.Thread(target=_printer)
        t.start()
        yield
        end = True
|
||||||
|
|
||||||
|
|
||||||
|
logger = Logger()
|
@ -0,0 +1,67 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
class Timer(object):
    '''Calculate running speed and estimated time of arrival (ETA).

    Args:
        total_step(int): Total number of steps expected; used by `eta`
            to extrapolate the remaining time.
    '''

    def __init__(self, total_step: int):
        self.total_step = total_step
        self.last_start_step = 0  # step counter at the previous `timing` query
        self.current_step = 0
        self._is_running = True

    def start(self):
        '''Record the reference timestamps for speed/ETA computation.'''
        self.last_time = time.time()
        self.start_time = time.time()

    def stop(self):
        '''Freeze the timer; `eta` reports 00:00:00 afterwards.'''
        self._is_running = False
        self.end_time = time.time()

    def count(self) -> int:
        '''Advance one step (capped at total_step) and return the step.'''
        if not self.current_step >= self.total_step:
            self.current_step += 1
        return self.current_step

    @property
    def timing(self) -> float:
        '''Steps per second since the previous `timing` query.'''
        run_steps = self.current_step - self.last_start_step
        self.last_start_step = self.current_step
        time_used = time.time() - self.last_time
        self.last_time = time.time()
        return run_steps / time_used

    @property
    def is_running(self) -> bool:
        return self._is_running

    @property
    def eta(self) -> str:
        '''Estimated time remaining, formatted as hh:mm:ss.

        Returns '00:00:00' when the timer is stopped or before any step
        has been counted (previously this raised ZeroDivisionError on
        current_step == 0, or AttributeError before `start`).
        '''
        if not self.is_running or self.current_step == 0:
            return '00:00:00'
        # NOTE(review): this extrapolates *total* runtime
        # (elapsed * total/current), not elapsed-subtracted remaining time;
        # kept as-is to preserve the existing reporting behavior.
        scale = self.total_step / self.current_step
        remaining_time = (time.time() - self.start_time) * scale
        return seconds_to_hms(remaining_time)
|
||||||
|
|
||||||
|
|
||||||
|
def seconds_to_hms(seconds: int) -> str:
    '''Convert a (non-negative) duration in seconds to an hh:mm:ss string.'''
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return '{:0>2}:{:0>2}:{:0>2}'.format(hours, minutes, secs)
|
@ -0,0 +1,4 @@
|
|||||||
|
numpy >= 1.15.0
|
||||||
|
resampy >= 0.2.2
|
||||||
|
scipy >= 1.0.0
|
||||||
|
soundfile >= 0.9.0
|
@ -0,0 +1,43 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import setuptools
|
||||||
|
|
||||||
|
# set the version here
version = '0.1.0a'

# Use the project README as the long description shown on PyPI.
with open("README.md", "r") as fh:
    long_description = fh.read()
setuptools.setup(
    name="paddleaudio",
    version=version,
    author="",
    author_email="",
    description="PaddleAudio, in development",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="",
    # Ship only the library packages; skip build artifacts, tests, examples.
    packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
    classifiers=[
        "Programming Language :: Python :: 3",
        # NOTE(review): source file headers carry the Apache-2.0 license but
        # this classifier declares MIT — confirm which license applies.
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    install_requires=[
        'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
        'soundfile >= 0.9.0'
    ],
    extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
                    }  # for dev only, install: pip install -e .[dev]
    )
|
@ -0,0 +1,41 @@
|
|||||||
|
# PaddleAudio Testing Guide
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
First clone a version of the project by
|
||||||
|
```
|
||||||
|
git clone https://github.com/PaddlePaddle/models.git
|
||||||
|
|
||||||
|
```
|
||||||
|
Then install the project in your virtual environment.
|
||||||
|
```
|
||||||
|
cd models/PaddleAudio
|
||||||
|
python setup.py bdist_wheel
|
||||||
|
pip install -e .[dev]
|
||||||
|
```
|
||||||
|
The requirements for testing will be installed along with PaddleAudio.
|
||||||
|
|
||||||
|
Now run
|
||||||
|
```
|
||||||
|
pytest test
|
||||||
|
```
|
||||||
|
|
||||||
|
If it goes well, you will see outputs like these:
|
||||||
|
```
|
||||||
|
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
|
||||||
|
rootdir: ./models/PaddleAudio
|
||||||
|
plugins: hydra-core-1.0.6
|
||||||
|
collected 16 items
|
||||||
|
|
||||||
|
test/unit_test/test_backend.py ........... [ 68%]
|
||||||
|
test/unit_test/test_features.py ..... [100%]
|
||||||
|
|
||||||
|
==================================================== warnings summary ====================================================
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
-- Docs: https://docs.pytest.org/en/stable/warnings.html
|
||||||
|
============================================ 16 passed, 11 warnings in 6.76s =============================================
|
||||||
|
```
|
@ -0,0 +1,114 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import paddleaudio
|
||||||
|
|
||||||
|
TEST_FILE = './test/data/test_audio.wav'
|
||||||
|
|
||||||
|
|
||||||
|
def relative_err(a, b, real=True):
    """Compute the relative squared error between two arrays.

    Real inputs are compared directly; complex inputs are compared on
    their real and imaginary parts separately and the errors summed.
    Uses the module-level EPS to keep the denominator non-zero.
    """
    if real:
        num = np.sum((a - b)**2)
        den = EPS + np.sum(a**2) + np.sum(b**2)
        return num / den
    err_real = np.sum((a.real - b.real)**2) / (
        EPS + np.sum(a.real**2) + np.sum(b.real**2))
    err_imag = np.sum((a.imag - b.imag)**2) / (
        EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
    return err_real + err_imag
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    """Load the shared fixture waveform at 16 kHz with librosa."""
    x, r = librosa.load(TEST_FILE, sr=16000)
    # Log basic statistics so reference values are visible in test output.
    print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
    return x, r
|
||||||
|
|
||||||
|
|
||||||
|
# start testing
# Module-level fixture: every test below reuses this waveform and rate.
x, r = load_audio()
EPS = 1e-8  # tolerance for relative-error comparisons
|
||||||
|
|
||||||
|
|
||||||
|
def test_load():
    """paddleaudio.load honors the requested rate, dtype, and duration."""
    samples, rate = paddleaudio.load(TEST_FILE, sr=16000)
    assert rate == 16000
    assert samples.dtype == 'float32'

    # Slicing: a 2-second window starting at 1s, decoded as int16.
    samples, rate = paddleaudio.load(
        TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
    assert len(samples) / rate == 2.0
    assert rate == 16000
    assert samples.dtype == 'int16'
|
||||||
|
|
||||||
|
|
||||||
|
def test_depth_convert():
    """depth_convert preserves length and clamps to the target dtype range."""
    for dtype, lo, hi in [('int16', -32768, 32767), ('int8', -128, 127)]:
        y = paddleaudio.depth_convert(x, dtype)
        assert len(y) == len(x)
        assert y.dtype == dtype
        assert np.max(y) <= hi
        assert np.min(y) >= lo
        # A non-degenerate signal should survive the conversion.
        assert np.std(y) > EPS
|
||||||
|
|
||||||
|
|
||||||
|
# test case for resample
# Each entry is (target_sample_rate, mode); the source fixture is 16 kHz,
# so these cover down-, up-, and identity resampling. The mode strings
# presumably name resampy filters — confirm against paddleaudio.resample.
rs_test_data = [
    (32000, 'kaiser_fast'),
    (16000, 'kaiser_fast'),
    (8000, 'kaiser_fast'),
    (32000, 'kaiser_best'),
    (16000, 'kaiser_best'),
    (8000, 'kaiser_best'),
    (22050, 'kaiser_best'),
    (44100, 'kaiser_best'),
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
    """Resampled length must scale with the rate ratio."""
    resampled = paddleaudio.resample(x, 16000, sr, mode=mode)
    expected_len = len(x) * (sr / 16000)
    err = relative_err(len(resampled), expected_len)
    print('err:', err)
    assert err < EPS
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize():
    """normalize scales the peak (linear) or the std (gaussian) as requested."""
    out = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
    assert np.max(out) < 0.5 + EPS

    out = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
    assert np.max(out) <= 2.0 + EPS

    out = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
    print('np.std(y):', np.std(out))
    assert np.abs(np.std(out) - 1.0) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly, outside of pytest.
if __name__ == '__main__':
    test_load()
    test_depth_convert()
    test_resample(22050, 'kaiser_fast')
    test_normalize()
|
@ -0,0 +1,144 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import paddleaudio as pa
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    """Load the fixture waveform with librosa's default sample rate."""
    waveform, sample_rate = librosa.load('./test/data/test_audio.wav')
    return waveform, sample_rate
|
||||||
|
|
||||||
|
|
||||||
|
## start testing
# Module-level fixture shared by every feature test below.
x, r = load_audio()
EPS = 1e-8  # tolerance for relative-error comparisons
|
||||||
|
|
||||||
|
|
||||||
|
def relative_err(a, b, real=True):
    """Relative squared error between two arrays.

    Complex inputs are compared per real/imaginary part and the two
    errors summed; EPS keeps the denominators non-zero.
    """
    if not real:
        total = 0.0
        for u, v in ((a.real, b.real), (a.imag, b.imag)):
            total += np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))
        return total
    return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
    """PaddleAudio melspectrogram matches librosa's reference output."""
    ours = pa.melspectrogram(
        x, window_size=512, sr=16000, hop_length=320, n_mels=64, fmin=50,
        to_db=False)
    reference = librosa.feature.melspectrogram(
        x, sr=16000, n_fft=512, win_length=512, hop_length=320, n_mels=64,
        fmin=50)
    assert relative_err(ours, reference) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
    """melspectrogram(to_db=True) matches librosa mel + pa.power_to_db."""
    ours = pa.melspectrogram(
        x, window_size=512, sr=16000, hop_length=320, n_mels=64, fmin=50,
        to_db=True, ref=1.0, amin=1e-10, top_db=None)
    reference = librosa.feature.melspectrogram(
        x, sr=16000, n_fft=512, win_length=512, hop_length=320, n_mels=64,
        fmin=50)
    reference = pa.power_to_db(reference, ref=1.0, amin=1e-10, top_db=None)
    assert relative_err(ours, reference) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
    """Complex STFT output agrees with librosa in shape and values."""
    stft_kwargs = dict(n_fft=1024, hop_length=320, win_length=512)
    ours = pa.stft(x, **stft_kwargs)
    reference = librosa.stft(x, **stft_kwargs)
    assert ours.shape == reference.shape
    assert relative_err(ours, reference, real=False) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
    """split_frames matches librosa.util.frame on the fixture signal."""
    reference = librosa.util.frame(x, frame_length=512, hop_length=320)
    ours = pa.split_frames(x, frame_length=512, hop_length=320)
    assert relative_err(reference, ours) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
    """PaddleAudio MFCC matches librosa MFCC built from a mel spectrogram."""
    mel_kwargs = {
        'window_size': 512,
        'hop_length': 320,
        'n_mels': 64,
        'fmin': 50,
        'to_db': False
    }
    ours = pa.mfcc(
        x,
        spect=None,
        n_mfcc=20,
        dct_type=2,
        norm='ortho',
        lifter=0,
        **mel_kwargs)
    mel_ref = librosa.feature.melspectrogram(
        x, sr=16000, n_fft=512, win_length=512, hop_length=320, n_mels=64,
        fmin=50)
    reference = librosa.feature.mfcc(
        x, sr=16000, S=mel_ref, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
    assert relative_err(ours, reference) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly, outside of pytest.
if __name__ == '__main__':
    test_melspectrogram()
    test_melspectrogram_db()
    test_stft()
    test_split_frames()
    test_mfcc()
|
Loading…
Reference in new issue