diff --git a/audio/.gitignore b/audio/.gitignore deleted file mode 100644 index e649619e..00000000 --- a/audio/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -.ipynb_checkpoints/** -*.ipynb -nohup.out -__pycache__/ -*.wav -*.m4a -obsolete/** diff --git a/audio/.pre-commit-config.yaml b/audio/.pre-commit-config.yaml deleted file mode 100644 index 4100f348..00000000 --- a/audio/.pre-commit-config.yaml +++ /dev/null @@ -1,45 +0,0 @@ -repos: -- repo: local - hooks: - - id: yapf - name: yapf - entry: yapf - language: system - args: [-i, --style .style.yapf] - files: \.py$ - -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: a11d9314b22d8f8c7556443875b731ef05965464 - hooks: - - id: check-merge-conflict - - id: check-symlinks - - id: end-of-file-fixer - - id: trailing-whitespace - - id: detect-private-key - - id: check-symlinks - - id: check-added-large-files - -- repo: https://github.com/pycqa/isort - rev: 5.8.0 - hooks: - - id: isort - name: isort (python) - - id: isort - name: isort (cython) - types: [cython] - - id: isort - name: isort (pyi) - types: [pyi] - -- repo: local - hooks: - - id: flake8 - name: flake8 - entry: flake8 - language: system - args: - - --count - - --select=E9,F63,F7,F82 - - --show-source - - --statistics - files: \.py$ diff --git a/audio/.style.yapf b/audio/.style.yapf deleted file mode 100644 index 4741fb4f..00000000 --- a/audio/.style.yapf +++ /dev/null @@ -1,3 +0,0 @@ -[style] -based_on_style = pep8 -column_limit = 80 diff --git a/audio/LICENSE b/audio/LICENSE deleted file mode 100644 index 261eeb9e..00000000 --- a/audio/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/audio/README.md b/audio/README.md deleted file mode 100644 index 9607fd86..00000000 --- a/audio/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# PaddleAudio: The audio library for PaddlePaddle - -## Introduction -PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms,state-of-the-art pre-trained models in sound tagging/classification and anomaly sound detection. More models and features are on the roadmap. - - - -## Features -- Spectrogram and related features are compatible with librosa. -- State-of-the-art models in sound tagging on Audioset, sound classification on esc50, and more to come. -- Ready-to-use audio embedding with a line of code, includes sound embedding and more on the roadmap. -- Data loading supports for common open source audio in multiple languages including English, Mandarin and so on. 
-
-
-## Install
-```
-git clone https://github.com/PaddlePaddle/models
-cd models/PaddleAudio
-pip install .
-
-```
-
-## Quick start
-### Audio loading and feature extraction
-```
-import paddleaudio as pa
-s, r = pa.load(f)  # f: path to an audio file
-mel_spect = pa.melspectrogram(s, sr=r)
-```
-
-### Examples
-We provide a set of examples to help you get started with PaddleAudio quickly.
-- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
-- [Environmental sound classification on the ESC-50 dataset](./examples/sound_classification)
-- [Training an audio-tagging network on Audioset](./examples/audioset_training)
-
-Please refer to the [example directory](./examples) for more details.
diff --git a/audio/examples/panns/README.md b/audio/examples/panns/README.md
deleted file mode 100644
index 243ebf8e..00000000
--- a/audio/examples/panns/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# Audio Tagging
-
-Sound classification is a single-label classification task, but a piece of audio can carry multiple labels. For example, a recording made in an ordinary indoor office may contain people speaking, keyboard typing, mouse clicks, and other background sounds of the room. For general-purpose sound recognition and sound detection, predicting multiple labels for one piece of audio is therefore highly practical.
-
-At IEEE ICASSP 2017, Google released [Audioset](https://research.google.com/audioset/), a large-scale audio dataset. It covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips taken from YouTube videos. The dataset currently contains 2.1 million annotated videos and 5,800 hours of audio, and the labeled sound samples use 527 label classes.
-
-`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Because their pre-training task is multi-label sound recognition, they can be used for real-time audio tagging.
-
-This example uses the `PANNs` pre-trained models to tag the input audio in real time against the Audioset label set, and outputs the top-k classes and their scores at each time step as text.
-
-
-## Models
-
-PaddleAudio provides the pre-trained CNN14, CNN10 and CNN6 models of PANNs for users to choose from:
-- CNN14: 12 convolutional layers and 2 fully connected layers, 79.6M parameters, embedding dimension 2048.
-- CNN10: 8 convolutional layers and 2 fully connected layers, 4.9M parameters, embedding dimension 512.
-- CNN6: 4 convolutional layers and 2 fully connected layers, 4.5M parameters, embedding dimension 512.
-
-
-## Quick start
-
-### Model inference
-
-```shell
-export CUDA_VISIBLE_DEVICES=0
-python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `device`: Device used for inference, either cpu or gpu; defaults to gpu. When using a GPU, select the card with `CUDA_VISIBLE_DEVICES`.
-- `wav`: Audio file to run prediction on.
-- `sample_duration`: Duration (in seconds) of each audio sample the model predicts on; defaults to 2s.
-- `hop_duration`: Time interval (in seconds) between two consecutive samples; defaults to 0.3s.
-- `output_dir`: Directory in which prediction results are saved; defaults to `./output_dir`.
-
-The example code uses the `CNN14` pre-trained model. To switch to another pre-trained model, change the model constructor as follows:
-```python
-from paddleaudio.models.panns import cnn14, cnn10, cnn6

-# CNN14
-model = cnn14(pretrained=True, extract_embedding=False)
-# CNN10
-model = cnn10(pretrained=True, extract_embedding=False)
-# CNN6
-model = cnn6(pretrained=True, extract_embedding=False)
-```
-
-Output:
-```
-[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
-```
-
-After running, the scores are saved in a `.npz` file under `output_dir`.
-
-
-### Generating tagging label text
-```shell
-python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `tagging_file`: File containing the model's prediction results.
-- `top_k`: Keep the top_k labels with the highest scores; defaults to 10.
-- `smooth`: Posterior-probability smoothing of the predictions; defaults to True, meaning smoothing is applied.
-- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
-- `label_file`: Text file with the Audioset class names corresponding to the predictions.
-- `output_dir`: Directory in which the label text is saved; defaults to `./output_dir`.
-
-Output:
-```
-[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
-[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
-```
-
-After running, the text results are saved in a `.txt` file under `output_dir`.
-
-
-## Tagging label text
-
-The final text output looks like the example below.
-The top-k results for each time window of the sample are separated by blank lines. In each block, the first line carries the time information: the number is the start time of the tagging result, given as the ratio of the current time `t` to the total audio length `T`; the following k lines are the corresponding labels and scores.
-
-```
-0.0
-Cat: 0.9144676923751831
-Animal: 0.8855036497116089
-Domestic animals, pets: 0.804577112197876
-Meow: 0.7422927021980286
-Music: 0.19959309697151184
-Inside, small room: 0.12550437450408936
-Caterwaul: 0.021584441885352135
-Purr: 0.020247288048267365
-Speech: 0.018197158351540565
-Vehicle: 0.007446660194545984
-
-0.059197544398158296
-Cat: 0.9250872135162354
-Animal: 0.8957151174545288
-Domestic animals, pets: 0.8228275775909424
-Meow: 0.7650775909423828
-Music: 0.20210561156272888
-Inside, small room: 0.12290887534618378
-Caterwaul: 0.029371455311775208
-Purr: 0.018731823191046715
-Speech: 0.017130598425865173
-Vehicle: 0.007748497650027275
-
-0.11839508879631659
-Cat: 0.9336574673652649
-Animal: 0.9111202359199524
-Domestic animals, pets: 0.8349071145057678
-Meow: 0.7761964797973633
-Music: 0.20467285811901093
-Inside, small room: 0.10709915310144424
-Caterwaul: 0.05370649695396423
-Purr: 0.018830426037311554
-Speech: 0.017361722886562347
-Vehicle: 0.006929398979991674
-
-...
-...
-```
-
-The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label predictions made on the audio in real time.
-
-![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
diff --git a/audio/examples/panns/assets/audioset_labels.txt b/audio/examples/panns/assets/audioset_labels.txt
deleted file mode 100644
index 6fccf56a..00000000
--- a/audio/examples/panns/assets/audioset_labels.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-Speech
-Male speech, man speaking
-Female speech, woman speaking
-Child speech, kid speaking
-Conversation
-Narration, monologue
-Babbling
-Speech synthesizer
-Shout
-Bellow
-Whoop
-Yell
-Battle cry
-Children shouting
-Screaming
-Whispering
-Laughter
-Baby laughter
-Giggle
-Snicker
-Belly laugh
-Chuckle, chortle
-Crying, sobbing
-Baby cry, infant cry
-Whimper
-Wail, moan
-Sigh
-Singing
-Choir
-Yodeling
-Chant
-Mantra
-Male singing
-Female singing
-Child singing
-Synthetic singing
-Rapping
-Humming
-Groan
-Grunt
-Whistling
-Breathing
-Wheeze
-Snoring
-Gasp
-Pant
-Snort
-Cough
-Throat clearing
-Sneeze
-Sniff
-Run
-Shuffle
-Walk, footsteps
-Chewing, mastication
-Biting
-Gargling
-Stomach rumble
-Burping, eructation
-Hiccup
-Fart
-Hands
-Finger snapping
-Clapping
-Heart sounds, heartbeat
-Heart murmur
-Cheering
-Applause
-Chatter
-Crowd
-Hubbub, speech noise, speech babble
-Children playing
-Animal
-Domestic animals, pets
-Dog
-Bark
-Yip
-Howl
-Bow-wow
-Growling
-Whimper (dog)
-Cat
-Purr
-Meow
-Hiss
-Caterwaul
-Livestock, farm animals, working animals
-Horse
-Clip-clop
-Neigh, whinny
-Cattle, bovinae
-Moo
-Cowbell
-Pig
-Oink
-Goat
-Bleat
-Sheep
-Fowl
-Chicken, rooster
-Cluck
-Crowing, cock-a-doodle-doo
-Turkey
-Gobble
-Duck
-Quack
-Goose
-Honk
-Wild animals
-Roaring cats (lions, tigers)
-Roar
-Bird
-Bird vocalization, bird call, bird song
-Chirp, tweet
-Squawk
-Pigeon, dove
-Coo
-Crow
-Caw
-Owl
-Hoot
-Bird flight, flapping wings
-Canidae, dogs, wolves
-Rodents, rats, mice
-Mouse
-Patter
-Insect
-Cricket
-Mosquito
-Fly, housefly
-Buzz
-Bee, wasp, etc.
-Frog -Croak -Snake -Rattle -Whale vocalization -Music -Musical instrument -Plucked string instrument -Guitar -Electric guitar -Bass guitar -Acoustic guitar -Steel guitar, slide guitar -Tapping (guitar technique) -Strum -Banjo -Sitar -Mandolin -Zither -Ukulele -Keyboard (musical) -Piano -Electric piano -Organ -Electronic organ -Hammond organ -Synthesizer -Sampler -Harpsichord -Percussion -Drum kit -Drum machine -Drum -Snare drum -Rimshot -Drum roll -Bass drum -Timpani -Tabla -Cymbal -Hi-hat -Wood block -Tambourine -Rattle (instrument) -Maraca -Gong -Tubular bells -Mallet percussion -Marimba, xylophone -Glockenspiel -Vibraphone -Steelpan -Orchestra -Brass instrument -French horn -Trumpet -Trombone -Bowed string instrument -String section -Violin, fiddle -Pizzicato -Cello -Double bass -Wind instrument, woodwind instrument -Flute -Saxophone -Clarinet -Harp -Bell -Church bell -Jingle bell -Bicycle bell -Tuning fork -Chime -Wind chime -Change ringing (campanology) -Harmonica -Accordion -Bagpipes -Didgeridoo -Shofar -Theremin -Singing bowl -Scratching (performance technique) -Pop music -Hip hop music -Beatboxing -Rock music -Heavy metal -Punk rock -Grunge -Progressive rock -Rock and roll -Psychedelic rock -Rhythm and blues -Soul music -Reggae -Country -Swing music -Bluegrass -Funk -Folk music -Middle Eastern music -Jazz -Disco -Classical music -Opera -Electronic music -House music -Techno -Dubstep -Drum and bass -Electronica -Electronic dance music -Ambient music -Trance music -Music of Latin America -Salsa music -Flamenco -Blues -Music for children -New-age music -Vocal music -A capella -Music of Africa -Afrobeat -Christian music -Gospel music -Music of Asia -Carnatic music -Music of Bollywood -Ska -Traditional music -Independent music -Song -Background music -Theme music -Jingle (music) -Soundtrack music -Lullaby -Video game music -Christmas music -Dance music -Wedding music -Happy music -Funny music -Sad music -Tender music -Exciting music -Angry music -Scary music -Wind -Rustling leaves -Wind noise (microphone) -Thunderstorm -Thunder -Water -Rain -Raindrop -Rain on surface -Stream -Waterfall -Ocean -Waves, surf -Steam -Gurgling -Fire -Crackle -Vehicle -Boat, Water vehicle -Sailboat, sailing ship -Rowboat, canoe, kayak -Motorboat, speedboat -Ship -Motor vehicle (road) -Car -Vehicle horn, car horn, honking -Toot -Car alarm -Power windows, electric windows -Skidding -Tire squeal -Car passing by -Race car, auto racing -Truck -Air brake -Air horn, truck horn -Reversing beeps -Ice cream truck, ice cream van -Bus -Emergency vehicle -Police car (siren) -Ambulance (siren) -Fire engine, fire truck (siren) -Motorcycle -Traffic noise, roadway noise -Rail transport -Train -Train whistle -Train horn -Railroad car, train wagon -Train wheels squealing -Subway, metro, underground -Aircraft -Aircraft engine -Jet engine -Propeller, airscrew -Helicopter -Fixed-wing aircraft, airplane -Bicycle -Skateboard -Engine -Light engine (high frequency) -Dental drill, dentist's drill -Lawn mower -Chainsaw -Medium engine (mid frequency) -Heavy engine (low frequency) -Engine knocking -Engine starting -Idling -Accelerating, revving, vroom -Door -Doorbell -Ding-dong -Sliding door -Slam -Knock -Tap -Squeak -Cupboard open or close -Drawer open or close -Dishes, pots, and pans -Cutlery, silverware -Chopping (food) -Frying (food) -Microwave oven -Blender -Water tap, faucet -Sink (filling or washing) -Bathtub (filling or washing) -Hair dryer -Toilet flush -Toothbrush -Electric toothbrush -Vacuum cleaner -Zipper (clothing) -Keys 
jangling -Coin (dropping) -Scissors -Electric shaver, electric razor -Shuffling cards -Typing -Typewriter -Computer keyboard -Writing -Alarm -Telephone -Telephone bell ringing -Ringtone -Telephone dialing, DTMF -Dial tone -Busy signal -Alarm clock -Siren -Civil defense siren -Buzzer -Smoke detector, smoke alarm -Fire alarm -Foghorn -Whistle -Steam whistle -Mechanisms -Ratchet, pawl -Clock -Tick -Tick-tock -Gears -Pulleys -Sewing machine -Mechanical fan -Air conditioning -Cash register -Printer -Camera -Single-lens reflex camera -Tools -Hammer -Jackhammer -Sawing -Filing (rasp) -Sanding -Power tool -Drill -Explosion -Gunshot, gunfire -Machine gun -Fusillade -Artillery fire -Cap gun -Fireworks -Firecracker -Burst, pop -Eruption -Boom -Wood -Chop -Splinter -Crack -Glass -Chink, clink -Shatter -Liquid -Splash, splatter -Slosh -Squish -Drip -Pour -Trickle, dribble -Gush -Fill (with liquid) -Spray -Pump (liquid) -Stir -Boiling -Sonar -Arrow -Whoosh, swoosh, swish -Thump, thud -Thunk -Electronic tuner -Effects unit -Chorus effect -Basketball bounce -Bang -Slap, smack -Whack, thwack -Smash, crash -Breaking -Bouncing -Whip -Flap -Scratch -Scrape -Rub -Roll -Crushing -Crumpling, crinkling -Tearing -Beep, bleep -Ping -Ding -Clang -Squeal -Creak -Rustle -Whir -Clatter -Sizzle -Clicking -Clickety-clack -Rumble -Plop -Jingle, tinkle -Hum -Zing -Boing -Crunch -Silence -Sine wave -Harmonic -Chirp tone -Sound effect -Pulse -Inside, small room -Inside, large room or hall -Inside, public space -Outside, urban or manmade -Outside, rural or natural -Reverberation -Echo -Noise -Environmental noise -Static -Mains hum -Distortion -Sidetone -Cacophony -White noise -Pink noise -Throbbing -Vibration -Television -Radio -Field recording diff --git a/audio/examples/panns/audio_tag.py b/audio/examples/panns/audio_tag.py deleted file mode 100644 index 6f08cd1c..00000000 --- a/audio/examples/panns/audio_tag.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import os -from typing import List - -import numpy as np -import paddle -from paddleaudio.backends import load as load_audio -from paddleaudio.features import melspectrogram -from paddleaudio.models.panns import cnn14 -from paddleaudio.utils import logger - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.') -parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.') -parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.') -parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.') -parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.') -args = parser.parse_args() -# yapf: enable - - -def split(waveform: np.ndarray, win_size: int, hop_size: int): - """ - Split into N waveforms. - N is decided by win_size and hop_size. - """ - assert isinstance(waveform, np.ndarray) - time = [] - data = [] - for i in range(0, len(waveform), hop_size): - segment = waveform[i:i + win_size] - if len(segment) < win_size: - segment = np.pad(segment, (0, win_size - len(segment))) - data.append(segment) - time.append(i / len(waveform)) - return time, data - - -def batchify(data: List[List[float]], - sample_rate: int, - batch_size: int, - **kwargs): - """ - Extract features from waveforms and create batches. - """ - examples = [] - for waveform in data: - feats = melspectrogram(waveform, sample_rate, **kwargs).transpose() - examples.append(feats) - - # Seperates data into some batches. - one_batch = [] - for example in examples: - one_batch.append(example) - if len(one_batch) == batch_size: - yield one_batch - one_batch = [] - if one_batch: - yield one_batch - - -def predict(model, data: List[List[float]], sample_rate: int, - batch_size: int=1): - """ - Use pretrained model to make predictions. - """ - batches = batchify(data, sample_rate, batch_size) - results = None - model.eval() - for batch in batches: - feats = paddle.to_tensor(batch).unsqueeze(1) \ - # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins) - - audioset_scores = model(feats) - if results is None: - results = audioset_scores.numpy() - else: - results = np.concatenate((results, audioset_scores.numpy())) - - return results - - -if __name__ == '__main__': - paddle.set_device(args.device) - model = cnn14(pretrained=True, extract_embedding=False) - waveform, sr = load_audio(args.wav, sr=None) - time, data = split(waveform, - int(args.sample_duration * sr), - int(args.hop_duration * sr)) - results = predict(model, data, sr, batch_size=8) - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform)) - output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz') - np.savez(output_file, time=time, scores=results) - logger.info(f'Saved tagging results to {output_file}') diff --git a/audio/examples/panns/parse_result.py b/audio/examples/panns/parse_result.py deleted file mode 100644 index 056c573f..00000000 --- a/audio/examples/panns/parse_result.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import ast -import os -from typing import Dict - -import numpy as np -from paddleaudio.utils import logger - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--tagging_file', type=str, required=True, help='') -parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.') -parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.') -parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.') -parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.') -parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.') -args = parser.parse_args() -# yapf: enable - - -def smooth(results: np.ndarray, win_size: int): - """ - Execute posterior smoothing in-place. - """ - for i in range(len(results) - 1, -1, -1): - if i < win_size - 1: - left = 0 - else: - left = i + 1 - win_size - results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1) - - -def generate_topk_label(k: int, label_map: Dict, result: np.ndarray): - """ - Return top k result. - """ - result = np.asarray(result) - topk_idx = (-result).argsort()[:k] - - ret = '' - for idx in topk_idx: - label, score = label_map[idx], result[idx] - ret += f'{label}: {score}\n' - return ret - - -if __name__ == "__main__": - label_map = {} - with open(args.label_file, 'r') as f: - for i, l in enumerate(f.readlines()): - label_map[i] = l.strip() - - results = np.load(args.tagging_file, allow_pickle=True) - times, scores = results['time'], results['scores'] - - if args.smooth: - logger.info('Posterior smoothing...') - smooth(scores, win_size=args.smooth_size) - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - output_file = os.path.join( - args.output_dir, - os.path.basename(args.tagging_file).split('.')[0] + '.txt') - with open(output_file, 'w') as f: - for time, score in zip(times, scores): - f.write(f'{time}\n') - f.write(generate_topk_label(args.top_k, label_map, score) + '\n') - - logger.info(f'Saved tagging labels to {output_file}') diff --git a/audio/paddleaudio/datasets/aishell.py b/audio/paddleaudio/datasets/aishell.py deleted file mode 100644 index d84d9876..00000000 --- a/audio/paddleaudio/datasets/aishell.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import codecs -import collections -import json -import os -from typing import Dict - -from paddle.io import Dataset -from tqdm import tqdm - -from ..backends import load as load_audio -from ..utils.download import decompress -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger -from .dataset import feat_funcs - -__all__ = ['AISHELL1'] - - -class AISHELL1(Dataset): - """ - This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. - It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including - smart home, autonomous driving, and industrial production. The whole recording was - put in quiet indoor environment, using 3 different devices at the same time: high - fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), - iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled - to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas - in China were invited to participate in the recording. The manual transcription - accuracy rate is above 95%, through professional speech annotation and strict - quality inspection. The corpus is divided into training, development and testing - sets. - - Reference: - AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline - https://arxiv.org/abs/1709.05522 - """ - - archieves = [ - { - 'url': 'http://www.openslr.org/resources/33/data_aishell.tgz', - 'md5': '2f494334227864a8a8fec932999db9d8', - }, - ] - text_meta = os.path.join('data_aishell', 'transcript', - 'aishell_transcript_v0.8.txt') - utt_info = collections.namedtuple('META_INFO', - ('file_path', 'utt_id', 'text')) - audio_path = os.path.join('data_aishell', 'wav') - manifest_path = os.path.join('data_aishell', 'manifest') - subset = ['train', 'dev', 'test'] - - def __init__(self, subset: str='train', feat_type: str='raw', **kwargs): - assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format( - self.subset, subset) - self.subset = subset - self.feat_type = feat_type - self.feat_config = kwargs - self._data = self._get_data() - super(AISHELL1, self).__init__() - - def _get_text_info(self) -> Dict[str, str]: - ret = {} - with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf: - for line in rf.readlines()[1:]: - utt_id, text = map(str.strip, line.split(' ', - 1)) # utt_id, text - ret.update({utt_id: ''.join(text.split())}) - return ret - - def _get_data(self): - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)): - download_and_decompress(self.archieves, DATA_HOME) - # Extract *wav from *.tar.gz. 
- for root, _, files in os.walk( - os.path.join(DATA_HOME, self.audio_path)): - for file in files: - if file.endswith('.tar.gz'): - decompress(os.path.join(root, file)) - os.remove(os.path.join(root, file)) - - text_info = self._get_text_info() - - data = [] - for root, _, files in os.walk( - os.path.join(DATA_HOME, self.audio_path, self.subset)): - for file in files: - if file.endswith('.wav'): - utt_id = os.path.splitext(file)[0] - if utt_id not in text_info: # There are some utt_id that without label - continue - text = text_info[utt_id] - file_path = os.path.join(root, file) - data.append(self.utt_info(file_path, utt_id, text)) - - return data - - def _convert_to_record(self, idx: int): - sample = self._data[idx] - - record = {} - # To show all fields in a namedtuple: `type(sample)._fields` - for field in type(sample)._fields: - record[field] = getattr(sample, field) - - waveform, sr = load_audio( - sample[0]) # The first element of sample is file path - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sample_rate=sr, - **self.feat_config) if feat_func else waveform - record.update({'feat': feat, 'duration': len(waveform) / sr}) - return record - - def create_manifest(self, prefix='manifest'): - if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): - os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - - manifest_file = os.path.join(DATA_HOME, self.manifest_path, - f'{prefix}.{self.subset}') - with codecs.open(manifest_file, 'w', 'utf-8') as f: - for idx in tqdm(range(len(self))): - record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'] - }, - ensure_ascii=False) - f.write(record_line + '\n') - logger.info(f'Manifest file {manifest_file} created.') - - def __getitem__(self, idx): - record = self._convert_to_record(idx) - return tuple(record.values()) - - def __len__(self): - return len(self._data) diff --git a/audio/paddleaudio/datasets/dcase.py b/audio/paddleaudio/datasets/dcase.py deleted file mode 100644 index 47b0c915..00000000 --- a/audio/paddleaudio/datasets/dcase.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes'] - - -class UrbanAcousticScenes(AudioClassificationDataset): - """ - TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from - 12 European cities in 10 different acoustic scenes using 4 different devices. - Additionally, synthetic data for 11 mobile devices was created based on the original - recordings. Of the 12 cities, two are present only in the evaluation set. 
- - Reference: - A multi-device dataset for urban acoustic scene classification - https://arxiv.org/abs/1807.09840 - """ - - source_url = 'https://zenodo.org/record/3819968/files/' - base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development' - archieves = [ - { - 'url': source_url + base_name + '.meta.zip', - 'md5': '6eae9db553ce48e4ea246e34e50a3cf5', - }, - { - 'url': source_url + base_name + '.audio.1.zip', - 'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8', - }, - { - 'url': source_url + base_name + '.audio.2.zip', - 'md5': '4310a13cc2943d6ce3f70eba7ba4c784', - }, - { - 'url': source_url + base_name + '.audio.3.zip', - 'md5': 'ed38956c4246abb56190c1e9b602b7b8', - }, - { - 'url': source_url + base_name + '.audio.4.zip', - 'md5': '97ab8560056b6816808dedc044dcc023', - }, - { - 'url': source_url + base_name + '.audio.5.zip', - 'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb', - }, - { - 'url': source_url + base_name + '.audio.6.zip', - 'md5': 'fbf856a3a86fff7520549c899dc94372', - }, - { - 'url': source_url + base_name + '.audio.7.zip', - 'md5': '0dbffe7b6e45564da649378723284062', - }, - { - 'url': source_url + base_name + '.audio.8.zip', - 'md5': 'bb6f77832bf0bd9f786f965beb251b2e', - }, - { - 'url': source_url + base_name + '.audio.9.zip', - 'md5': 'a65596a5372eab10c78e08a0de797c9e', - }, - { - 'url': source_url + base_name + '.audio.10.zip', - 'md5': '2ad595819ffa1d56d2de4c7ed43205a6', - }, - { - 'url': source_url + base_name + '.audio.11.zip', - 'md5': '0ad29f7040a4e6a22cfd639b3a6738e5', - }, - { - 'url': source_url + base_name + '.audio.12.zip', - 'md5': 'e5f4400c6b9697295fab4cf507155a2f', - }, - { - 'url': source_url + base_name + '.audio.13.zip', - 'md5': '8855ab9f9896422746ab4c5d89d8da2f', - }, - { - 'url': source_url + base_name + '.audio.14.zip', - 'md5': '092ad744452cd3e7de78f988a3d13020', - }, - { - 'url': source_url + base_name + '.audio.15.zip', - 'md5': '4b5eb85f6592aebf846088d9df76b420', - }, - { - 'url': source_url + base_name + '.audio.16.zip', - 'md5': '2e0a89723e58a3836be019e6996ae460', - }, - ] - label_list = [ - 'airport', 'shopping_mall', 'metro_station', 'street_pedestrian', - 'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park' - ] - - meta = os.path.join(base_name, 'meta.csv') - meta_info = collections.namedtuple('META_INFO', ( - 'filename', 'scene_label', 'identifier', 'source_label')) - subset_meta = { - 'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'), - 'dev': - os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'), - 'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'), - } - subset_meta_info = collections.namedtuple('SUBSET_META_INFO', - ('filename', 'scene_label')) - audio_path = os.path.join(base_name, 'audio') - - def __init__(self, mode: str='train', feat_type: str='raw', **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - files, labels = self._get_data(mode) - super(UrbanAcousticScenes, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, subset: str=None, - skip_header: bool=True) -> List[collections.namedtuple]: - if subset is None: - meta_file = self.meta - meta_info = self.meta_info - else: - assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.' 
- meta_file = self.subset_meta[subset] - meta_info = self.subset_meta_info - - ret = [] - with open(os.path.join(DATA_HOME, meta_file), 'r') as rf: - lines = rf.readlines()[1:] if skip_header else rf.readlines() - for line in lines: - ret.append(meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, DATA_HOME) - - meta_info = self._get_meta_info(subset=mode, skip_header=True) - - files = [] - labels = [] - for sample in meta_info: - filename, label = sample[:2] - filename = os.path.basename(filename) - target = self.label_list.index(label) - - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels - - -class UrbanAudioVisualScenes(AudioClassificationDataset): - """ - TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio - and video recordings from 12 European cities in 10 different scenes. - This dataset consists of 10-seconds audio and video segments from 10 - acoustic scenes. The total amount of audio in the development set is 34 hours. - - Reference: - A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis - https://arxiv.org/abs/2011.00030 - """ - - source_url = 'https://zenodo.org/record/4477542/files/' - base_name = 'TAU-urban-audio-visual-scenes-2021-development' - - archieves = [ - { - 'url': source_url + base_name + '.meta.zip', - 'md5': '76e3d7ed5291b118372e06379cb2b490', - }, - { - 'url': source_url + base_name + '.audio.1.zip', - 'md5': '186f6273f8f69ed9dbdc18ad65ac234f', - }, - { - 'url': source_url + base_name + '.audio.2.zip', - 'md5': '7fd6bb63127f5785874a55aba4e77aa5', - }, - { - 'url': source_url + base_name + '.audio.3.zip', - 'md5': '61396bede29d7c8c89729a01a6f6b2e2', - }, - { - 'url': source_url + base_name + '.audio.4.zip', - 'md5': '6ddac89717fcf9c92c451868eed77fe1', - }, - { - 'url': source_url + base_name + '.audio.5.zip', - 'md5': 'af4820756cdf1a7d4bd6037dc034d384', - }, - { - 'url': source_url + base_name + '.audio.6.zip', - 'md5': 'ebd11ec24411f2a17a64723bd4aa7fff', - }, - { - 'url': source_url + base_name + '.audio.7.zip', - 'md5': '2be39a76aeed704d5929d020a2909efd', - }, - { - 'url': source_url + base_name + '.audio.8.zip', - 'md5': '972d8afe0874720fc2f28086e7cb22a9', - }, - ] - label_list = [ - 'airport', 'shopping_mall', 'metro_station', 'street_pedestrian', - 'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park' - ] - - meta_base_path = os.path.join(base_name, base_name + '.meta') - meta = os.path.join(meta_base_path, 'meta.csv') - meta_info = collections.namedtuple('META_INFO', ( - 'filename_audio', 'filename_video', 'scene_label', 'identifier')) - subset_meta = { - 'train': - os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'), - 'dev': - os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'), - 'test': - os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'), - } - subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ( - 'filename_audio', 'filename_video', 'scene_label')) - audio_path = os.path.join(base_name, 'audio') - - def __init__(self, mode: str='train', feat_type: str='raw', **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). 
- feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - files, labels = self._get_data(mode) - super(UrbanAudioVisualScenes, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, subset: str=None, - skip_header: bool=True) -> List[collections.namedtuple]: - if subset is None: - meta_file = self.meta - meta_info = self.meta_info - else: - assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.' - meta_file = self.subset_meta[subset] - meta_info = self.subset_meta_info - - ret = [] - with open(os.path.join(DATA_HOME, meta_file), 'r') as rf: - lines = rf.readlines()[1:] if skip_header else rf.readlines() - for line in lines: - ret.append(meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, - os.path.join(DATA_HOME, self.base_name)) - - meta_info = self._get_meta_info(subset=mode, skip_header=True) - - files = [] - labels = [] - for sample in meta_info: - filename, _, label = sample[:3] - filename = os.path.basename(filename) - target = self.label_list.index(label) - - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels diff --git a/audio/paddleaudio/datasets/librispeech.py b/audio/paddleaudio/datasets/librispeech.py deleted file mode 100644 index c3b3c83d..00000000 --- a/audio/paddleaudio/datasets/librispeech.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import collections -import json -import os -from typing import Dict - -from paddle.io import Dataset -from tqdm import tqdm - -from ..backends import load as load_audio -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger -from .dataset import feat_funcs - -__all__ = ['LIBRISPEECH'] - - -class LIBRISPEECH(Dataset): - """ - LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech, - prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is - derived from read audiobooks from the LibriVox project, and has been carefully - segmented and aligned. 
-
-    Reference:
-        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
-        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
-    """
-
-    source_url = 'http://www.openslr.org/resources/12/'
-    archieves = [
-        {
-            'url': source_url + 'train-clean-100.tar.gz',
-            'md5': '2a93770f6d5c6c964bc36631d331a522',
-        },
-        {
-            'url': source_url + 'train-clean-360.tar.gz',
-            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
-        },
-        {
-            'url': source_url + 'train-other-500.tar.gz',
-            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
-        },
-        {
-            'url': source_url + 'dev-clean.tar.gz',
-            'md5': '42e2234ba48799c1f50f24a7926300a1',
-        },
-        {
-            'url': source_url + 'dev-other.tar.gz',
-            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
-        },
-        {
-            'url': source_url + 'test-clean.tar.gz',
-            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
-        },
-        {
-            'url': source_url + 'test-other.tar.gz',
-            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
-        },
-    ]
-    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
-    utt_info = collections.namedtuple('META_INFO', (
-        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
-    audio_path = 'LibriSpeech'
-    manifest_path = os.path.join('LibriSpeech', 'manifest')
-    subset = [
-        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
-        'dev-other', 'test-clean', 'test-other'
-    ]
-
-    def __init__(self,
-                 subset: str='train-clean-100',
-                 feat_type: str='raw',
-                 **kwargs):
-        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
-            self.subset, subset)
-        self.subset = subset
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self._data = self._get_data()
-        super(LIBRISPEECH, self).__init__()
-
-    def _get_speaker_info(self) -> Dict[str, str]:
-        ret = {}
-        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
-            for line in rf.readlines():
-                if ';' in line:  # Skip dataset abstract
-                    continue
-                spk_id, gender = map(str.strip,
-                                     line.split('|')[:2])  # spk_id, gender
-                ret.update({spk_id: gender})
-        return ret
-
-    def _get_text_info(self, trans_file) -> Dict[str, str]:
-        ret = {}
-        with open(trans_file, 'r') as rf:
-            for line in rf.readlines():
-                utt_id, text = map(str.strip, line.split(' ',
-                                                          1))  # utt_id, text
-                ret.update({utt_id: text})
-        return ret
-
-    def _get_data(self):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
-            download_and_decompress(self.archieves, DATA_HOME,
-                                    len(self.archieves))
-
-        # Speaker info
-        speaker_info = self._get_speaker_info()
-
-        # Text info
-        text_info = {}
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.trans.txt'):
-                    text_info.update(
-                        self._get_text_info(os.path.join(root, file)))
-
-        data = []
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.flac'):
-                    utt_id = os.path.splitext(file)[0]
-                    spk_id = utt_id.split('-')[0]
-                    if utt_id not in text_info \
-                        or spk_id not in speaker_info:  # Skip samples with incomplete data
-                        continue
-                    file_path = os.path.join(root, file)
-                    text = text_info[utt_id]
-                    spk_gender = speaker_info[spk_id]
-                    data.append(
-                        self.utt_info(file_path, utt_id, text, spk_id,
-                                      spk_gender))
-
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] =
getattr(sample, field) - - waveform, sr = load_audio( - sample[0]) # The first element of sample is file path - feat_func = feat_funcs[self.feat_type] - feat = feat_func( - waveform, sample_rate=sr, - **self.feat_config) if feat_func else waveform - record.update({'feat': feat, 'duration': len(waveform) / sr}) - return record - - def create_manifest(self, prefix='manifest'): - if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)): - os.makedirs(os.path.join(DATA_HOME, self.manifest_path)) - - manifest_file = os.path.join(DATA_HOME, self.manifest_path, - f'{prefix}.{self.subset}') - with codecs.open(manifest_file, 'w', 'utf-8') as f: - for idx in tqdm(range(len(self))): - record = self._convert_to_record(idx) - record_line = json.dumps( - { - 'utt': record['utt_id'], - 'feat': record['file_path'], - 'feat_shape': (record['duration'], ), - 'text': record['text'], - 'spk': record['spk_id'], - 'gender': record['spk_gender'], - }, - ensure_ascii=False) - f.write(record_line + '\n') - logger.info(f'Manifest file {manifest_file} created.') - - def __getitem__(self, idx): - record = self._convert_to_record(idx) - return tuple(record.values()) - - def __len__(self): - return len(self._data) diff --git a/audio/paddleaudio/datasets/ravdess.py b/audio/paddleaudio/datasets/ravdess.py deleted file mode 100644 index d886aad2..00000000 --- a/audio/paddleaudio/datasets/ravdess.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -import random -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['RAVDESS'] - - -class RAVDESS(AudioClassificationDataset): - """ - The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two - lexically-matched statements in a neutral North American accent. Speech emotions - includes calm, happy, sad, angry, fearful, surprise, and disgust expressions. - Each expression is produced at two levels of emotional intensity (normal, strong), - with an additional neutral expression. 
- - Reference: - The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): - A dynamic, multimodal set of facial and vocal expressions in North American English - https://doi.org/10.1371/journal.pone.0196391 - """ - - archieves = [ - { - 'url': - 'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip', - 'md5': - '5411230427d67a21e18aa4d466e6d1b9', - }, - { - 'url': - 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip', - 'md5': - 'bc696df654c87fed845eb13823edef8a', - }, - ] - label_list = [ - 'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', - 'surprised' - ] - meta_info = collections.namedtuple( - 'META_INFO', ('modality', 'vocal_channel', 'emotion', - 'emotion_intensity', 'statement', 'repitition', 'actor')) - speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24') - song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24') - - def __init__(self, - mode='train', - seed=0, - n_folds=5, - split=1, - feat_type='raw', - **kwargs): - """ - Ags: - mode (:obj:`str`, `optional`, defaults to `train`): - It identifies the dataset mode (train or dev). - seed (:obj:`int`, `optional`, defaults to 0): - Set the random seed to shuffle samples. - n_folds (:obj:`int`, `optional`, defaults to 5): - Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset. - split (:obj:`int`, `optional`, defaults to 1): - It specify the fold of dev dataset. - feat_type (:obj:`str`, `optional`, defaults to `raw`): - It identifies the feature type that user wants to extrace of an audio file. - """ - assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}' - files, labels = self._get_data(mode, seed, n_folds, split) - super(RAVDESS, self).__init__( - files=files, labels=labels, feat_type=feat_type, **kwargs) - - def _get_meta_info(self, files) -> List[collections.namedtuple]: - ret = [] - for file in files: - basename_without_extend = os.path.basename(file)[:-4] - ret.append(self.meta_info(*basename_without_extend.split('-'))) - return ret - - def _get_data(self, mode, seed, n_folds, - split) -> Tuple[List[str], List[int]]: - if not os.path.isdir(self.speech_path) and not os.path.isdir( - self.song_path): - download_and_decompress(self.archieves, DATA_HOME) - - wav_files = [] - for root, _, files in os.walk(self.speech_path): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - for root, _, files in os.walk(self.song_path): - for file in files: - if file.endswith('.wav'): - wav_files.append(os.path.join(root, file)) - - random.seed(seed) # shuffle samples to split data - random.shuffle( - wav_files - ) # make sure using the same seed to create train and dev dataset - meta_info = self._get_meta_info(wav_files) - - files = [] - labels = [] - n_samples_per_fold = len(meta_info) // n_folds - for idx, sample in enumerate(meta_info): - _, _, emotion, _, _, _, _ = sample - target = int(emotion) - 1 - fold = idx // n_samples_per_fold + 1 - - if mode == 'train' and int(fold) != split: - files.append(wav_files[idx]) - labels.append(target) - - if mode != 'train' and int(fold) == split: - files.append(wav_files[idx]) - labels.append(target) - - return files, labels diff --git a/audio/test/README.md b/audio/test/README.md deleted file mode 100644 index e5dbc537..00000000 --- a/audio/test/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# PaddleAudio Testing Guide - - - - -# Testing -First clone a version of the project by -``` -git clone 
https://github.com/PaddlePaddle/models.git - -``` -Then install the project in your virtual environment. -``` -cd models/PaddleAudio -python setup.py bdist_wheel -pip install -e .[dev] -``` -The requirements for testing will be installed along with PaddleAudio. - -Now run -``` -pytest test -``` - -If it goes well, you will see outputs like these: -``` -platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1 -rootdir: ./models/PaddleAudio -plugins: hydra-core-1.0.6 -collected 16 items - -test/unit_test/test_backend.py ........... [ 68%] -test/unit_test/test_features.py ..... [100%] - -==================================================== warnings summary ==================================================== -. -. -. --- Docs: https://docs.pytest.org/en/stable/warnings.html -============================================ 16 passed, 11 warnings in 6.76s ============================================= -``` diff --git a/audio/test/unit_test/test_backend.py b/audio/test/unit_test/test_backend.py deleted file mode 100644 index 1bf1504e..00000000 --- a/audio/test/unit_test/test_backend.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import librosa -import numpy as np -import paddleaudio -import pytest - -TEST_FILE = './test/data/test_audio.wav' - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / \ - (EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / \ - (EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load(TEST_FILE, sr=16000) - print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}') - return x, r - - -# start testing -x, r = load_audio() -EPS = 1e-8 - - -def test_load(): - s, r = paddleaudio.load(TEST_FILE, sr=16000) - assert r == 16000 - assert s.dtype == 'float32' - - s, r = paddleaudio.load( - TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16') - assert len(s) / r == 2.0 - assert r == 16000 - assert s.dtype == 'int16' - - -def test_depth_convert(): - y = paddleaudio.depth_convert(x, 'int16') - assert len(y) == len(x) - assert y.dtype == 'int16' - assert np.max(y) <= 32767 - assert np.min(y) >= -32768 - assert np.std(y) > EPS - - y = paddleaudio.depth_convert(x, 'int8') - assert len(y) == len(x) - assert y.dtype == 'int8' - assert np.max(y) <= 127 - assert np.min(y) >= -128 - assert np.std(y) > EPS - - -# test case for resample -rs_test_data = [ - (32000, 'kaiser_fast'), - (16000, 'kaiser_fast'), - (8000, 'kaiser_fast'), - (32000, 'kaiser_best'), - (16000, 'kaiser_best'), - (8000, 'kaiser_best'), - (22050, 'kaiser_best'), - (44100, 'kaiser_best'), -] - - -@pytest.mark.parametrize('sr,mode', rs_test_data) -def test_resample(sr, mode): - y = paddleaudio.resample(x, 16000, sr, 
mode=mode) - factor = sr / 16000 - err = relative_err(len(y), len(x) * factor) - print('err:', err) - assert err < EPS - - -def test_normalize(): - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5) - assert np.max(y) < 0.5 + EPS - - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0) - assert np.max(y) <= 2.0 + EPS - - y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0) - print('np.std(y):', np.std(y)) - assert np.abs(np.std(y) - 1.0) < EPS - - -if __name__ == '__main__': - test_load() - test_depth_convert() - test_resample(22050, 'kaiser_fast') - test_normalize() diff --git a/audio/test/unit_test/test_features.py b/audio/test/unit_test/test_features.py deleted file mode 100644 index 9e4e29cb..00000000 --- a/audio/test/unit_test/test_features.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import librosa -import numpy as np -import paddleaudio as pa -import pytest - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load('./test/data/test_audio.wav') - #x,r = librosa.load('../data/test_audio.wav',sr=16000) - return x, r - - -## start testing -x, r = load_audio() -EPS = 1e-8 - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / ( - EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / ( - EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram(): - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=False, ) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram_db(): - - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=True, - ref=1.0, - amin=1e-10, - top_db=None) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_stft(): - a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512) - b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512) - assert a.shape == b.shape - assert relative_err(a, b, real=False) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_split_frames(): - a = librosa.util.frame(x, frame_length=512, hop_length=320) - b = pa.split_frames(x, frame_length=512, hop_length=320) - assert relative_err(a, b) < EPS - - 
-@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_mfcc(): - kwargs = { - 'window_size': 512, - 'hop_length': 320, - 'n_mels': 64, - 'fmin': 50, - 'to_db': False - } - a = pa.mfcc( - x, - #sample_rate=16000, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - S = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = librosa.feature.mfcc( - x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0) - assert relative_err(a, b) < EPS - - -if __name__ == '__main__': - test_melspectrogram() - test_melspectrogram_db() - test_stft() - test_split_frames() - test_mfcc() diff --git a/audio/examples/sound_classification/README.md b/examples/esc50/README.md similarity index 74% rename from audio/examples/sound_classification/README.md rename to examples/esc50/README.md index 86a54cb3..66409754 100644 --- a/audio/examples/sound_classification/README.md +++ b/examples/esc50/README.md @@ -21,22 +21,17 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 ### 模型训练 -以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。关于如何使用`paddle.distributed.launch`启动多卡训练,请查看[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html)。 +以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 -单卡训练: +启动训练: ```shell -$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -多卡训练: -```shell -$ unset CUDA_VISIBLE_DEVICES -$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10 -``` +`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数: -可支持配置的参数: - -- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +- `device`: 指定模型预测时使用的设备。 +- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -47,9 +42,9 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_ 示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: ```python -from model import SoundClassifier from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14, cnn10, cnn6 +from paddlespeech.cls.models import SoundClassifier +from paddlespeech.cls.models import cnn14, cnn10, cnn6 # CNN14 backbone = cnn14(pretrained=True, extract_embedding=True) @@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) ### 模型预测 ```shell -python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -可支持配置的参数: -- `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数: + +- `device`: 指定模型预测时使用的设备。 - `wav`: 指定预测的音频文件。 +- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 @@ -91,10 +88,10 @@ Cat: 6.579841738130199e-06 模型训练结束后,可以将已保存的动态图参数导出成静态图的模型和参数,然后实施静态图的部署。 ```shell -python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ``` -可支持配置的参数: +`paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数: - `checkpoint`: 模型参数checkpoint文件。 - `output_dir`: 导出静态图模型和参数文件的保存目录。 @@ -109,8 +106,13 @@ export #### 2. 
模型部署和预测 -`deploy/python/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: +`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: -```sh -python deploy/python/predict.py --model_dir ./export --device gpu +```shell +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 ``` + +`paddlespeech/cls/exps/panns/deploy/predict.py` 脚本中可支持配置的主要参数: +- `device`: 指定模型预测时使用的设备。 +- `model_dir`: 导出静态图模型和参数文件的保存目录。 +- `wav`: 指定预测的音频文件。 diff --git a/examples/esc50/cls0/local/export.sh b/examples/esc50/cls0/local/export.sh new file mode 100755 index 00000000..160dc743 --- /dev/null +++ b/examples/esc50/cls0/local/export.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +ckpt_dir=$1 +output_dir=$2 + +python3 ${BIN_DIR}/export_model.py \ +--checkpoint ${ckpt_dir}/model.pdparams \ +--output_dir ${output_dir} diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh new file mode 100755 index 00000000..bc03d681 --- /dev/null +++ b/examples/esc50/cls0/local/infer.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +audio_file=$1 +ckpt_dir=$2 +feat_backend=$3 + +python3 ${BIN_DIR}/predict.py \ +--wav ${audio_file} \ +--feat_backend ${feat_backend} \ +--top_k 10 \ +--checkpoint ${ckpt_dir}/model.pdparams diff --git a/examples/esc50/cls0/local/static_model_infer.sh b/examples/esc50/cls0/local/static_model_infer.sh new file mode 100755 index 00000000..9b3abb5d --- /dev/null +++ b/examples/esc50/cls0/local/static_model_infer.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +device=$1 +model_dir=$2 +audio_file=$3 + +python3 ${BIN_DIR}/deploy/predict.py \ +--device ${device} \ +--model_dir ${model_dir} \ +--wav ${audio_file} diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh new file mode 100755 index 00000000..0f0f3d09 --- /dev/null +++ b/examples/esc50/cls0/local/train.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +ngpu=$1 +feat_backend=$2 + +num_epochs=50 +batch_size=16 +ckpt_dir=./checkpoint +save_freq=10 + +if [ ${ngpu} -gt 0 ]; then + python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +else + python3 ${BIN_DIR}/train.py \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +fi diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh new file mode 100644 index 00000000..3eff28e4 --- /dev/null +++ b/examples/esc50/cls0/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=panns +export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL} \ No newline at end of file diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh new file mode 100755 index 00000000..7283aa8d --- /dev/null +++ b/examples/esc50/cls0/run.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e +source path.sh + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + +stage=$1 +stop_stage=100 +feat_backend=numpy +audio_file=~/cat.wav +ckpt_dir=./checkpoint/epoch_50 +output_dir=./export +infer_device=cpu + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + ./local/train.sh ${ngpu} ${feat_backend} || exit -1 + exit 0 +fi 
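+# Each stage block in this script ends with `exit 0`, so a single invocation runs
+# exactly one stage; pass the desired stage as the first argument, e.g.
+# `CUDA_VISIBLE_DEVICES=0 ./run.sh 2` to run inference with the checkpoint
+# produced by stage 1 (training).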
+ +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 + exit 0 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 + exit 0 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + ./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1 + exit 0 +fi diff --git a/audio/paddleaudio/__init__.py b/paddleaudio/__init__.py similarity index 100% rename from audio/paddleaudio/__init__.py rename to paddleaudio/__init__.py diff --git a/audio/paddleaudio/backends/__init__.py b/paddleaudio/backends/__init__.py similarity index 100% rename from audio/paddleaudio/backends/__init__.py rename to paddleaudio/backends/__init__.py diff --git a/audio/paddleaudio/backends/audio.py b/paddleaudio/backends/audio.py similarity index 100% rename from audio/paddleaudio/backends/audio.py rename to paddleaudio/backends/audio.py diff --git a/audio/paddleaudio/datasets/__init__.py b/paddleaudio/datasets/__init__.py similarity index 73% rename from audio/paddleaudio/datasets/__init__.py rename to paddleaudio/datasets/__init__.py index e1d2bbc5..8d2fdab4 100644 --- a/audio/paddleaudio/datasets/__init__.py +++ b/paddleaudio/datasets/__init__.py @@ -11,24 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .aishell import AISHELL1 -from .dcase import UrbanAcousticScenes -from .dcase import UrbanAudioVisualScenes from .esc50 import ESC50 from .gtzan import GTZAN -from .librispeech import LIBRISPEECH -from .ravdess import RAVDESS from .tess import TESS from .urban_sound import UrbanSound8K __all__ = [ - 'AISHELL1', - 'LIBRISPEECH', 'ESC50', 'UrbanSound8K', 'GTZAN', - 'UrbanAcousticScenes', - 'UrbanAudioVisualScenes', - 'RAVDESS', 'TESS', ] diff --git a/audio/paddleaudio/datasets/dataset.py b/paddleaudio/datasets/dataset.py similarity index 100% rename from audio/paddleaudio/datasets/dataset.py rename to paddleaudio/datasets/dataset.py diff --git a/audio/paddleaudio/datasets/esc50.py b/paddleaudio/datasets/esc50.py similarity index 100% rename from audio/paddleaudio/datasets/esc50.py rename to paddleaudio/datasets/esc50.py diff --git a/audio/paddleaudio/datasets/gtzan.py b/paddleaudio/datasets/gtzan.py similarity index 100% rename from audio/paddleaudio/datasets/gtzan.py rename to paddleaudio/datasets/gtzan.py diff --git a/audio/paddleaudio/datasets/tess.py b/paddleaudio/datasets/tess.py similarity index 100% rename from audio/paddleaudio/datasets/tess.py rename to paddleaudio/datasets/tess.py diff --git a/audio/paddleaudio/datasets/urban_sound.py b/paddleaudio/datasets/urban_sound.py similarity index 100% rename from audio/paddleaudio/datasets/urban_sound.py rename to paddleaudio/datasets/urban_sound.py diff --git a/audio/paddleaudio/features/__init__.py b/paddleaudio/features/__init__.py similarity index 96% rename from audio/paddleaudio/features/__init__.py rename to paddleaudio/features/__init__.py index 8503cfab..d8ac7c4b 100644 --- a/audio/paddleaudio/features/__init__.py +++ b/paddleaudio/features/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
from .augment import * from .core import * +from .spectrum import * diff --git a/audio/paddleaudio/features/augment.py b/paddleaudio/features/augment.py similarity index 98% rename from audio/paddleaudio/features/augment.py rename to paddleaudio/features/augment.py index 7556bb3c..6f903bdb 100644 --- a/audio/paddleaudio/features/augment.py +++ b/paddleaudio/features/augment.py @@ -15,8 +15,9 @@ from typing import List import numpy as np from numpy import ndarray as array -from paddleaudio.backends import depth_convert -from paddleaudio.utils import ParameterError + +from ..backends import depth_convert +from ..utils import ParameterError __all__ = [ 'depth_augment', diff --git a/audio/paddleaudio/features/core.py b/paddleaudio/features/core.py similarity index 99% rename from audio/paddleaudio/features/core.py rename to paddleaudio/features/core.py index dd25724f..d3c2e290 100644 --- a/audio/paddleaudio/features/core.py +++ b/paddleaudio/features/core.py @@ -21,9 +21,10 @@ import numpy as np import scipy from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided -from paddleaudio.utils import ParameterError from scipy.signal import get_window +from ..utils import ParameterError + __all__ = [ 'stft', 'mfcc', @@ -293,6 +294,7 @@ def stft(x: array, This function is aligned with librosa. """ _check_audio(x) + # By default, use the entire frame if win_length is None: win_length = n_fft @@ -397,7 +399,7 @@ def mfcc(x, This function is NOT strictly aligned with librosa. The following example shows how to get the same result with librosa: - # paddleaudioe mfcc: + # mfcc: kwargs = { 'window_size':512, 'hop_length':320, diff --git a/paddleaudio/features/spectrum.py b/paddleaudio/features/spectrum.py new file mode 100644 index 00000000..154b6484 --- /dev/null +++ b/paddleaudio/features/spectrum.py @@ -0,0 +1,461 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from functools import partial +from typing import Optional +from typing import Union + +import paddle +import paddle.nn as nn + +from .window import get_window + +__all__ = [ + 'Spectrogram', + 'MelSpectrogram', + 'LogMelSpectrogram', +] + + +def hz_to_mel(freq: Union[paddle.Tensor, float], + htk: bool=False) -> Union[paddle.Tensor, float]: + """Convert Hz to Mels. + Parameters: + freq: the input tensor of arbitrary shape, or a single floating point number. + htk: use HTK formula to do the conversion. + The default value is False. + Returns: + The frequencies represented in Mel-scale. 
+ """ + + if htk: + if isinstance(freq, paddle.Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, paddle.Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, paddle.Tensor], + htk: bool=False) -> Union[float, paddle.Tensor]: + """Convert mel bin numbers to frequencies. + Parameters: + mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. + htk: use HTK formula to do the conversion. + Returns: + The frequencies represented in hz. + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, paddle.Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str=paddle.float32): + """Compute mel frequencies. + Parameters: + n_mels(int): number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk(bool): whether to use htk formula. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in Mel-scale + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): + """Compute fourier frequencies. + Parameters: + sr(int): the audio sample rate. + n_fft(float): the number of fft bins. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in hz. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute fbank matrix. + Parameters: + sr(int): the audio sample rate. + n_fft(int): the number of fft bins. + n_mels(int): the number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. 
+ f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk: whether to use htk formula. + return_complex(bool): whether to return complex matrix. If True, the matrix will + be complex type. Otherwise, the real and image part will be stored in the last + axis of returned tensor. + dtype(str): the datatype of the returned fbank matrix. + Returns: + The fbank matrix of shape (n_mels, int(1+n_fft//2)). + Shape: + output: (n_mels, int(1+n_fft//2)) + """ + + if f_max is None: + f_max = float(sr) / 2 + + # Initialize the weights + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies( + n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) + + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = paddle.maximum( + paddle.zeros_like(lower), paddle.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) + + return weights + + +def power_to_db(magnitude: paddle.Tensor, + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None) -> paddle.Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. + The function computes the scaling ``10 * log10(x / ref)`` in a numerically + stable way. + Parameters: + magnitude(Tensor): the input magnitude tensor of any shape. + ref_value(float): the reference value. If smaller than 1.0, the db level + of the signal will be pulled up accordingly. Otherwise, the db level + is pushed down. + amin(float): the minimum value of input magnitude, below which the input + magnitude is clipped(to amin). + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + Returns: + The spectrogram in log-scale. + shape: + input: any shape + output: same as input + """ + if amin <= 0: + raise Exception("amin must be strictly positive") + + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") + + ones = paddle.ones_like(magnitude) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) + + if top_db is not None: + if top_db < 0: + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) + + return log_spec + + +class Spectrogram(nn.Layer): + def __init__(self, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + dtype: str=paddle.float32): + """Compute spectrogram of a given signal, typically an audio waveform. + The spectorgram is defined as the complex norm of the short-time + Fourier transformation. + Parameters: + n_fft(int): the number of frequency components of the discrete Fourier transform. 
+ The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. The default value is 'reflect'. + dtype(str): the data type of input and window. + Notes: + The Spectrogram transform relies on STFT transform to compute the spectrogram. + By default, the weights are not learnable. To fine-tune the Fourier coefficients, + set stop_gradient=False before training. + For more information, see STFT(). + """ + super(Spectrogram, self).__init__() + + if win_length is None: + win_length = n_fft + + fft_window = get_window(window, win_length, fftbins=True, dtype=dtype) + self._stft = partial( + paddle.signal.stft, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=fft_window, + center=center, + pad_mode=pad_mode) + + def forward(self, x): + stft = self._stft(x) + spectrogram = paddle.square(paddle.abs(stft)) + return spectrogram + + +class MelSpectrogram(nn.Layer): + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute the melspectrogram of a given signal, typically an audio waveform. + The melspectrogram is also known as filterbank or fbank feature in audio community. + It is computed by multiplying spectrogram with Mel filter bank matrix. + Parameters: + sr(int): the audio sample rate. + The default value is 22050. + n_fft(int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels(int): the mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zeros. 
+ htk(bool): whether to use HTK formula in computing fbank matrix. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. + """ + super(MelSpectrogram, self).__init__() + + self._spectrogram = Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + dtype=dtype) + self.n_mels = n_mels + self.f_min = f_min + self.f_max = f_max + self.htk = htk + self.norm = norm + if f_max is None: + f_max = sr // 2 + self.fbank_matrix = compute_fbank_matrix( + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) # float64 for better numerical results + self.register_buffer('fbank_matrix', self.fbank_matrix) + + def forward(self, x): + spect_feature = self._spectrogram(x) + mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) + return mel_feature + + +class LogMelSpectrogram(nn.Layer): + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None, + dtype: str=paddle.float32): + """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, + typically an audio waveform. + Parameters: + sr(int): the audio sample rate. + The default value is 22050. + n_fft(int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels(int): the mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zeros. + ref_value(float): the reference value. If smaller than 1.0, the db level + htk(bool): whether to use HTK formula in computing fbank matrix. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
+ amin(float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. + Otherwise, the db level is pushed down. + magnitude is clipped(to amin). For numerical stability, set amin to a larger value, + e.g., 1e-3. + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + """ + super(LogMelSpectrogram, self).__init__() + + self._melspectrogram = MelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) + + self.ref_value = ref_value + self.amin = amin + self.top_db = top_db + + def forward(self, x): + # import ipdb; ipdb.set_trace() + mel_feature = self._melspectrogram(x) + log_mel_feature = power_to_db( + mel_feature, + ref_value=self.ref_value, + amin=self.amin, + top_db=self.top_db) + return log_mel_feature diff --git a/paddleaudio/features/window.py b/paddleaudio/features/window.py new file mode 100644 index 00000000..629989fc --- /dev/null +++ b/paddleaudio/features/window.py @@ -0,0 +1,415 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +import math +from typing import List +from typing import Tuple +from typing import Union + +import paddle +from paddle import Tensor + +__all__ = [ + 'get_window', +] + + +def _cat(a: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_a, data_type) for _a in a] + return paddle.concat(l) + + +def _acosh(x: Union[Tensor, float]) -> Tensor: + if isinstance(x, float): + return math.log(x + math.sqrt(x**2 - 1)) + return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) + + +def _extend(M: int, sym: bool) -> bool: + """Extend window by 1 sample if needed for DFT-even symmetry""" + if not sym: + return M + 1, True + else: + return M, False + + +def _len_guards(M: int) -> bool: + """Handle small or incorrect window lengths""" + if int(M) != M or M < 0: + raise ValueError('Window length M must be a non-negative integer') + + return M <= 1 + + +def _truncate(w: Tensor, needed: bool) -> Tensor: + """Truncate window by 1 sample if needed for DFT-even symmetry""" + if needed: + return w[:-1] + else: + return w + + +def general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a window with a generalized Gaussian shape. + This function is consistent with scipy.signal.windows.general_gaussian(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p)) + + return _truncate(w, needs_trunc) + + +def general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generalized Hamming window. + This function is consistent with scipy.signal.windows.general_hamming() + """ + return general_cosine(M, [alpha, 1. 
- alpha], sym, dtype=dtype) + + +def taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Taylor window. + The Taylor window taper function approximates the Dolph-Chebyshev window's + constant sidelobe level for a parameterized number of near-in sidelobes. + Parameters: + M(int): window size + nbar, sil, norm: the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + # Original text uses a negative sidelobe level parameter and then negates + # it in the calculation of B. To keep consistent with other methods we + # assume the sidelobe level parameter to be positive. + B = 10**(sll / 20) + A = _acosh(B) / math.pi + s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) + ma = paddle.arange(1, nbar, dtype=dtype) + + Fm = paddle.empty((nbar - 1, ), dtype=dtype) + signs = paddle.empty_like(ma) + signs[::2] = 1 + signs[1::2] = -1 + m2 = ma * ma + for mi in range(len(ma)): + numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 + )) + if mi == 0: + denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) + elif mi == len(ma) - 1: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) + else: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ + mi] / m2[mi + 1:]) + + Fm[mi] = numer / denom + + def W(n): + return 1 + 2 * paddle.matmul( + Fm.unsqueeze(0), + paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) + + w = W(paddle.arange(0, M, dtype=dtype)) + + # normalize (Note that this is not described in the original text [1]) + if norm: + scale = 1.0 / W((M - 1) / 2) + w *= scale + w = w.squeeze() + return _truncate(w, needs_trunc) + + +def general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + +def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hamming window. + The Hamming window is a taper formed by using a raised cosine with + non-zero endpoints, optimized to minimize the nearest side lobe. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_hamming(M, 0.54, sym, dtype=dtype) + + +def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hann window. + The Hann window is a taper formed by using a raised cosine or sine-squared + with ends that touch zero. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_hamming(M, 0.5, sym, dtype=dtype) + + +def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Tukey window. + The Tukey window is also known as a tapered cosine window. 
+ Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + + if alpha <= 0: + return paddle.ones((M, ), dtype=dtype) + elif alpha >= 1.0: + return hann(M, sym=sym) + + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + width = int(alpha * (M - 1) / 2.0) + n1 = n[0:width + 1] + n2 = n[width + 1:M - width - 1] + n3 = n[M - width - 1:] + + w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) + w2 = paddle.ones(n2.shape, dtype=dtype) + w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / + (M - 1)))) + w = paddle.concat([w1, w2, w3]) + + return _truncate(w, needs_trunc) + + +def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Kaiser window. + The Kaiser window is a taper formed by using a Bessel function. + Parameters: + M(int): window size. + beta(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + Returns: + Tensor: the window tensor + """ + raise NotImplementedError() + + +def gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Gaussian window. + The Gaussian widows has a Gaussian shape defined by the standard deviation(std). + Parameters: + M(int): window size. + std(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + sig2 = 2 * std * std + w = paddle.exp(-n**2 / sig2) + + return _truncate(w, needs_trunc) + + +def exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. + Parameters: + M(int): window size. + tau(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if sym and center is not None: + raise ValueError("If sym==True, center must be None.") + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + if center is None: + center = (M - 1) / 2 + + n = paddle.arange(0, M, dtype=dtype) + w = paddle.exp(-paddle.abs(n - center) / tau) + + return _truncate(w, needs_trunc) + + +def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a triangular window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) + if M % 2 == 0: + w = (2 * n - 1.0) / M + w = paddle.concat([w, w[::-1]]) + else: + w = 2 * n / (M + 1.0) + w = paddle.concat([w, w[-2::-1]]) + + return _truncate(w, needs_trunc) + + +def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Bohman window. + The Bohman window is the autocorrelation of a cosine window. 
+ Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) + w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( + math.pi * fac) + w = _cat([0, w, 0], dtype) + + return _truncate(w, needs_trunc) + + +def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Blackman window. + The Blackman window is a taper formed by using the first three terms of + a summation of cosines. It was designed to have close to the minimal + leakage possible. It is close to optimal, only slightly worse than a + Kaiser window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + + +def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a window with a simple cosine shape. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5)) + + return _truncate(w, needs_trunc) + + +def get_window(window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool=True, + dtype: str='float64') -> Tensor: + """Return a window of a given length and type. + Parameters: + window(str|(str,float)): the type of window to create. + win_length(int): the number of samples in the window. + fftbins(bool): If True, create a "periodic" window. Otherwise, + create a "symmetric" window, for use in filter design. + Returns: + The window represented as a tensor. + """ + sym = not fftbins + + args = () + if isinstance(window, tuple): + winstr = window[0] + if len(window) > 1: + args = window[1:] + elif isinstance(window, str): + if window in ['gaussian', 'exponential']: + raise ValueError("The '" + window + "' window needs one or " + "more parameters -- pass a tuple.") + else: + winstr = window + else: + raise ValueError("%s as window type is not supported." 
% + str(type(window))) + + try: + winfunc = eval(winstr) + except KeyError as e: + raise ValueError("Unknown window type.") from e + + params = (win_length, ) + args + kwargs = {'sym': sym} + return winfunc(*params, dtype=dtype, **kwargs) diff --git a/audio/paddleaudio/utils/__init__.py b/paddleaudio/utils/__init__.py similarity index 100% rename from audio/paddleaudio/utils/__init__.py rename to paddleaudio/utils/__init__.py diff --git a/audio/paddleaudio/utils/download.py b/paddleaudio/utils/download.py similarity index 65% rename from audio/paddleaudio/utils/download.py rename to paddleaudio/utils/download.py index 0a36f29b..45a8e57b 100644 --- a/audio/paddleaudio/utils/download.py +++ b/paddleaudio/utils/download.py @@ -17,7 +17,6 @@ from typing import List from paddle.framework import load as load_state_dict from paddle.utils import download -from pathos.multiprocessing import ProcessPool from .log import logger @@ -32,27 +31,18 @@ def decompress(file: str): download._decompress(file) -def download_and_decompress(archives: List[Dict[str, str]], - path: str, - n_workers: int=0): +def download_and_decompress(archives: List[Dict[str, str]], path: str): """ Download archieves and decompress to specific path. """ if not os.path.isdir(path): os.makedirs(path) - if n_workers <= 0: - for archive in archives: - assert 'url' in archive and 'md5' in archive, \ - 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + for archive in archives: + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - download.get_path_from_url(archive['url'], path, archive['md5']) - else: - pool = ProcessPool(nodes=n_workers) - pool.imap(download.get_path_from_url, [_['url'] for _ in archives], - [path] * len(archives), [_['md5'] for _ in archives]) - pool.close() - pool.join() + download.get_path_from_url(archive['url'], path, archive['md5']) def load_state_dict_from_url(url: str, path: str, md5: str=None): diff --git a/audio/paddleaudio/utils/env.py b/paddleaudio/utils/env.py similarity index 100% rename from audio/paddleaudio/utils/env.py rename to paddleaudio/utils/env.py diff --git a/audio/paddleaudio/utils/error.py b/paddleaudio/utils/error.py similarity index 100% rename from audio/paddleaudio/utils/error.py rename to paddleaudio/utils/error.py diff --git a/audio/paddleaudio/utils/log.py b/paddleaudio/utils/log.py similarity index 100% rename from audio/paddleaudio/utils/log.py rename to paddleaudio/utils/log.py diff --git a/audio/paddleaudio/utils/time.py b/paddleaudio/utils/time.py similarity index 100% rename from audio/paddleaudio/utils/time.py rename to paddleaudio/utils/time.py diff --git a/audio/paddleaudio/models/__init__.py b/paddlespeech/cls/exps/__init__.py similarity index 100% rename from audio/paddleaudio/models/__init__.py rename to paddlespeech/cls/exps/__init__.py diff --git a/paddlespeech/cls/exps/panns/__init__.py b/paddlespeech/cls/exps/panns/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/cls/exps/panns/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/cls/exps/panns/deploy/__init__.py b/paddlespeech/cls/exps/panns/deploy/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/cls/exps/panns/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/audio/examples/sound_classification/deploy/python/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py similarity index 97% rename from audio/examples/sound_classification/deploy/python/predict.py rename to paddlespeech/cls/exps/panns/deploy/predict.py index a99b8980..d4e5c22f 100644 --- a/audio/examples/sound_classification/deploy/python/predict.py +++ b/paddlespeech/cls/exps/panns/deploy/predict.py @@ -16,16 +16,18 @@ import os import numpy as np from paddle import inference +from scipy.special import softmax + from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 from paddleaudio.features import melspectrogram -from scipy.special import softmax # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") -parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.") parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") +parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.") parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.') parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.') parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.') @@ -131,10 +133,7 @@ if __name__ == "__main__": args.use_tensorrt, args.precision, args.cpu_threads, args.enable_mkldnn) - wavs = [ - '~/audio_demo_resource/cat.wav', - '~/audio_demo_resource/dog.wav', - ] + wavs = [args.wav] for i in range(len(wavs)): wavs[i] = os.path.abspath(os.path.expanduser(wavs[i])) diff --git a/audio/examples/sound_classification/export_model.py b/paddlespeech/cls/exps/panns/export_model.py similarity index 94% rename from audio/examples/sound_classification/export_model.py rename to paddlespeech/cls/exps/panns/export_model.py index 1be7b27a..c295c6a3 100644 --- 
a/audio/examples/sound_classification/export_model.py +++ b/paddlespeech/cls/exps/panns/export_model.py @@ -15,9 +15,10 @@ import argparse import os import paddle -from model import SoundClassifier + from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/audio/examples/sound_classification/predict.py b/paddlespeech/cls/exps/panns/predict.py similarity index 67% rename from audio/examples/sound_classification/predict.py rename to paddlespeech/cls/exps/panns/predict.py index 30d141cd..9cfd8b6c 100644 --- a/audio/examples/sound_classification/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -16,30 +16,40 @@ import argparse import numpy as np import paddle import paddle.nn.functional as F -from model import SoundClassifier + from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 +from paddleaudio.features import LogMelSpectrogram from paddleaudio.features import melspectrogram -from paddleaudio.models.panns import cnn14 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") args = parser.parse_args() # yapf: enable -def extract_features(file: str, **kwargs): +def extract_features(file: str, feat_backend: str='numpy', + **kwargs) -> paddle.Tensor: waveform, sr = load_audio(file, sr=None) - feat = melspectrogram(waveform, sr, **kwargs).transpose() + + if args.feat_backend == 'numpy': + feat = melspectrogram(waveform, sr, **kwargs).transpose() + feat = np.expand_dims(feat, 0) + feat = paddle.to_tensor(feat) + else: + feature_extractor = LogMelSpectrogram(sr=sr, **kwargs) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) return feat if __name__ == '__main__': - paddle.set_device(args.device) model = SoundClassifier( backbone=cnn14(pretrained=False, extract_embedding=True), @@ -47,8 +57,7 @@ if __name__ == '__main__': model.set_state_dict(paddle.load(args.checkpoint)) model.eval() - feat = np.expand_dims(extract_features(args.wav), 0) - feat = paddle.to_tensor(feat) + feat = extract_features(args.wav, args.feat_backend) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() diff --git a/audio/examples/sound_classification/train.py b/paddlespeech/cls/exps/panns/train.py similarity index 80% rename from audio/examples/sound_classification/train.py rename to paddlespeech/cls/exps/panns/train.py index e3b5e2ae..12130978 100644 --- a/audio/examples/sound_classification/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -15,16 +15,18 @@ import argparse import os import paddle -from model import SoundClassifier + from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 +from paddleaudio.features import LogMelSpectrogram from 
diff --git a/audio/examples/sound_classification/train.py b/paddlespeech/cls/exps/panns/train.py
similarity index 80%
rename from audio/examples/sound_classification/train.py
rename to paddlespeech/cls/exps/panns/train.py
index e3b5e2ae..12130978 100644
--- a/audio/examples/sound_classification/train.py
+++ b/paddlespeech/cls/exps/panns/train.py
@@ -15,16 +15,18 @@ import argparse
 import os
 
 import paddle
-from model import SoundClassifier
+
 from paddleaudio.datasets import ESC50
-from paddleaudio.models.panns import cnn14
+from paddleaudio.features import LogMelSpectrogram
 from paddleaudio.utils import logger
 from paddleaudio.utils import Timer
+from paddlespeech.cls.models import cnn14
+from paddlespeech.cls.models import SoundClassifier
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
 parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
 parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
@@ -35,7 +37,6 @@ args = parser.parse_args()
 # yapf: enable
 
 if __name__ == "__main__":
-    paddle.set_device(args.device)
     nranks = paddle.distributed.get_world_size()
     if paddle.distributed.get_world_size() > 1:
         paddle.distributed.init_parallel_env()
@@ -48,8 +49,13 @@ if __name__ == "__main__":
         learning_rate=args.learning_rate,
         parameters=model.parameters())
     criterion = paddle.nn.loss.CrossEntropyLoss()
-    train_ds = ESC50(mode='train', feat_type='melspectrogram')
-    dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+    if args.feat_backend == 'numpy':
+        train_ds = ESC50(mode='train', feat_type='melspectrogram')
+        dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+    else:
+        train_ds = ESC50(mode='train')
+        dev_ds = ESC50(mode='dev')
+        feature_extractor = LogMelSpectrogram(sr=16000)
 
     train_sampler = paddle.io.DistributedBatchSampler(
         train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
@@ -71,7 +77,16 @@ if __name__ == "__main__":
         num_corrects = 0
         num_samples = 0
         for batch_idx, batch in enumerate(train_loader):
-            feats, labels = batch
+            if args.feat_backend == 'numpy':
+                feats, labels = batch
+            else:
+                waveforms, labels = batch
+                feats = feature_extractor(
+                    waveforms
+                )  # Need a padding when lengths of waveforms differ in a batch.
+                feats = paddle.transpose(feats,
+                                         [0, 2, 1])  # To [N, length, n_mels]
+
             logits = model(feats)
             loss = criterion(logits, labels)
@@ -126,7 +141,13 @@ if __name__ == "__main__":
             num_samples = 0
             with logger.processing('Evaluation on validation dataset'):
                 for batch_idx, batch in enumerate(dev_loader):
-                    feats, labels = batch
+                    if args.feat_backend == 'numpy':
+                        feats, labels = batch
+                    else:
+                        waveforms, labels = batch
+                        feats = feature_extractor(waveforms)
+                        feats = paddle.transpose(feats, [0, 2, 1])
+
                     logits = model(feats)
                     preds = paddle.argmax(logits, axis=1)
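train.py gets the same switch: with 'numpy' the ESC50 dataset yields precomputed melspectrogram features, while with 'paddle' it yields raw waveforms that are featurized inside the loop by LogMelSpectrogram. A compact restatement of the per-batch branch added to both the train and dev loops, wrapped in a hypothetical helper (batch_to_feats is not part of the diff):

import paddle
from paddleaudio.features import LogMelSpectrogram

feature_extractor = LogMelSpectrogram(sr=16000)


def batch_to_feats(batch, feat_backend='numpy'):
    if feat_backend == 'numpy':
        feats, labels = batch  # precomputed melspectrogram features
    else:
        waveforms, labels = batch  # raw audio; equal lengths per batch assumed
        feats = feature_extractor(waveforms)  # [N, n_mels, length]
        feats = paddle.transpose(feats, [0, 2, 1])  # [N, length, n_mels]
    return feats, labels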
diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py
new file mode 100644
index 00000000..4bfadda1
--- /dev/null
+++ b/paddlespeech/cls/models/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .panns import *
diff --git a/paddlespeech/cls/models/panns/__init__.py b/paddlespeech/cls/models/panns/__init__.py
new file mode 100644
index 00000000..638f772f
--- /dev/null
+++ b/paddlespeech/cls/models/panns/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .classifier import *
+from .panns import *
diff --git a/audio/examples/sound_classification/model.py b/paddlespeech/cls/models/panns/classifier.py
similarity index 100%
rename from audio/examples/sound_classification/model.py
rename to paddlespeech/cls/models/panns/classifier.py
diff --git a/audio/paddleaudio/models/panns.py b/paddlespeech/cls/models/panns/panns.py
similarity index 99%
rename from audio/paddleaudio/models/panns.py
rename to paddlespeech/cls/models/panns/panns.py
index 1c68f06f..6d2dac56 100644
--- a/audio/paddleaudio/models/panns.py
+++ b/paddlespeech/cls/models/panns/panns.py
@@ -16,8 +16,8 @@ import os
 
 import paddle.nn as nn
 import paddle.nn.functional as F
-from ..utils.download import load_state_dict_from_url
-from ..utils.env import MODEL_HOME
+from paddleaudio.utils.download import load_state_dict_from_url
+from paddleaudio.utils.env import MODEL_HOME
 
 __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6']
diff --git a/audio/setup.py b/setup_audio.py
similarity index 78%
rename from audio/setup.py
rename to setup_audio.py
index e0ac9818..24c9bb9b 100644
--- a/audio/setup.py
+++ b/setup_audio.py
@@ -16,19 +16,16 @@ import setuptools
 # set the version here
 version = '0.1.0a'
 
-with open("README.md", "r") as fh:
-    long_description = fh.read()
-
 setuptools.setup(
     name="paddleaudio",
     version=version,
     author="",
     author_email="",
     description="PaddleAudio, in development",
-    long_description=long_description,
+    long_description="",
     long_description_content_type="text/markdown",
     url="",
-    packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
+    packages=setuptools.find_packages(include=['paddleaudio*']),
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
@@ -41,8 +38,4 @@ setuptools.setup(
         'resampy >= 0.2.2',
         'soundfile >= 0.9.0',
         'colorlog',
-        'pathos',
-    ],
-    extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
-    }  # for dev only, install: pip install -e .[dev]
-)
+    ], )
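After this move, the panns backbones and the classifier head live under paddlespeech.cls.models and are re-exported through the __init__.py files above, while setup_audio.py keeps packaging only the paddleaudio* modules. A quick import sanity check, assuming the repository root is on PYTHONPATH:

# cnn6/cnn10/cnn14 come from panns.py's __all__; SoundClassifier from classifier.py.
from paddlespeech.cls.models import SoundClassifier, cnn14
from paddlespeech.cls.models.panns import cnn6, cnn10

backbone = cnn14(pretrained=False, extract_embedding=True)  # same call as in the exps scripts
print(type(backbone).__name__)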