From 118911778415b40f384ffc4ec32417932b9c27f0 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Wed, 24 Nov 2021 19:47:40 +0800
Subject: [PATCH 1/9] Add paddlespeech.cls and esc50 example.

---
 audio/.gitignore | 7 -
 audio/.pre-commit-config.yaml | 45 --
 audio/.style.yapf | 3 -
 audio/LICENSE | 201 -------
 audio/README.md | 37 --
 audio/examples/panns/README.md | 128 -----
 .../examples/panns/assets/audioset_labels.txt | 527 ------------------
 audio/examples/panns/audio_tag.py | 111 ----
 audio/examples/panns/parse_result.py | 83 ---
 audio/paddleaudio/__init__.py | 15 -
 audio/paddleaudio/datasets/aishell.py | 154 -----
 audio/paddleaudio/datasets/dcase.py | 298 ----------
 audio/paddleaudio/datasets/librispeech.py | 199 -------
 audio/paddleaudio/datasets/ravdess.py | 136 -----
 audio/setup.py | 48 --
 audio/test/README.md | 41 --
 audio/test/unit_test/test_backend.py | 113 ----
 audio/test/unit_test/test_features.py | 143 -----
 .../esc50}/README.md | 25 +-
 .../cls0/local}/deploy/python/predict.py | 7 +-
 .../esc50/cls0/local}/export_model.py | 5 +-
 .../esc50/cls0/local}/model.py | 0
 .../esc50/cls0/local}/predict.py | 27 +-
 .../esc50/cls0/local}/train.py | 40 +-
 examples/esc50/cls0/path.sh | 14 +
 examples/esc50/cls0/run.sh | 51 ++
 paddlespeech/cls/__init__.py | 2 +
 .../cls}/backends/__init__.py | 0
 .../cls}/backends/audio.py | 0
 .../cls}/datasets/__init__.py | 10 -
 .../cls}/datasets/dataset.py | 0
 .../cls}/datasets/esc50.py | 0
 .../cls}/datasets/gtzan.py | 0
 .../cls}/datasets/tess.py | 0
 .../cls}/datasets/urban_sound.py | 0
 .../cls}/features/__init__.py | 1 +
 .../cls}/features/augment.py | 5 +-
 .../cls}/features/core.py | 6 +-
 paddlespeech/cls/features/spectrum.py | 461 +++++++++++++++
 paddlespeech/cls/features/window.py | 415 ++++++++++++++
 .../cls}/models/__init__.py | 1 +
 .../cls}/models/panns.py | 0
 .../cls}/utils/__init__.py | 0
 .../cls}/utils/download.py | 0
 .../cls}/utils/env.py | 22 +-
 .../cls}/utils/error.py | 0
 .../cls}/utils/log.py | 6 +-
 .../cls}/utils/time.py | 0
 setup.py | 2 +-
 49 files changed, 1036 insertions(+), 2353 deletions(-)
 delete mode 100644 audio/.gitignore
 delete mode 100644 audio/.pre-commit-config.yaml
 delete mode 100644 audio/.style.yapf
 delete mode 100644 audio/LICENSE
 delete mode 100644 audio/README.md
 delete mode 100644 audio/examples/panns/README.md
 delete mode 100644 audio/examples/panns/assets/audioset_labels.txt
 delete mode 100644 audio/examples/panns/audio_tag.py
 delete mode 100644 audio/examples/panns/parse_result.py
 delete mode 100644 audio/paddleaudio/__init__.py
 delete mode 100644 audio/paddleaudio/datasets/aishell.py
 delete mode 100644 audio/paddleaudio/datasets/dcase.py
 delete mode 100644 audio/paddleaudio/datasets/librispeech.py
 delete mode 100644 audio/paddleaudio/datasets/ravdess.py
 delete mode 100644 audio/setup.py
 delete mode 100644 audio/test/README.md
 delete mode 100644 audio/test/unit_test/test_backend.py
 delete mode 100644 audio/test/unit_test/test_features.py
 rename {audio/examples/sound_classification => examples/esc50}/README.md (85%)
 rename {audio/examples/sound_classification => examples/esc50/cls0/local}/deploy/python/predict.py (97%)
 rename {audio/examples/sound_classification => examples/esc50/cls0/local}/export_model.py (94%)
 rename {audio/examples/sound_classification => examples/esc50/cls0/local}/model.py (100%)
 rename {audio/examples/sound_classification => examples/esc50/cls0/local}/predict.py (66%)
 rename {audio/examples/sound_classification => examples/esc50/cls0/local}/train.py (80%)
 create mode 100644 examples/esc50/cls0/path.sh
 create mode 100755 examples/esc50/cls0/run.sh
 rename {audio/paddleaudio => paddlespeech/cls}/backends/__init__.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/backends/audio.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/datasets/__init__.py (73%)
 rename {audio/paddleaudio => paddlespeech/cls}/datasets/dataset.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/datasets/esc50.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/datasets/gtzan.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/datasets/tess.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/datasets/urban_sound.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/features/__init__.py (96%)
 rename {audio/paddleaudio => paddlespeech/cls}/features/augment.py (98%)
 rename {audio/paddleaudio => paddlespeech/cls}/features/core.py (99%)
 create mode 100644 paddlespeech/cls/features/spectrum.py
 create mode 100644 paddlespeech/cls/features/window.py
 rename {audio/paddleaudio => paddlespeech/cls}/models/__init__.py (96%)
 rename {audio/paddleaudio => paddlespeech/cls}/models/panns.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/utils/__init__.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/utils/download.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/utils/env.py (66%)
 rename {audio/paddleaudio => paddlespeech/cls}/utils/error.py (100%)
 rename {audio/paddleaudio => paddlespeech/cls}/utils/log.py (95%)
 rename {audio/paddleaudio => paddlespeech/cls}/utils/time.py (100%)

diff --git a/audio/.gitignore b/audio/.gitignore
deleted file mode 100644
index e649619e..00000000
--- a/audio/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-.ipynb_checkpoints/**
-*.ipynb
-nohup.out
-__pycache__/
-*.wav
-*.m4a
-obsolete/**
diff --git a/audio/.pre-commit-config.yaml b/audio/.pre-commit-config.yaml
deleted file mode 100644
index 4100f348..00000000
--- a/audio/.pre-commit-config.yaml
+++ /dev/null
@@ -1,45 +0,0 @@
-repos:
-- repo: local
-  hooks:
-    - id: yapf
-      name: yapf
-      entry: yapf
-      language: system
-      args: [-i, --style .style.yapf]
-      files: \.py$
-
-- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: a11d9314b22d8f8c7556443875b731ef05965464
-  hooks:
-    - id: check-merge-conflict
-    - id: check-symlinks
-    - id: end-of-file-fixer
-    - id: trailing-whitespace
-    - id: detect-private-key
-    - id: check-symlinks
-    - id: check-added-large-files
-
-- repo: https://github.com/pycqa/isort
-  rev: 5.8.0
-  hooks:
-    - id: isort
-      name: isort (python)
-    - id: isort
-      name: isort (cython)
-      types: [cython]
-    - id: isort
-      name: isort (pyi)
-      types: [pyi]
-
-- repo: local
-  hooks:
-    - id: flake8
-      name: flake8
-      entry: flake8
-      language: system
-      args:
-        - --count
-        - --select=E9,F63,F7,F82
-        - --show-source
-        - --statistics
-      files: \.py$
diff --git a/audio/.style.yapf b/audio/.style.yapf
deleted file mode 100644
index 4741fb4f..00000000
--- a/audio/.style.yapf
+++ /dev/null
@@ -1,3 +0,0 @@
-[style]
-based_on_style = pep8
-column_limit = 80
diff --git a/audio/LICENSE b/audio/LICENSE
deleted file mode 100644
index 261eeb9e..00000000
--- a/audio/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/audio/README.md b/audio/README.md
deleted file mode 100644
index 9607fd86..00000000
--- a/audio/README.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# PaddleAudio: The audio library for PaddlePaddle
-
-## Introduction
-PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
-
-
-
-## Features
-- Spectrogram and related features are compatible with librosa.
-- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
-- Ready-to-use audio embeddings with a single line of code; more embedding types are on the roadmap.
-- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
-
-
-## Install
-```
-git clone https://github.com/PaddlePaddle/models
-cd models/PaddleAudio
-pip install .
-
-```
-
-## Quick start
-### Audio loading and feature extraction
-```
-import paddleaudio as pa
-s, r = pa.load(f)
-mel_spect = pa.melspectrogram(s, sr=r)
-```
-
-### Examples
-We provide a set of examples to help you get started with PaddleAudio quickly.
-- [PANNs: acoustic scene and event analysis using pre-trained models](./examples/panns)
-- [Environmental sound classification on the ESC-50 dataset](./examples/sound_classification)
-- [Training an audio-tagging network on Audioset](./examples/audioset_training)
-
-Please refer to the [example directory](./examples) for more details.
diff --git a/audio/examples/panns/README.md b/audio/examples/panns/README.md
deleted file mode 100644
index 243ebf8e..00000000
--- a/audio/examples/panns/README.md
+++ /dev/null
@@ -1,128 +0,0 @@
-# Audio Tagging
-
-Sound classification is a single-label classification task, but a piece of audio can carry multiple labels. For example, a recording made in an ordinary office may contain people speaking, keyboard typing, mouse clicks, and various other background sounds. For general sound recognition and sound detection scenarios, predicting multiple labels for a piece of audio is highly practical.
-
-At IEEE ICASSP 2017, Google released [Audioset](https://research.google.com/audioset/), a large-scale audio dataset covering 632 audio classes and 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos. The dataset currently contains 2.1 million annotated videos and 5,800 hours of audio, with 527 label classes for the annotated sound samples.
-
-`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Since their pre-training task is multi-label sound recognition, they can be used for real-time audio tagging.
-
-This example uses pre-trained `PANNs` models to tag input audio in real time against the Audioset label classes, and finally outputs the top-k classes and their scores at each time step as text.
-
-
-## Models
-
-PaddleAudio provides pre-trained PANNs models CNN14, CNN10, and CNN6 to choose from:
-- CNN14: 12 convolutional layers and 2 fully connected layers, 79.6M parameters, embedding dimension 2048.
-- CNN10: 8 convolutional layers and 2 fully connected layers, 4.9M parameters, embedding dimension 512.
-- CNN6: 4 convolutional layers and 2 fully connected layers, 4.5M parameters, embedding dimension 512.
-
-
-## Quick start
-
-### Model inference
-
-```shell
-export CUDA_VISIBLE_DEVICES=0
-python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `device`: Device to use, cpu or gpu; defaults to gpu. When running on gpu, the gpus argument specifies the GPU card IDs.
-- `wav`: The audio file to predict on.
-- `sample_duration`: Length in seconds of each audio segment the model predicts on; defaults to 2s.
-- `hop_duration`: Interval in seconds between two consecutive segments; defaults to 0.3s.
-- `output_dir`: Directory where prediction results are saved; defaults to `./output_dir`.
-
-The example code uses the pre-trained `CNN14` model. To switch to another pre-trained model:
-```python
-from paddleaudio.models.panns import cnn14, cnn10, cnn6
-
-# CNN14
-model = cnn14(pretrained=True, extract_embedding=False)
-# CNN10
-model = cnn10(pretrained=True, extract_embedding=False)
-# CNN6
-model = cnn6(pretrained=True, extract_embedding=False)
-```
-
-Output:
-```
-[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
-```
-
-The scores are saved to an `.npz` file under `output_dir`.
-
-
-### Generating tagging label text
-```shell
-python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
-```
-
-Configurable arguments:
-
-- `tagging_file`: The model prediction result file.
-- `top_k`: Keep the top_k highest-scoring labels from the predictions; defaults to 10.
-- `smooth`: Posterior smoothing of the predictions; defaults to True (smoothing applied).
-- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
-- `label_file`: Text file of the Audioset classes corresponding to the predictions.
-- `output_dir`: Directory where the label text is saved; defaults to `./output_dir`.
-
-Output:
-```
-[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
-[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
-```
-
-The text results are saved to a `.txt` file under `output_dir`.
-
-
-## Tagging label text
-
-The final text output looks as follows.
-The top-k results for each time window of the sample are separated by blank lines. In each block, the first line is the time information: the number marks the start time of the tagging result, expressed as the ratio of the current time `t` to the total audio length `T`; the following k lines are the corresponding labels and scores.
-
-```
-0.0
-Cat: 0.9144676923751831
-Animal: 0.8855036497116089
-Domestic animals, pets: 0.804577112197876
-Meow: 0.7422927021980286
-Music: 0.19959309697151184
-Inside, small room: 0.12550437450408936
-Caterwaul: 0.021584441885352135
-Purr: 0.020247288048267365
-Speech: 0.018197158351540565
-Vehicle: 0.007446660194545984
-
-0.059197544398158296
-Cat: 0.9250872135162354
-Animal: 0.8957151174545288
-Domestic animals, pets: 0.8228275775909424
-Meow: 0.7650775909423828
-Music: 0.20210561156272888
-Inside, small room: 0.12290887534618378
-Caterwaul: 0.029371455311775208
-Purr: 0.018731823191046715
-Speech: 0.017130598425865173
-Vehicle: 0.007748497650027275
-
-0.11839508879631659
-Cat: 0.9336574673652649
-Animal: 0.9111202359199524
-Domestic animals, pets: 0.8349071145057678
-Meow: 0.7761964797973633
-Music: 0.20467285811901093
-Inside, small room: 0.10709915310144424
-Caterwaul: 0.05370649695396423
-Purr: 0.018830426037311554
-Speech: 0.017361722886562347
-Vehicle: 0.006929398979991674
-
-...
-...
-```
-
-The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label predictions made on the audio in real time.
-
-![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
diff --git a/audio/examples/panns/assets/audioset_labels.txt b/audio/examples/panns/assets/audioset_labels.txt
deleted file mode 100644
index 6fccf56a..00000000
--- a/audio/examples/panns/assets/audioset_labels.txt
+++ /dev/null
@@ -1,527 +0,0 @@
-Speech
-Male speech, man speaking
-Female speech, woman speaking
-Child speech, kid speaking
-Conversation
-Narration, monologue
-Babbling
-Speech synthesizer
-Shout
-Bellow
-Whoop
-Yell
-Battle cry
-Children shouting
-Screaming
-Whispering
-Laughter
-Baby laughter
-Giggle
-Snicker
-Belly laugh
-Chuckle, chortle
-Crying, sobbing
-Baby cry, infant cry
-Whimper
-Wail, moan
-Sigh
-Singing
-Choir
-Yodeling
-Chant
-Mantra
-Male singing
-Female singing
-Child singing
-Synthetic singing
-Rapping
-Humming
-Groan
-Grunt
-Whistling
-Breathing
-Wheeze
-Snoring
-Gasp
-Pant
-Snort
-Cough
-Throat clearing
-Sneeze
-Sniff
-Run
-Shuffle
-Walk, footsteps
-Chewing, mastication
-Biting
-Gargling
-Stomach rumble
-Burping, eructation
-Hiccup
-Fart
-Hands
-Finger snapping
-Clapping
-Heart sounds, heartbeat
-Heart murmur
-Cheering
-Applause
-Chatter
-Crowd
-Hubbub, speech noise, speech babble
-Children playing
-Animal
-Domestic animals, pets
-Dog
-Bark
-Yip
-Howl
-Bow-wow
-Growling
-Whimper (dog)
-Cat
-Purr
-Meow
-Hiss
-Caterwaul
-Livestock, farm animals, working animals
-Horse
-Clip-clop
-Neigh, whinny
-Cattle, bovinae
-Moo
-Cowbell
-Pig
-Oink
-Goat
-Bleat
-Sheep
-Fowl
-Chicken, rooster
-Cluck
-Crowing, cock-a-doodle-doo
-Turkey
-Gobble
-Duck
-Quack
-Goose
-Honk
-Wild animals
-Roaring cats (lions, tigers)
-Roar
-Bird
-Bird vocalization, bird call, bird song
-Chirp, tweet
-Squawk
-Pigeon, dove
-Coo
-Crow
-Caw
-Owl
-Hoot
-Bird flight, flapping wings
-Canidae, dogs, wolves
-Rodents, rats, mice
-Mouse
-Patter
-Insect
-Cricket
-Mosquito
-Fly, housefly
-Buzz
-Bee, wasp, etc.
-Frog -Croak -Snake -Rattle -Whale vocalization -Music -Musical instrument -Plucked string instrument -Guitar -Electric guitar -Bass guitar -Acoustic guitar -Steel guitar, slide guitar -Tapping (guitar technique) -Strum -Banjo -Sitar -Mandolin -Zither -Ukulele -Keyboard (musical) -Piano -Electric piano -Organ -Electronic organ -Hammond organ -Synthesizer -Sampler -Harpsichord -Percussion -Drum kit -Drum machine -Drum -Snare drum -Rimshot -Drum roll -Bass drum -Timpani -Tabla -Cymbal -Hi-hat -Wood block -Tambourine -Rattle (instrument) -Maraca -Gong -Tubular bells -Mallet percussion -Marimba, xylophone -Glockenspiel -Vibraphone -Steelpan -Orchestra -Brass instrument -French horn -Trumpet -Trombone -Bowed string instrument -String section -Violin, fiddle -Pizzicato -Cello -Double bass -Wind instrument, woodwind instrument -Flute -Saxophone -Clarinet -Harp -Bell -Church bell -Jingle bell -Bicycle bell -Tuning fork -Chime -Wind chime -Change ringing (campanology) -Harmonica -Accordion -Bagpipes -Didgeridoo -Shofar -Theremin -Singing bowl -Scratching (performance technique) -Pop music -Hip hop music -Beatboxing -Rock music -Heavy metal -Punk rock -Grunge -Progressive rock -Rock and roll -Psychedelic rock -Rhythm and blues -Soul music -Reggae -Country -Swing music -Bluegrass -Funk -Folk music -Middle Eastern music -Jazz -Disco -Classical music -Opera -Electronic music -House music -Techno -Dubstep -Drum and bass -Electronica -Electronic dance music -Ambient music -Trance music -Music of Latin America -Salsa music -Flamenco -Blues -Music for children -New-age music -Vocal music -A capella -Music of Africa -Afrobeat -Christian music -Gospel music -Music of Asia -Carnatic music -Music of Bollywood -Ska -Traditional music -Independent music -Song -Background music -Theme music -Jingle (music) -Soundtrack music -Lullaby -Video game music -Christmas music -Dance music -Wedding music -Happy music -Funny music -Sad music -Tender music -Exciting music -Angry music -Scary music -Wind -Rustling leaves -Wind noise (microphone) -Thunderstorm -Thunder -Water -Rain -Raindrop -Rain on surface -Stream -Waterfall -Ocean -Waves, surf -Steam -Gurgling -Fire -Crackle -Vehicle -Boat, Water vehicle -Sailboat, sailing ship -Rowboat, canoe, kayak -Motorboat, speedboat -Ship -Motor vehicle (road) -Car -Vehicle horn, car horn, honking -Toot -Car alarm -Power windows, electric windows -Skidding -Tire squeal -Car passing by -Race car, auto racing -Truck -Air brake -Air horn, truck horn -Reversing beeps -Ice cream truck, ice cream van -Bus -Emergency vehicle -Police car (siren) -Ambulance (siren) -Fire engine, fire truck (siren) -Motorcycle -Traffic noise, roadway noise -Rail transport -Train -Train whistle -Train horn -Railroad car, train wagon -Train wheels squealing -Subway, metro, underground -Aircraft -Aircraft engine -Jet engine -Propeller, airscrew -Helicopter -Fixed-wing aircraft, airplane -Bicycle -Skateboard -Engine -Light engine (high frequency) -Dental drill, dentist's drill -Lawn mower -Chainsaw -Medium engine (mid frequency) -Heavy engine (low frequency) -Engine knocking -Engine starting -Idling -Accelerating, revving, vroom -Door -Doorbell -Ding-dong -Sliding door -Slam -Knock -Tap -Squeak -Cupboard open or close -Drawer open or close -Dishes, pots, and pans -Cutlery, silverware -Chopping (food) -Frying (food) -Microwave oven -Blender -Water tap, faucet -Sink (filling or washing) -Bathtub (filling or washing) -Hair dryer -Toilet flush -Toothbrush -Electric toothbrush -Vacuum cleaner -Zipper (clothing) -Keys 
jangling -Coin (dropping) -Scissors -Electric shaver, electric razor -Shuffling cards -Typing -Typewriter -Computer keyboard -Writing -Alarm -Telephone -Telephone bell ringing -Ringtone -Telephone dialing, DTMF -Dial tone -Busy signal -Alarm clock -Siren -Civil defense siren -Buzzer -Smoke detector, smoke alarm -Fire alarm -Foghorn -Whistle -Steam whistle -Mechanisms -Ratchet, pawl -Clock -Tick -Tick-tock -Gears -Pulleys -Sewing machine -Mechanical fan -Air conditioning -Cash register -Printer -Camera -Single-lens reflex camera -Tools -Hammer -Jackhammer -Sawing -Filing (rasp) -Sanding -Power tool -Drill -Explosion -Gunshot, gunfire -Machine gun -Fusillade -Artillery fire -Cap gun -Fireworks -Firecracker -Burst, pop -Eruption -Boom -Wood -Chop -Splinter -Crack -Glass -Chink, clink -Shatter -Liquid -Splash, splatter -Slosh -Squish -Drip -Pour -Trickle, dribble -Gush -Fill (with liquid) -Spray -Pump (liquid) -Stir -Boiling -Sonar -Arrow -Whoosh, swoosh, swish -Thump, thud -Thunk -Electronic tuner -Effects unit -Chorus effect -Basketball bounce -Bang -Slap, smack -Whack, thwack -Smash, crash -Breaking -Bouncing -Whip -Flap -Scratch -Scrape -Rub -Roll -Crushing -Crumpling, crinkling -Tearing -Beep, bleep -Ping -Ding -Clang -Squeal -Creak -Rustle -Whir -Clatter -Sizzle -Clicking -Clickety-clack -Rumble -Plop -Jingle, tinkle -Hum -Zing -Boing -Crunch -Silence -Sine wave -Harmonic -Chirp tone -Sound effect -Pulse -Inside, small room -Inside, large room or hall -Inside, public space -Outside, urban or manmade -Outside, rural or natural -Reverberation -Echo -Noise -Environmental noise -Static -Mains hum -Distortion -Sidetone -Cacophony -White noise -Pink noise -Throbbing -Vibration -Television -Radio -Field recording diff --git a/audio/examples/panns/audio_tag.py b/audio/examples/panns/audio_tag.py deleted file mode 100644 index 6f08cd1c..00000000 --- a/audio/examples/panns/audio_tag.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse
-import os
-from typing import List
-
-import numpy as np
-import paddle
-from paddleaudio.backends import load as load_audio
-from paddleaudio.features import melspectrogram
-from paddleaudio.models.panns import cnn14
-from paddleaudio.utils import logger
-
-# yapf: disable
-parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to run prediction on; defaults to gpu.')
-parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
-parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration (in seconds) of tagging samples to predict.')
-parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration (in seconds) between two samples.')
-parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
-args = parser.parse_args()
-# yapf: enable
-
-
-def split(waveform: np.ndarray, win_size: int, hop_size: int):
-    """
-    Split a waveform into N segments.
-    N is determined by win_size and hop_size.
-    """
-    assert isinstance(waveform, np.ndarray)
-    time = []
-    data = []
-    for i in range(0, len(waveform), hop_size):
-        segment = waveform[i:i + win_size]
-        if len(segment) < win_size:
-            segment = np.pad(segment, (0, win_size - len(segment)))
-        data.append(segment)
-        time.append(i / len(waveform))
-    return time, data
-
-
-def batchify(data: List[List[float]],
-             sample_rate: int,
-             batch_size: int,
-             **kwargs):
-    """
-    Extract features from waveforms and create batches.
-    """
-    examples = []
-    for waveform in data:
-        feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
-        examples.append(feats)
-
-    # Separate the data into batches.
-    one_batch = []
-    for example in examples:
-        one_batch.append(example)
-        if len(one_batch) == batch_size:
-            yield one_batch
-            one_batch = []
-    if one_batch:
-        yield one_batch
-
-
-def predict(model, data: List[List[float]], sample_rate: int,
-            batch_size: int=1):
-    """
-    Use a pre-trained model to make predictions.
-    """
-    batches = batchify(data, sample_rate, batch_size)
-    results = None
-    model.eval()
-    for batch in batches:
-        # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
-        feats = paddle.to_tensor(batch).unsqueeze(1)
-        audioset_scores = model(feats)
-        if results is None:
-            results = audioset_scores.numpy()
-        else:
-            results = np.concatenate((results, audioset_scores.numpy()))
-
-    return results
-
-
-if __name__ == '__main__':
-    paddle.set_device(args.device)
-    model = cnn14(pretrained=True, extract_embedding=False)
-    waveform, sr = load_audio(args.wav, sr=None)
-    time, data = split(waveform,
-                       int(args.sample_duration * sr),
-                       int(args.hop_duration * sr))
-    results = predict(model, data, sr, batch_size=8)
-
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-    time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
-    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
-    np.savez(output_file, time=time, scores=results)
-    logger.info(f'Saved tagging results to {output_file}')
diff --git a/audio/examples/panns/parse_result.py b/audio/examples/panns/parse_result.py
deleted file mode 100644
index 056c573f..00000000
--- a/audio/examples/panns/parse_result.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import ast -import os -from typing import Dict - -import numpy as np -from paddleaudio.utils import logger - -# yapf: disable -parser = argparse.ArgumentParser(__doc__) -parser.add_argument('--tagging_file', type=str, required=True, help='') -parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.') -parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.') -parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.') -parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.') -parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.') -args = parser.parse_args() -# yapf: enable - - -def smooth(results: np.ndarray, win_size: int): - """ - Execute posterior smoothing in-place. - """ - for i in range(len(results) - 1, -1, -1): - if i < win_size - 1: - left = 0 - else: - left = i + 1 - win_size - results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1) - - -def generate_topk_label(k: int, label_map: Dict, result: np.ndarray): - """ - Return top k result. - """ - result = np.asarray(result) - topk_idx = (-result).argsort()[:k] - - ret = '' - for idx in topk_idx: - label, score = label_map[idx], result[idx] - ret += f'{label}: {score}\n' - return ret - - -if __name__ == "__main__": - label_map = {} - with open(args.label_file, 'r') as f: - for i, l in enumerate(f.readlines()): - label_map[i] = l.strip() - - results = np.load(args.tagging_file, allow_pickle=True) - times, scores = results['time'], results['scores'] - - if args.smooth: - logger.info('Posterior smoothing...') - smooth(scores, win_size=args.smooth_size) - - if not os.path.exists(args.output_dir): - os.makedirs(args.output_dir) - output_file = os.path.join( - args.output_dir, - os.path.basename(args.tagging_file).split('.')[0] + '.txt') - with open(output_file, 'w') as f: - for time, score in zip(times, scores): - f.write(f'{time}\n') - f.write(generate_topk_label(args.top_k, label_map, score) + '\n') - - logger.info(f'Saved tagging labels to {output_file}') diff --git a/audio/paddleaudio/__init__.py b/audio/paddleaudio/__init__.py deleted file mode 100644 index 2685cf57..00000000 --- a/audio/paddleaudio/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from .backends import *
-from .features import *
diff --git a/audio/paddleaudio/datasets/aishell.py b/audio/paddleaudio/datasets/aishell.py
deleted file mode 100644
index d84d9876..00000000
--- a/audio/paddleaudio/datasets/aishell.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import codecs
-import collections
-import json
-import os
-from typing import Dict
-
-from paddle.io import Dataset
-from tqdm import tqdm
-
-from ..backends import load as load_audio
-from ..utils.download import decompress
-from ..utils.download import download_and_decompress
-from ..utils.env import DATA_HOME
-from ..utils.log import logger
-from .dataset import feat_funcs
-
-__all__ = ['AISHELL1']
-
-
-class AISHELL1(Dataset):
-    """
-    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
-    It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
-    smart home, autonomous driving, and industrial production. All recordings were
-    made in a quiet indoor environment, using 3 different devices at the same time: a
-    high-fidelity microphone (44.1kHz, 16-bit), an Android mobile phone (16kHz, 16-bit),
-    and an iOS mobile phone (16kHz, 16-bit). The high-fidelity recordings were re-sampled
-    to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas
-    in China were invited to participate in the recording. The manual transcription
-    accuracy is above 95%, ensured by professional speech annotation and strict
-    quality inspection. The corpus is divided into training, development and testing
-    sets.
-
-    Reference:
-        AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
-        https://arxiv.org/abs/1709.05522
-    """
-
-    archieves = [
-        {
-            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
-            'md5': '2f494334227864a8a8fec932999db9d8',
-        },
-    ]
-    text_meta = os.path.join('data_aishell', 'transcript',
-                             'aishell_transcript_v0.8.txt')
-    utt_info = collections.namedtuple('META_INFO',
-                                      ('file_path', 'utt_id', 'text'))
-    audio_path = os.path.join('data_aishell', 'wav')
-    manifest_path = os.path.join('data_aishell', 'manifest')
-    subset = ['train', 'dev', 'test']
-
-    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
-        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
-            self.subset, subset)
-        self.subset = subset
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self._data = self._get_data()
-        super(AISHELL1, self).__init__()
-
-    def _get_text_info(self) -> Dict[str, str]:
-        ret = {}
-        with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
-            for line in rf.readlines()[1:]:
-                utt_id, text = map(str.strip, line.split(' ',
-                                                         1))  # utt_id, text
-                ret.update({utt_id: ''.join(text.split())})
-        return ret
-
-    def _get_data(self):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-            # Extract *wav from *.tar.gz.
-            for root, _, files in os.walk(
-                    os.path.join(DATA_HOME, self.audio_path)):
-                for file in files:
-                    if file.endswith('.tar.gz'):
-                        decompress(os.path.join(root, file))
-                        os.remove(os.path.join(root, file))
-
-        text_info = self._get_text_info()
-
-        data = []
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.wav'):
-                    utt_id = os.path.splitext(file)[0]
-                    if utt_id not in text_info:  # Skip utt_ids that have no label
-                        continue
-                    text = text_info[utt_id]
-                    file_path = os.path.join(root, file)
-                    data.append(self.utt_info(file_path, utt_id, text))
-
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] = getattr(sample, field)
-
-        waveform, sr = load_audio(
-            sample[0])  # The first element of sample is file path
-        feat_func = feat_funcs[self.feat_type]
-        feat = feat_func(
-            waveform, sample_rate=sr,
-            **self.feat_config) if feat_func else waveform
-        record.update({'feat': feat, 'duration': len(waveform) / sr})
-        return record
-
-    def create_manifest(self, prefix='manifest'):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
-            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
-
-        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
-                                     f'{prefix}.{self.subset}')
-        with codecs.open(manifest_file, 'w', 'utf-8') as f:
-            for idx in tqdm(range(len(self))):
-                record = self._convert_to_record(idx)
-                record_line = json.dumps(
-                    {
-                        'utt': record['utt_id'],
-                        'feat': record['file_path'],
-                        'feat_shape': (record['duration'], ),
-                        'text': record['text']
-                    },
-                    ensure_ascii=False)
-                f.write(record_line + '\n')
-        logger.info(f'Manifest file {manifest_file} created.')
-
-    def __getitem__(self, idx):
-        record = self._convert_to_record(idx)
-        return tuple(record.values())
-
-    def __len__(self):
-        return len(self._data)
diff --git a/audio/paddleaudio/datasets/dcase.py
b/audio/paddleaudio/datasets/dcase.py deleted file mode 100644 index 47b0c915..00000000 --- a/audio/paddleaudio/datasets/dcase.py +++ /dev/null @@ -1,298 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import collections -import os -from typing import List -from typing import Tuple - -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from .dataset import AudioClassificationDataset - -__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes'] - - -class UrbanAcousticScenes(AudioClassificationDataset): - """ - TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from - 12 European cities in 10 different acoustic scenes using 4 different devices. - Additionally, synthetic data for 11 mobile devices was created based on the original - recordings. Of the 12 cities, two are present only in the evaluation set. - - Reference: - A multi-device dataset for urban acoustic scene classification - https://arxiv.org/abs/1807.09840 - """ - - source_url = 'https://zenodo.org/record/3819968/files/' - base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development' - archieves = [ - { - 'url': source_url + base_name + '.meta.zip', - 'md5': '6eae9db553ce48e4ea246e34e50a3cf5', - }, - { - 'url': source_url + base_name + '.audio.1.zip', - 'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8', - }, - { - 'url': source_url + base_name + '.audio.2.zip', - 'md5': '4310a13cc2943d6ce3f70eba7ba4c784', - }, - { - 'url': source_url + base_name + '.audio.3.zip', - 'md5': 'ed38956c4246abb56190c1e9b602b7b8', - }, - { - 'url': source_url + base_name + '.audio.4.zip', - 'md5': '97ab8560056b6816808dedc044dcc023', - }, - { - 'url': source_url + base_name + '.audio.5.zip', - 'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb', - }, - { - 'url': source_url + base_name + '.audio.6.zip', - 'md5': 'fbf856a3a86fff7520549c899dc94372', - }, - { - 'url': source_url + base_name + '.audio.7.zip', - 'md5': '0dbffe7b6e45564da649378723284062', - }, - { - 'url': source_url + base_name + '.audio.8.zip', - 'md5': 'bb6f77832bf0bd9f786f965beb251b2e', - }, - { - 'url': source_url + base_name + '.audio.9.zip', - 'md5': 'a65596a5372eab10c78e08a0de797c9e', - }, - { - 'url': source_url + base_name + '.audio.10.zip', - 'md5': '2ad595819ffa1d56d2de4c7ed43205a6', - }, - { - 'url': source_url + base_name + '.audio.11.zip', - 'md5': '0ad29f7040a4e6a22cfd639b3a6738e5', - }, - { - 'url': source_url + base_name + '.audio.12.zip', - 'md5': 'e5f4400c6b9697295fab4cf507155a2f', - }, - { - 'url': source_url + base_name + '.audio.13.zip', - 'md5': '8855ab9f9896422746ab4c5d89d8da2f', - }, - { - 'url': source_url + base_name + '.audio.14.zip', - 'md5': '092ad744452cd3e7de78f988a3d13020', - }, - { - 'url': source_url + base_name + '.audio.15.zip', - 'md5': '4b5eb85f6592aebf846088d9df76b420', - }, - { - 'url': source_url + base_name + '.audio.16.zip', - 'md5': '2e0a89723e58a3836be019e6996ae460', - }, - ] - label_list = [ - 'airport', 'shopping_mall', 
'metro_station', 'street_pedestrian',
-        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
-    ]
-
-    meta = os.path.join(base_name, 'meta.csv')
-    meta_info = collections.namedtuple('META_INFO', (
-        'filename', 'scene_label', 'identifier', 'source_label'))
-    subset_meta = {
-        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
-        'dev':
-        os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
-        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
-    }
-    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
-                                              ('filename', 'scene_label'))
-    audio_path = os.path.join(base_name, 'audio')
-
-    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that the user wants to extract from an audio file.
-        """
-        files, labels = self._get_data(mode)
-        super(UrbanAcousticScenes, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self, subset: str=None,
-                       skip_header: bool=True) -> List[collections.namedtuple]:
-        if subset is None:
-            meta_file = self.meta
-            meta_info = self.meta_info
-        else:
-            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
-            meta_file = self.subset_meta[subset]
-            meta_info = self.subset_meta_info
-
-        ret = []
-        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
-            lines = rf.readlines()[1:] if skip_header else rf.readlines()
-            for line in lines:
-                ret.append(meta_info(*line.strip().split('\t')))
-        return ret
-
-    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        meta_info = self._get_meta_info(subset=mode, skip_header=True)
-
-        files = []
-        labels = []
-        for sample in meta_info:
-            filename, label = sample[:2]
-            filename = os.path.basename(filename)
-            target = self.label_list.index(label)
-
-            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
-            labels.append(int(target))
-
-        return files, labels
-
-
-class UrbanAudioVisualScenes(AudioClassificationDataset):
-    """
-    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
-    and video recordings from 12 European cities in 10 different scenes.
-    This dataset consists of 10-second audio and video segments from 10
-    acoustic scenes. The total amount of audio in the development set is 34 hours.
-
-    Reference:
-        A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
-        https://arxiv.org/abs/2011.00030
-    """
-
-    source_url = 'https://zenodo.org/record/4477542/files/'
-    base_name = 'TAU-urban-audio-visual-scenes-2021-development'
-
-    archieves = [
-        {
-            'url': source_url + base_name + '.meta.zip',
-            'md5': '76e3d7ed5291b118372e06379cb2b490',
-        },
-        {
-            'url': source_url + base_name + '.audio.1.zip',
-            'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
-        },
-        {
-            'url': source_url + base_name + '.audio.2.zip',
-            'md5': '7fd6bb63127f5785874a55aba4e77aa5',
-        },
-        {
-            'url': source_url + base_name + '.audio.3.zip',
-            'md5': '61396bede29d7c8c89729a01a6f6b2e2',
-        },
-        {
-            'url': source_url + base_name + '.audio.4.zip',
-            'md5': '6ddac89717fcf9c92c451868eed77fe1',
-        },
-        {
-            'url': source_url + base_name + '.audio.5.zip',
-            'md5': 'af4820756cdf1a7d4bd6037dc034d384',
-        },
-        {
-            'url': source_url + base_name + '.audio.6.zip',
-            'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
-        },
-        {
-            'url': source_url + base_name + '.audio.7.zip',
-            'md5': '2be39a76aeed704d5929d020a2909efd',
-        },
-        {
-            'url': source_url + base_name + '.audio.8.zip',
-            'md5': '972d8afe0874720fc2f28086e7cb22a9',
-        },
-    ]
-    label_list = [
-        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
-        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
-    ]
-
-    meta_base_path = os.path.join(base_name, base_name + '.meta')
-    meta = os.path.join(meta_base_path, 'meta.csv')
-    meta_info = collections.namedtuple('META_INFO', (
-        'filename_audio', 'filename_video', 'scene_label', 'identifier'))
-    subset_meta = {
-        'train':
-        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
-        'dev':
-        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
-        'test':
-        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
-    }
-    subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
-        'filename_audio', 'filename_video', 'scene_label'))
-    audio_path = os.path.join(base_name, 'audio')
-
-    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that the user wants to extract from an audio file.
-        """
-        files, labels = self._get_data(mode)
-        super(UrbanAudioVisualScenes, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self, subset: str=None,
-                       skip_header: bool=True) -> List[collections.namedtuple]:
-        if subset is None:
-            meta_file = self.meta
-            meta_info = self.meta_info
-        else:
-            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
- meta_file = self.subset_meta[subset] - meta_info = self.subset_meta_info - - ret = [] - with open(os.path.join(DATA_HOME, meta_file), 'r') as rf: - lines = rf.readlines()[1:] if skip_header else rf.readlines() - for line in lines: - ret.append(meta_info(*line.strip().split('\t'))) - return ret - - def _get_data(self, mode: str) -> Tuple[List[str], List[int]]: - if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ - not os.path.isfile(os.path.join(DATA_HOME, self.meta)): - download_and_decompress(self.archieves, - os.path.join(DATA_HOME, self.base_name)) - - meta_info = self._get_meta_info(subset=mode, skip_header=True) - - files = [] - labels = [] - for sample in meta_info: - filename, _, label = sample[:3] - filename = os.path.basename(filename) - target = self.label_list.index(label) - - files.append(os.path.join(DATA_HOME, self.audio_path, filename)) - labels.append(int(target)) - - return files, labels diff --git a/audio/paddleaudio/datasets/librispeech.py b/audio/paddleaudio/datasets/librispeech.py deleted file mode 100644 index c3b3c83d..00000000 --- a/audio/paddleaudio/datasets/librispeech.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import codecs -import collections -import json -import os -from typing import Dict - -from paddle.io import Dataset -from tqdm import tqdm - -from ..backends import load as load_audio -from ..utils.download import download_and_decompress -from ..utils.env import DATA_HOME -from ..utils.log import logger -from .dataset import feat_funcs - -__all__ = ['LIBRISPEECH'] - - -class LIBRISPEECH(Dataset): - """ - LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech, - prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is - derived from read audiobooks from the LibriVox project, and has been carefully - segmented and aligned. 
-
-    Reference:
-        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
-        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
-    """
-
-    source_url = 'http://www.openslr.org/resources/12/'
-    archieves = [
-        {
-            'url': source_url + 'train-clean-100.tar.gz',
-            'md5': '2a93770f6d5c6c964bc36631d331a522',
-        },
-        {
-            'url': source_url + 'train-clean-360.tar.gz',
-            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
-        },
-        {
-            'url': source_url + 'train-other-500.tar.gz',
-            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
-        },
-        {
-            'url': source_url + 'dev-clean.tar.gz',
-            'md5': '42e2234ba48799c1f50f24a7926300a1',
-        },
-        {
-            'url': source_url + 'dev-other.tar.gz',
-            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
-        },
-        {
-            'url': source_url + 'test-clean.tar.gz',
-            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
-        },
-        {
-            'url': source_url + 'test-other.tar.gz',
-            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
-        },
-    ]
-    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
-    utt_info = collections.namedtuple('META_INFO', (
-        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
-    audio_path = 'LibriSpeech'
-    manifest_path = os.path.join('LibriSpeech', 'manifest')
-    subset = [
-        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
-        'dev-other', 'test-clean', 'test-other'
-    ]
-
-    def __init__(self,
-                 subset: str='train-clean-100',
-                 feat_type: str='raw',
-                 **kwargs):
-        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
-            self.subset, subset)
-        self.subset = subset
-        self.feat_type = feat_type
-        self.feat_config = kwargs
-        self._data = self._get_data()
-        super(LIBRISPEECH, self).__init__()
-
-    def _get_speaker_info(self) -> Dict[str, str]:
-        ret = {}
-        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
-            for line in rf.readlines():
-                if ';' in line:  # Skip the dataset abstract
-                    continue
-                spk_id, gender = map(str.strip,
-                                     line.split('|')[:2])  # spk_id, gender
-                ret.update({spk_id: gender})
-        return ret
-
-    def _get_text_info(self, trans_file) -> Dict[str, str]:
-        ret = {}
-        with open(trans_file, 'r') as rf:
-            for line in rf.readlines():
-                utt_id, text = map(str.strip, line.split(' ',
-                                                         1))  # utt_id, text
-                ret.update({utt_id: text})
-        return ret
-
-    def _get_data(self):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
-            not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
-            download_and_decompress(self.archieves, DATA_HOME,
-                                    len(self.archieves))
-
-        # Speaker info
-        speaker_info = self._get_speaker_info()
-
-        # Text info
-        text_info = {}
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.trans.txt'):
-                    text_info.update(
-                        self._get_text_info(os.path.join(root, file)))
-
-        data = []
-        for root, _, files in os.walk(
-                os.path.join(DATA_HOME, self.audio_path, self.subset)):
-            for file in files:
-                if file.endswith('.flac'):
-                    utt_id = os.path.splitext(file)[0]
-                    spk_id = utt_id.split('-')[0]
-                    if utt_id not in text_info \
-                        or spk_id not in speaker_info:  # Skip samples with incomplete data
-                        continue
-                    file_path = os.path.join(root, file)
-                    text = text_info[utt_id]
-                    spk_gender = speaker_info[spk_id]
-                    data.append(
-                        self.utt_info(file_path, utt_id, text, spk_id,
-                                      spk_gender))
-
-        return data
-
-    def _convert_to_record(self, idx: int):
-        sample = self._data[idx]
-
-        record = {}
-        # To show all fields in a namedtuple: `type(sample)._fields`
-        for field in type(sample)._fields:
-            record[field] = getattr(sample, field)
-
-        waveform, sr = load_audio(
-            sample[0])  # The first element of sample is file path
-        feat_func = feat_funcs[self.feat_type]
-        feat = feat_func(
-            waveform, sample_rate=sr,
-            **self.feat_config) if feat_func else waveform
-        record.update({'feat': feat, 'duration': len(waveform) / sr})
-        return record
-
-    def create_manifest(self, prefix='manifest'):
-        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
-            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
-
-        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
-                                     f'{prefix}.{self.subset}')
-        with codecs.open(manifest_file, 'w', 'utf-8') as f:
-            for idx in tqdm(range(len(self))):
-                record = self._convert_to_record(idx)
-                record_line = json.dumps(
-                    {
-                        'utt': record['utt_id'],
-                        'feat': record['file_path'],
-                        'feat_shape': (record['duration'], ),
-                        'text': record['text'],
-                        'spk': record['spk_id'],
-                        'gender': record['spk_gender'],
-                    },
-                    ensure_ascii=False)
-                f.write(record_line + '\n')
-        logger.info(f'Manifest file {manifest_file} created.')
-
-    def __getitem__(self, idx):
-        record = self._convert_to_record(idx)
-        return tuple(record.values())
-
-    def __len__(self):
-        return len(self._data)
diff --git a/audio/paddleaudio/datasets/ravdess.py b/audio/paddleaudio/datasets/ravdess.py
deleted file mode 100644
index d886aad2..00000000
--- a/audio/paddleaudio/datasets/ravdess.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import collections
-import os
-import random
-from typing import List
-from typing import Tuple
-
-from ..utils.download import download_and_decompress
-from ..utils.env import DATA_HOME
-from .dataset import AudioClassificationDataset
-
-__all__ = ['RAVDESS']
-
-
-class RAVDESS(AudioClassificationDataset):
-    """
-    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
-    lexically-matched statements in a neutral North American accent. Speech emotions
-    include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
-    Each expression is produced at two levels of emotional intensity (normal, strong),
-    with an additional neutral expression.
-
-    Reference:
-        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
-        A dynamic, multimodal set of facial and vocal expressions in North American English
-        https://doi.org/10.1371/journal.pone.0196391
-    """
-
-    archieves = [
-        {
-            'url':
-            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
-            'md5':
-            '5411230427d67a21e18aa4d466e6d1b9',
-        },
-        {
-            'url':
-            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
-            'md5':
-            'bc696df654c87fed845eb13823edef8a',
-        },
-    ]
-    label_list = [
-        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
-        'surprised'
-    ]
-    meta_info = collections.namedtuple(
-        'META_INFO', ('modality', 'vocal_channel', 'emotion',
-                      'emotion_intensity', 'statement', 'repitition', 'actor'))
-    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
-    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
-
-    def __init__(self,
-                 mode='train',
-                 seed=0,
-                 n_folds=5,
-                 split=1,
-                 feat_type='raw',
-                 **kwargs):
-        """
-        Args:
-            mode (:obj:`str`, `optional`, defaults to `train`):
-                It identifies the dataset mode (train or dev).
-            seed (:obj:`int`, `optional`, defaults to 0):
-                Set the random seed to shuffle samples.
-            n_folds (:obj:`int`, `optional`, defaults to 5):
-                Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
-            split (:obj:`int`, `optional`, defaults to 1):
-                It specifies the fold of the dev dataset.
-            feat_type (:obj:`str`, `optional`, defaults to `raw`):
-                It identifies the feature type that the user wants to extract from an audio file.
-        """
-        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
-        files, labels = self._get_data(mode, seed, n_folds, split)
-        super(RAVDESS, self).__init__(
-            files=files, labels=labels, feat_type=feat_type, **kwargs)
-
-    def _get_meta_info(self, files) -> List[collections.namedtuple]:
-        ret = []
-        for file in files:
-            basename_without_extend = os.path.basename(file)[:-4]
-            ret.append(self.meta_info(*basename_without_extend.split('-')))
-        return ret
-
-    def _get_data(self, mode, seed, n_folds,
-                  split) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(self.speech_path) and not os.path.isdir(
-                self.song_path):
-            download_and_decompress(self.archieves, DATA_HOME)
-
-        wav_files = []
-        for root, _, files in os.walk(self.speech_path):
-            for file in files:
-                if file.endswith('.wav'):
-                    wav_files.append(os.path.join(root, file))
-
-        for root, _, files in os.walk(self.song_path):
-            for file in files:
-                if file.endswith('.wav'):
-                    wav_files.append(os.path.join(root, file))
-
-        random.seed(seed)  # Shuffle samples to split the data
-        random.shuffle(
-            wav_files
-        )  # Make sure the same seed is used to create the train and dev datasets
-        meta_info = self._get_meta_info(wav_files)
-
-        files = []
-        labels = []
-        n_samples_per_fold = len(meta_info) // n_folds
-        for idx, sample in enumerate(meta_info):
-            _, _, emotion, _, _, _, _ = sample
-            target = int(emotion) - 1
-            fold = idx // n_samples_per_fold + 1
-
-            if mode == 'train' and int(fold) != split:
-                files.append(wav_files[idx])
-                labels.append(target)
-
-            if mode != 'train' and int(fold) == split:
-                files.append(wav_files[idx])
-                labels.append(target)
-
-        return files, labels
diff --git a/audio/setup.py b/audio/setup.py
deleted file mode 100644
index e0ac9818..00000000
--- a/audio/setup.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import setuptools - -# set the version here -version = '0.1.0a' - -with open("README.md", "r") as fh: - long_description = fh.read() - -setuptools.setup( - name="paddleaudio", - version=version, - author="", - author_email="", - description="PaddleAudio, in development", - long_description=long_description, - long_description_content_type="text/markdown", - url="", - packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - python_requires='>=3.6', - install_requires=[ - 'numpy >= 1.15.0', - 'scipy >= 1.0.0', - 'resampy >= 0.2.2', - 'soundfile >= 0.9.0', - 'colorlog', - 'pathos', - ], - extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2'] - } # for dev only, install: pip install -e .[dev] -) diff --git a/audio/test/README.md b/audio/test/README.md deleted file mode 100644 index e5dbc537..00000000 --- a/audio/test/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# PaddleAudio Testing Guide - - - - -# Testing -First clone a version of the project by -``` -git clone https://github.com/PaddlePaddle/models.git - -``` -Then install the project in your virtual environment. -``` -cd models/PaddleAudio -python setup.py bdist_wheel -pip install -e .[dev] -``` -The requirements for testing will be installed along with PaddleAudio. - -Now run -``` -pytest test -``` - -If it goes well, you will see outputs like these: -``` -platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1 -rootdir: ./models/PaddleAudio -plugins: hydra-core-1.0.6 -collected 16 items - -test/unit_test/test_backend.py ........... [ 68%] -test/unit_test/test_features.py ..... [100%] - -==================================================== warnings summary ==================================================== -. -. -. --- Docs: https://docs.pytest.org/en/stable/warnings.html -============================================ 16 passed, 11 warnings in 6.76s ============================================= -``` diff --git a/audio/test/unit_test/test_backend.py b/audio/test/unit_test/test_backend.py deleted file mode 100644 index 1bf1504e..00000000 --- a/audio/test/unit_test/test_backend.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import librosa -import numpy as np -import paddleaudio -import pytest - -TEST_FILE = './test/data/test_audio.wav' - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / \ - (EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / \ - (EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load(TEST_FILE, sr=16000) - print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}') - return x, r - - -# start testing -x, r = load_audio() -EPS = 1e-8 - - -def test_load(): - s, r = paddleaudio.load(TEST_FILE, sr=16000) - assert r == 16000 - assert s.dtype == 'float32' - - s, r = paddleaudio.load( - TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16') - assert len(s) / r == 2.0 - assert r == 16000 - assert s.dtype == 'int16' - - -def test_depth_convert(): - y = paddleaudio.depth_convert(x, 'int16') - assert len(y) == len(x) - assert y.dtype == 'int16' - assert np.max(y) <= 32767 - assert np.min(y) >= -32768 - assert np.std(y) > EPS - - y = paddleaudio.depth_convert(x, 'int8') - assert len(y) == len(x) - assert y.dtype == 'int8' - assert np.max(y) <= 127 - assert np.min(y) >= -128 - assert np.std(y) > EPS - - -# test case for resample -rs_test_data = [ - (32000, 'kaiser_fast'), - (16000, 'kaiser_fast'), - (8000, 'kaiser_fast'), - (32000, 'kaiser_best'), - (16000, 'kaiser_best'), - (8000, 'kaiser_best'), - (22050, 'kaiser_best'), - (44100, 'kaiser_best'), -] - - -@pytest.mark.parametrize('sr,mode', rs_test_data) -def test_resample(sr, mode): - y = paddleaudio.resample(x, 16000, sr, mode=mode) - factor = sr / 16000 - err = relative_err(len(y), len(x) * factor) - print('err:', err) - assert err < EPS - - -def test_normalize(): - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5) - assert np.max(y) < 0.5 + EPS - - y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0) - assert np.max(y) <= 2.0 + EPS - - y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0) - print('np.std(y):', np.std(y)) - assert np.abs(np.std(y) - 1.0) < EPS - - -if __name__ == '__main__': - test_load() - test_depth_convert() - test_resample(22050, 'kaiser_fast') - test_normalize() diff --git a/audio/test/unit_test/test_features.py b/audio/test/unit_test/test_features.py deleted file mode 100644 index 9e4e29cb..00000000 --- a/audio/test/unit_test/test_features.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import librosa -import numpy as np -import paddleaudio as pa -import pytest - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def load_audio(): - x, r = librosa.load('./test/data/test_audio.wav') - #x,r = librosa.load('../data/test_audio.wav',sr=16000) - return x, r - - -## start testing -x, r = load_audio() -EPS = 1e-8 - - -def relative_err(a, b, real=True): - """compute relative error of two matrices or vectors""" - if real: - return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2)) - else: - err = np.sum((a.real - b.real)**2) / ( - EPS + np.sum(a.real**2) + np.sum(b.real**2)) - err += np.sum((a.imag - b.imag)**2) / ( - EPS + np.sum(a.imag**2) + np.sum(b.imag**2)) - - return err - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram(): - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=False, ) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_melspectrogram_db(): - - a = pa.melspectrogram( - x, - window_size=512, - sr=16000, - hop_length=320, - n_mels=64, - fmin=50, - to_db=True, - ref=1.0, - amin=1e-10, - top_db=None) - b = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_stft(): - a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512) - b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512) - assert a.shape == b.shape - assert relative_err(a, b, real=False) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_split_frames(): - a = librosa.util.frame(x, frame_length=512, hop_length=320) - b = pa.split_frames(x, frame_length=512, hop_length=320) - assert relative_err(a, b) < EPS - - -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_mfcc(): - kwargs = { - 'window_size': 512, - 'hop_length': 320, - 'n_mels': 64, - 'fmin': 50, - 'to_db': False - } - a = pa.mfcc( - x, - #sample_rate=16000, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - S = librosa.feature.melspectrogram( - x, - sr=16000, - n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, - fmin=50) - b = librosa.feature.mfcc( - x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0) - assert relative_err(a, b) < EPS - - -if __name__ == '__main__': - test_melspectrogram() - test_melspectrogram_db() - test_stft() - test_split_frames() - test_mfcc() diff --git a/audio/examples/sound_classification/README.md b/examples/esc50/README.md similarity index 85% rename from audio/examples/sound_classification/README.md rename to examples/esc50/README.md index 86a54cb3..e148efd0 100644 --- a/audio/examples/sound_classification/README.md +++ b/examples/esc50/README.md @@ -21,22 +21,17 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 ### 模型训练 -以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。关于如何使用`paddle.distributed.launch`启动多卡训练,请查看[单机多卡训练](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html)。 +以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 -单卡训练: +启动训练: ```shell -$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir 
./checkpoint --save_freq 10 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -多卡训练: -```shell -$ unset CUDA_VISIBLE_DEVICES -$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10 -``` - -可支持配置的参数: +`local/train.py` 脚本中可支持配置的参数: - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 +- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。 - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -48,8 +43,8 @@ $ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_ 示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: ```python from model import SoundClassifier -from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14, cnn10, cnn6 +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.models import cnn14, cnn10, cnn6 # CNN14 backbone = cnn14(pretrained=True, extract_embedding=True) @@ -67,12 +62,14 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) ### 模型预测 ```shell -python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -可支持配置的参数: +`local/predict.py` 脚本中可支持配置的参数: + - `device`: 选用什么设备进行训练,可选cpu或gpu,默认为gpu。如使用gpu训练则参数gpus指定GPU卡号。 - `wav`: 指定预测的音频文件。 +- `gpu_feat`: 选择是否用gpu加速提取音频特征,默认为False。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 diff --git a/audio/examples/sound_classification/deploy/python/predict.py b/examples/esc50/cls0/local/deploy/python/predict.py similarity index 97% rename from audio/examples/sound_classification/deploy/python/predict.py rename to examples/esc50/cls0/local/deploy/python/predict.py index a99b8980..13730acd 100644 --- a/audio/examples/sound_classification/deploy/python/predict.py +++ b/examples/esc50/cls0/local/deploy/python/predict.py @@ -16,11 +16,12 @@ import os import numpy as np from paddle import inference -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets import ESC50 -from paddleaudio.features import melspectrogram from scipy.special import softmax +from paddlespeech.cls.backends import load as load_audio +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.features import melspectrogram + # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") diff --git a/audio/examples/sound_classification/export_model.py b/examples/esc50/cls0/local/export_model.py similarity index 94% rename from audio/examples/sound_classification/export_model.py rename to examples/esc50/cls0/local/export_model.py index 1be7b27a..87dd527c 100644 --- a/audio/examples/sound_classification/export_model.py +++ b/examples/esc50/cls0/local/export_model.py @@ -16,8 +16,9 @@ import os import paddle from model import SoundClassifier -from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 + +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.models.panns import cnn14 # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/audio/examples/sound_classification/model.py b/examples/esc50/cls0/local/model.py similarity index 100% rename from audio/examples/sound_classification/model.py rename to examples/esc50/cls0/local/model.py diff --git a/audio/examples/sound_classification/predict.py b/examples/esc50/cls0/local/predict.py similarity index 66% rename from 
audio/examples/sound_classification/predict.py rename to examples/esc50/cls0/local/predict.py index 30d141cd..58187677 100644 --- a/audio/examples/sound_classification/predict.py +++ b/examples/esc50/cls0/local/predict.py @@ -12,29 +12,41 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse +import ast import numpy as np import paddle import paddle.nn.functional as F from model import SoundClassifier -from paddleaudio.backends import load as load_audio -from paddleaudio.datasets import ESC50 -from paddleaudio.features import melspectrogram -from paddleaudio.models.panns import cnn14 + +from paddlespeech.cls.backends import load as load_audio +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.features import LogMelSpectrogram +from paddlespeech.cls.features import melspectrogram +from paddlespeech.cls.models.panns import cnn14 # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.") parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") args = parser.parse_args() # yapf: enable -def extract_features(file: str, **kwargs): +def extract_features(file: str, gpu_feat: bool=False, + **kwargs) -> paddle.Tensor: waveform, sr = load_audio(file, sr=None) - feat = melspectrogram(waveform, sr, **kwargs).transpose() + if gpu_feat: + feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) + else: + feat = melspectrogram(waveform, sr, **kwargs).transpose() + feat = np.expand_dims(feat, 0) + feat = paddle.to_tensor(feat) return feat @@ -47,8 +59,7 @@ if __name__ == '__main__': model.set_state_dict(paddle.load(args.checkpoint)) model.eval() - feat = np.expand_dims(extract_features(args.wav), 0) - feat = paddle.to_tensor(feat) + feat = extract_features(args.wav, args.gpu_feat) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() diff --git a/audio/examples/sound_classification/train.py b/examples/esc50/cls0/local/train.py similarity index 80% rename from audio/examples/sound_classification/train.py rename to examples/esc50/cls0/local/train.py index e3b5e2ae..67215535 100644 --- a/audio/examples/sound_classification/train.py +++ b/examples/esc50/cls0/local/train.py @@ -12,19 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
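The predict.py hunk above introduces a second, device-resident feature path next to the original numpy one. As a minimal standalone sketch of the two paths (assuming only the imports shown in that hunk; `cat.wav` is a placeholder path):

```python
import numpy as np
import paddle

from paddlespeech.cls.backends import load as load_audio
from paddlespeech.cls.features import LogMelSpectrogram, melspectrogram

waveform, sr = load_audio('cat.wav', sr=None)  # placeholder audio file

# CPU path: numpy melspectrogram of shape (n_mels, frames) -> (1, frames, n_mels).
feat = paddle.to_tensor(np.expand_dims(melspectrogram(waveform, sr).transpose(), 0))

# GPU path: LogMelSpectrogram is a paddle.nn.Layer, so feature extraction
# runs on whatever device the program is placed on.
extractor = LogMelSpectrogram(sr=sr, hop_length=320)
feat = paddle.transpose(extractor(paddle.to_tensor(waveform).unsqueeze(0)), [0, 2, 1])
```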
import argparse +import ast import os import paddle from model import SoundClassifier -from paddleaudio.datasets import ESC50 -from paddleaudio.models.panns import cnn14 -from paddleaudio.utils import logger -from paddleaudio.utils import Timer + +from paddlespeech.cls.datasets import ESC50 +from paddlespeech.cls.features import LogMelSpectrogram +from paddlespeech.cls.models.panns import cnn14 +from paddlespeech.cls.utils import logger +from paddlespeech.cls.utils import Timer # yapf: disable parser = argparse.ArgumentParser(__doc__) parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") +parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.") parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") @@ -48,8 +52,13 @@ if __name__ == "__main__": learning_rate=args.learning_rate, parameters=model.parameters()) criterion = paddle.nn.loss.CrossEntropyLoss() - train_ds = ESC50(mode='train', feat_type='melspectrogram') - dev_ds = ESC50(mode='dev', feat_type='melspectrogram') + if args.gpu_feat: + train_ds = ESC50(mode='train') + dev_ds = ESC50(mode='dev') + feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320) + else: + train_ds = ESC50(mode='train', feat_type='melspectrogram') + dev_ds = ESC50(mode='dev', feat_type='melspectrogram') train_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) @@ -71,7 +80,16 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - feats, labels = batch + if args.gpu_feat: + waveforms, labels = batch + feats = feature_extractor( + waveforms + ) # Need a padding when lengths of waveforms differ in a batch. 
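+                # Assumption: ESC-50 clips share one fixed duration (5-second
+                # recordings), so the raw waveforms stack into a batch without
+                # padding; datasets with variable-length clips would need the
+                # padding mentioned above before this call.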
+ feats = paddle.transpose(feats, + [0, 2, 1]) # To [N, length, n_mels] + else: + feats, labels = batch + logits = model(feats) loss = criterion(logits, labels) @@ -126,7 +144,13 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - feats, labels = batch + if args.gpu_feat: + waveforms, labels = batch + feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) + else: + feats, labels = batch + logits = model(feats) preds = paddle.argmax(logits, axis=1) diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh new file mode 100644 index 00000000..867cfb5d --- /dev/null +++ b/examples/esc50/cls0/path.sh @@ -0,0 +1,14 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=deepspeech2 +export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh new file mode 100755 index 00000000..17f2fd99 --- /dev/null +++ b/examples/esc50/cls0/run.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e +source path.sh + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +if [ ${ngpu} == 0 ];then + device=cpu +else + device=gpu +fi + +stage=$1 +stop_stage=100 + +num_epochs=50 +batch_size=16 +ckpt_dir=./checkpoint +save_freq=10 +gpu_feat=True + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + if [ ${ngpu} -gt 1 ]; then + python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ + --epochs ${num_epochs} \ + --gpu_feat ${gpu_feat} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} + else + python local/train.py \ + --device ${device} \ + --epochs ${num_epochs} \ + --gpu_feat ${gpu_feat} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} + fi +fi + +audio_file=~/cat.wav +ckpt=./checkpoint/epoch_50/model.pdparams +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + python local/predict.py \ + --device ${device} \ + --wav ${audio_file} \ + --gpu_feat ${gpu_feat} \ + --top_k 10 \ + --checkpoint ${ckpt} +fi + +exit 0 \ No newline at end of file diff --git a/paddlespeech/cls/__init__.py b/paddlespeech/cls/__init__.py index 185a92b8..2685cf57 100644 --- a/paddlespeech/cls/__init__.py +++ b/paddlespeech/cls/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
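For reference, the recipe above is driven entirely by run.sh stages; typical invocations mirror the README (stage 1 trains, stage 2 predicts):

```shell
# Stage 1: fine-tune on ESC-50; with several visible GPUs the script
# dispatches to paddle.distributed.launch automatically.
CUDA_VISIBLE_DEVICES=0 ./run.sh 1
CUDA_VISIBLE_DEVICES=0,1 ./run.sh 1

# Stage 2: predict labels for the audio file configured in run.sh.
CUDA_VISIBLE_DEVICES=0 ./run.sh 2
```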
+from .backends import * +from .features import * diff --git a/audio/paddleaudio/backends/__init__.py b/paddlespeech/cls/backends/__init__.py similarity index 100% rename from audio/paddleaudio/backends/__init__.py rename to paddlespeech/cls/backends/__init__.py diff --git a/audio/paddleaudio/backends/audio.py b/paddlespeech/cls/backends/audio.py similarity index 100% rename from audio/paddleaudio/backends/audio.py rename to paddlespeech/cls/backends/audio.py diff --git a/audio/paddleaudio/datasets/__init__.py b/paddlespeech/cls/datasets/__init__.py similarity index 73% rename from audio/paddleaudio/datasets/__init__.py rename to paddlespeech/cls/datasets/__init__.py index e1d2bbc5..8d2fdab4 100644 --- a/audio/paddleaudio/datasets/__init__.py +++ b/paddlespeech/cls/datasets/__init__.py @@ -11,24 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .aishell import AISHELL1 -from .dcase import UrbanAcousticScenes -from .dcase import UrbanAudioVisualScenes from .esc50 import ESC50 from .gtzan import GTZAN -from .librispeech import LIBRISPEECH -from .ravdess import RAVDESS from .tess import TESS from .urban_sound import UrbanSound8K __all__ = [ - 'AISHELL1', - 'LIBRISPEECH', 'ESC50', 'UrbanSound8K', 'GTZAN', - 'UrbanAcousticScenes', - 'UrbanAudioVisualScenes', - 'RAVDESS', 'TESS', ] diff --git a/audio/paddleaudio/datasets/dataset.py b/paddlespeech/cls/datasets/dataset.py similarity index 100% rename from audio/paddleaudio/datasets/dataset.py rename to paddlespeech/cls/datasets/dataset.py diff --git a/audio/paddleaudio/datasets/esc50.py b/paddlespeech/cls/datasets/esc50.py similarity index 100% rename from audio/paddleaudio/datasets/esc50.py rename to paddlespeech/cls/datasets/esc50.py diff --git a/audio/paddleaudio/datasets/gtzan.py b/paddlespeech/cls/datasets/gtzan.py similarity index 100% rename from audio/paddleaudio/datasets/gtzan.py rename to paddlespeech/cls/datasets/gtzan.py diff --git a/audio/paddleaudio/datasets/tess.py b/paddlespeech/cls/datasets/tess.py similarity index 100% rename from audio/paddleaudio/datasets/tess.py rename to paddlespeech/cls/datasets/tess.py diff --git a/audio/paddleaudio/datasets/urban_sound.py b/paddlespeech/cls/datasets/urban_sound.py similarity index 100% rename from audio/paddleaudio/datasets/urban_sound.py rename to paddlespeech/cls/datasets/urban_sound.py diff --git a/audio/paddleaudio/features/__init__.py b/paddlespeech/cls/features/__init__.py similarity index 96% rename from audio/paddleaudio/features/__init__.py rename to paddlespeech/cls/features/__init__.py index 8503cfab..d8ac7c4b 100644 --- a/audio/paddleaudio/features/__init__.py +++ b/paddlespeech/cls/features/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. 
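Since the package moved from paddleaudio to paddlespeech.cls, downstream imports change accordingly; a quick sanity check against the dataset list kept in `__all__` above (dataset archives are downloaded on first use):

```python
from paddlespeech.cls.datasets import ESC50, GTZAN, TESS, UrbanSound8K

# Same constructor arguments as before the move (see local/train.py).
train_ds = ESC50(mode='train', feat_type='melspectrogram')
print(len(train_ds), len(ESC50.label_list))
```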
from .augment import * from .core import * +from .spectrum import * diff --git a/audio/paddleaudio/features/augment.py b/paddlespeech/cls/features/augment.py similarity index 98% rename from audio/paddleaudio/features/augment.py rename to paddlespeech/cls/features/augment.py index 7556bb3c..6f903bdb 100644 --- a/audio/paddleaudio/features/augment.py +++ b/paddlespeech/cls/features/augment.py @@ -15,8 +15,9 @@ from typing import List import numpy as np from numpy import ndarray as array -from paddleaudio.backends import depth_convert -from paddleaudio.utils import ParameterError + +from ..backends import depth_convert +from ..utils import ParameterError __all__ = [ 'depth_augment', diff --git a/audio/paddleaudio/features/core.py b/paddlespeech/cls/features/core.py similarity index 99% rename from audio/paddleaudio/features/core.py rename to paddlespeech/cls/features/core.py index dd25724f..d3c2e290 100644 --- a/audio/paddleaudio/features/core.py +++ b/paddlespeech/cls/features/core.py @@ -21,9 +21,10 @@ import numpy as np import scipy from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided -from paddleaudio.utils import ParameterError from scipy.signal import get_window +from ..utils import ParameterError + __all__ = [ 'stft', 'mfcc', @@ -293,6 +294,7 @@ def stft(x: array, This function is aligned with librosa. """ _check_audio(x) + # By default, use the entire frame if win_length is None: win_length = n_fft @@ -397,7 +399,7 @@ def mfcc(x, This function is NOT strictly aligned with librosa. The following example shows how to get the same result with librosa: - # paddleaudioe mfcc: + # mfcc: kwargs = { 'window_size':512, 'hop_length':320, diff --git a/paddlespeech/cls/features/spectrum.py b/paddlespeech/cls/features/spectrum.py new file mode 100644 index 00000000..d70e60fb --- /dev/null +++ b/paddlespeech/cls/features/spectrum.py @@ -0,0 +1,461 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from functools import partial +from typing import Optional +from typing import Union + +import paddle +import paddle.nn as nn + +from .window import get_window + +__all__ = [ + 'Spectrogram', + 'MelSpectrogram', + 'LogMelSpectrogram', +] + + +def hz_to_mel(freq: Union[paddle.Tensor, float], + htk: bool=False) -> Union[paddle.Tensor, float]: + """Convert Hz to Mels. + Parameters: + freq: the input tensor of arbitrary shape, or a single floating point number. + htk: use HTK formula to do the conversion. + The default value is False. + Returns: + The frequencies represented in Mel-scale. 
+ """ + + if htk: + if isinstance(freq, paddle.Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, paddle.Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, paddle.Tensor], + htk: bool=False) -> Union[float, paddle.Tensor]: + """Convert mel bin numbers to frequencies. + Parameters: + mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. + htk: use HTK formula to do the conversion. + Returns: + The frequencies represented in hz. + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, paddle.Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str=paddle.float32): + """Compute mel frequencies. + Parameters: + n_mels(int): number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk(bool): whether to use htk formula. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in Mel-scale + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): + """Compute fourier frequencies. + Parameters: + sr(int): the audio sample rate. + n_fft(float): the number of fft bins. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in hz. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute fbank matrix. + Parameters: + sr(int): the audio sample rate. + n_fft(int): the number of fft bins. + n_mels(int): the number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. 
+ f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk: whether to use htk formula. + return_complex(bool): whether to return complex matrix. If True, the matrix will + be complex type. Otherwise, the real and image part will be stored in the last + axis of returned tensor. + dtype(str): the datatype of the returned fbank matrix. + Returns: + The fbank matrix of shape (n_mels, int(1+n_fft//2)). + Shape: + output: (n_mels, int(1+n_fft//2)) + """ + + if f_max is None: + f_max = float(sr) / 2 + + # Initialize the weights + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies( + n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) + + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = paddle.maximum( + paddle.zeros_like(lower), paddle.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) + + return weights + + +def power_to_db(magnitude: paddle.Tensor, + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=80.0) -> paddle.Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. + The function computes the scaling ``10 * log10(x / ref)`` in a numerically + stable way. + Parameters: + magnitude(Tensor): the input magnitude tensor of any shape. + ref_value(float): the reference value. If smaller than 1.0, the db level + of the signal will be pulled up accordingly. Otherwise, the db level + is pushed down. + amin(float): the minimum value of input magnitude, below which the input + magnitude is clipped(to amin). + top_db(float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + Returns: + The spectrogram in log-scale. + shape: + input: any shape + output: same as input + """ + if amin <= 0: + raise Exception("amin must be strictly positive") + + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") + + ones = paddle.ones_like(magnitude) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) + + if top_db is not None: + if top_db < 0: + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) + + return log_spec + + +class Spectrogram(nn.Layer): + def __init__(self, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + dtype: str=paddle.float32): + """Compute spectrogram of a given signal, typically an audio waveform. + The spectorgram is defined as the complex norm of the short-time + Fourier transformation. + Parameters: + n_fft(int): the number of frequency components of the discrete Fourier transform. 
+            The default value is 512.
+        hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+            The default value is None.
+        win_length(int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+            The default value is None.
+        window(str): the name of the window function applied to the signal before the Fourier transform.
+            The following window names are supported: 'hamming','hann','kaiser','gaussian',
+            'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
+            The default value is 'hann'.
+        center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+            If False, frame t begins at x[t * hop_length].
+            The default value is True.
+        pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+            and 'constant'. The default value is 'reflect'.
+        dtype(str): the data type of the input and the window.
+        Notes:
+            The Spectrogram transform relies on the STFT to compute the spectrogram.
+            By default, the weights are not learnable. To fine-tune the Fourier coefficients,
+            set stop_gradient=False before training.
+            For more information, see paddle.signal.stft().
+        """
+        super(Spectrogram, self).__init__()
+
+        if win_length is None:
+            win_length = n_fft
+
+        fft_window = get_window(window, win_length, fftbins=True, dtype=dtype)
+        self._stft = partial(
+            paddle.signal.stft,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=fft_window,
+            center=center,
+            pad_mode=pad_mode)
+
+    def forward(self, x):
+        stft = self._stft(x)
+        spectrogram = paddle.square(paddle.abs(stft))
+        return spectrogram
+
+
+class MelSpectrogram(nn.Layer):
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=0.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 dtype: str=paddle.float32):
+        """Compute the melspectrogram of a given signal, typically an audio waveform.
+        The melspectrogram is also known as the filterbank or fbank feature in the audio community.
+        It is computed by multiplying the spectrogram with the Mel filter bank matrix.
+        Parameters:
+            sr(int): the audio sample rate.
+                The default value is 22050.
+            n_fft(int): the number of frequency components of the discrete Fourier transform.
+                The default value is 512.
+            hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+                The default value is None.
+            win_length(int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+                The default value is None.
+            window(str): the name of the window function applied to the signal before the Fourier transform.
+                The following window names are supported: 'hamming','hann','kaiser','gaussian',
+                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
+                The default value is 'hann'.
+            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+                If False, frame t begins at x[t * hop_length].
+                The default value is True.
+            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+                and 'constant'.
+                The default value is 'reflect'.
+            n_mels(int): the number of mel bins.
+            f_min(float): the lower cut-off frequency, below which the filter response is zero.
+            f_max(float): the upper cut-off frequency, above which the filter response is zero.
+            htk(bool): whether to use the HTK formula in computing the fbank matrix.
+            norm(str|float): the normalization type in computing the fbank matrix. Slaney-style is used by default.
+                You can specify norm=1.0/2.0 to use customized p-norm normalization.
+            dtype(str): the datatype of the fbank matrix used in the transform. Use float64 to increase numerical
+                accuracy. Note that the final transform will be conducted in float32 regardless of the dtype of the fbank matrix.
+        """
+        super(MelSpectrogram, self).__init__()
+
+        self._spectrogram = Spectrogram(
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            center=center,
+            pad_mode=pad_mode,
+            dtype=dtype)
+        self.n_mels = n_mels
+        self.f_min = f_min
+        self.f_max = f_max
+        self.htk = htk
+        self.norm = norm
+        if f_max is None:
+            f_max = sr // 2
+        fbank_matrix = compute_fbank_matrix(
+            sr=sr,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)  # float64 for better numerical results
+        # Register the matrix as a buffer directly; assigning the attribute
+        # first and then calling register_buffer() with the same name raises
+        # a KeyError ("attribute already exists").
+        self.register_buffer('fbank_matrix', fbank_matrix)
+
+    def forward(self, x):
+        spect_feature = self._spectrogram(x)
+        mel_feature = paddle.matmul(self.fbank_matrix, spect_feature)
+        return mel_feature
+
+
+class LogMelSpectrogram(nn.Layer):
+    def __init__(self,
+                 sr: int=22050,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 n_mels: int=64,
+                 f_min: float=0.0,
+                 f_max: Optional[float]=None,
+                 htk: bool=False,
+                 norm: Union[str, float]='slaney',
+                 ref_value: float=1.0,
+                 amin: float=1e-10,
+                 top_db: Optional[float]=80.0,
+                 dtype: str=paddle.float32):
+        """Compute the log-mel-spectrogram (also known as LogFBank) feature of a given signal,
+        typically an audio waveform.
+        Parameters:
+            sr(int): the audio sample rate.
+                The default value is 22050.
+            n_fft(int): the number of frequency components of the discrete Fourier transform.
+                The default value is 512.
+            hop_length(int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+                The default value is None.
+            win_length(int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+                The default value is None.
+            window(str): the name of the window function applied to the signal before the Fourier transform.
+                The following window names are supported: 'hamming','hann','kaiser','gaussian',
+                'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
+                The default value is 'hann'.
+            center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+                If False, frame t begins at x[t * hop_length].
+                The default value is True.
+            pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+                and 'constant'.
+                The default value is 'reflect'.
+            n_mels(int): the number of mel bins.
+            f_min(float): the lower cut-off frequency, below which the filter response is zero.
+            f_max(float): the upper cut-off frequency, above which the filter response is zero.
+            htk(bool): whether to use the HTK formula in computing the fbank matrix.
+            norm(str|float): the normalization type in computing the fbank matrix. Slaney-style is used by default.
+                You can specify norm=1.0/2.0 to use customized p-norm normalization.
+            ref_value(float): the reference value. If smaller than 1.0, the db level of the signal
+                will be pulled up accordingly. Otherwise, the db level is pushed down.
+            amin(float): the minimum value of input magnitude, below which the input magnitude is
+                clipped (to amin). For numerical stability, set amin to a larger value, e.g., 1e-3.
+            top_db(float): the maximum db value of the resulting spectrum, above which the
+                spectrum is clipped (to top_db).
+            dtype(str): the datatype of the fbank matrix used in the transform. Use float64 to increase numerical
+                accuracy. Note that the final transform will be conducted in float32 regardless of the dtype of the fbank matrix.
+        """
+        super(LogMelSpectrogram, self).__init__()
+
+        self._melspectrogram = MelSpectrogram(
+            sr=sr,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=window,
+            center=center,
+            pad_mode=pad_mode,
+            n_mels=n_mels,
+            f_min=f_min,
+            f_max=f_max,
+            htk=htk,
+            norm=norm,
+            dtype=dtype)
+
+        self.ref_value = ref_value
+        self.amin = amin
+        self.top_db = top_db
+
+    def forward(self, x):
+        mel_feature = self._melspectrogram(x)
+        log_mel_feature = power_to_db(
+            mel_feature,
+            ref_value=self.ref_value,
+            amin=self.amin,
+            top_db=self.top_db)
+        return log_mel_feature
diff --git a/paddlespeech/cls/features/window.py b/paddlespeech/cls/features/window.py
new file mode 100644
index 00000000..629989fc
--- /dev/null
+++ b/paddlespeech/cls/features/window.py
@@ -0,0 +1,415 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import paddle
+from paddle import Tensor
+
+__all__ = [
+    'get_window',
+]
+
+
+def _cat(a: List[Tensor], data_type: str) -> Tensor:
+    l = [paddle.to_tensor(_a, data_type) for _a in a]
+    return paddle.concat(l)
+
+
+def _acosh(x: Union[Tensor, float]) -> Tensor:
+    if isinstance(x, float):
+        return math.log(x + math.sqrt(x**2 - 1))
+    return paddle.log(x + paddle.sqrt(paddle.square(x) - 1))
+
+
+def _extend(M: int, sym: bool) -> Tuple[int, bool]:
+    """Extend window by 1 sample if needed for DFT-even symmetry."""
+    if not sym:
+        return M + 1, True
+    else:
+        return M, False
+
+
+def _len_guards(M: int) -> bool:
+    """Handle small or incorrect window lengths."""
+    if int(M) != M or M < 0:
+        raise ValueError('Window length M must be a non-negative integer')
+
+    return M <= 1
+
+
+def _truncate(w: Tensor, needed: bool) -> Tensor:
+    """Truncate window by 1 sample if needed for DFT-even symmetry."""
+    if needed:
+        return w[:-1]
+    else:
+        return w
+
+
+def general_gaussian(M: int, p, sig, sym: bool=True,
+                     dtype: str='float64') -> Tensor:
+    """Compute a window with a generalized Gaussian shape.
+    This function is consistent with scipy.signal.windows.general_gaussian().
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0
+    w = paddle.exp(-0.5 * paddle.abs(n / sig)**(2 * p))
+
+    return _truncate(w, needs_trunc)
+
+
+def general_hamming(M: int, alpha: float, sym: bool=True,
+                    dtype: str='float64') -> Tensor:
+    """Compute a generalized Hamming window.
+    This function is consistent with scipy.signal.windows.general_hamming().
+    """
+    return general_cosine(M, [alpha, 1.
- alpha], sym, dtype=dtype) + + +def taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Taylor window. + The Taylor window taper function approximates the Dolph-Chebyshev window's + constant sidelobe level for a parameterized number of near-in sidelobes. + Parameters: + M(int): window size + nbar, sil, norm: the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + # Original text uses a negative sidelobe level parameter and then negates + # it in the calculation of B. To keep consistent with other methods we + # assume the sidelobe level parameter to be positive. + B = 10**(sll / 20) + A = _acosh(B) / math.pi + s2 = nbar**2 / (A**2 + (nbar - 0.5)**2) + ma = paddle.arange(1, nbar, dtype=dtype) + + Fm = paddle.empty((nbar - 1, ), dtype=dtype) + signs = paddle.empty_like(ma) + signs[::2] = 1 + signs[1::2] = -1 + m2 = ma * ma + for mi in range(len(ma)): + numer = signs[mi] * paddle.prod(1 - m2[mi] / s2 / (A**2 + (ma - 0.5)**2 + )) + if mi == 0: + denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1:]) + elif mi == len(ma) - 1: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) + else: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) * paddle.prod(1 - m2[ + mi] / m2[mi + 1:]) + + Fm[mi] = numer / denom + + def W(n): + return 1 + 2 * paddle.matmul( + Fm.unsqueeze(0), + paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2. + 0.5) / M)) + + w = W(paddle.arange(0, M, dtype=dtype)) + + # normalize (Note that this is not described in the original text [1]) + if norm: + scale = 1.0 / W((M - 1) / 2) + w *= scale + w = w.squeeze() + return _truncate(w, needs_trunc) + + +def general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + +def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hamming window. + The Hamming window is a taper formed by using a raised cosine with + non-zero endpoints, optimized to minimize the nearest side lobe. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_hamming(M, 0.54, sym, dtype=dtype) + + +def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Hann window. + The Hann window is a taper formed by using a raised cosine or sine-squared + with ends that touch zero. + Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + return general_hamming(M, 0.5, sym, dtype=dtype) + + +def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Tukey window. + The Tukey window is also known as a tapered cosine window. 
+ Parameters: + M(int): window size + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + + if alpha <= 0: + return paddle.ones((M, ), dtype=dtype) + elif alpha >= 1.0: + return hann(M, sym=sym) + + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + width = int(alpha * (M - 1) / 2.0) + n1 = n[0:width + 1] + n2 = n[width + 1:M - width - 1] + n3 = n[M - width - 1:] + + w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) + w2 = paddle.ones(n2.shape, dtype=dtype) + w3 = 0.5 * (1 + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / + (M - 1)))) + w = paddle.concat([w1, w2, w3]) + + return _truncate(w, needs_trunc) + + +def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Kaiser window. + The Kaiser window is a taper formed by using a Bessel function. + Parameters: + M(int): window size. + beta(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + Returns: + Tensor: the window tensor + """ + raise NotImplementedError() + + +def gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a Gaussian window. + The Gaussian widows has a Gaussian shape defined by the standard deviation(std). + Parameters: + M(int): window size. + std(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + sig2 = 2 * std * std + w = paddle.exp(-n**2 / sig2) + + return _truncate(w, needs_trunc) + + +def exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. + Parameters: + M(int): window size. + tau(float): the window-specific parameter. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if sym and center is not None: + raise ValueError("If sym==True, center must be None.") + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + if center is None: + center = (M - 1) / 2 + + n = paddle.arange(0, M, dtype=dtype) + w = paddle.exp(-paddle.abs(n - center) / tau) + + return _truncate(w, needs_trunc) + + +def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a triangular window. + Parameters: + M(int): window size. + sym(bool):whether to return symmetric window. + The default value is True + dtype(str): the datatype of returned tensor. + Returns: + Tensor: the window tensor + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) + if M % 2 == 0: + w = (2 * n - 1.0) / M + w = paddle.concat([w, w[::-1]]) + else: + w = 2 * n / (M + 1.0) + w = paddle.concat([w, w[-2::-1]]) + + return _truncate(w, needs_trunc) + + +def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: + """Compute a Bohman window. + The Bohman window is the autocorrelation of a cosine window. 
+    Parameters:
+        M(int): window size.
+        sym(bool): whether to return a symmetric window.
+            The default value is True.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        Tensor: the window tensor.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+
+    fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1])
+    w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin(
+        math.pi * fac)
+    w = _cat([0, w, 0], dtype)
+
+    return _truncate(w, needs_trunc)
+
+
+def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a Blackman window.
+    The Blackman window is a taper formed by using the first three terms of
+    a summation of cosines. It was designed to have close to the minimal
+    leakage possible. It is close to optimal, only slightly worse than a
+    Kaiser window.
+    Parameters:
+        M(int): window size.
+        sym(bool): whether to return a symmetric window.
+            The default value is True.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        Tensor: the window tensor.
+    """
+    return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype)
+
+
+def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor:
+    """Compute a window with a simple cosine shape.
+    Parameters:
+        M(int): window size.
+        sym(bool): whether to return a symmetric window.
+            The default value is True.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        Tensor: the window tensor.
+    """
+    if _len_guards(M):
+        return paddle.ones((M, ), dtype=dtype)
+    M, needs_trunc = _extend(M, sym)
+    w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + .5))
+
+    return _truncate(w, needs_trunc)
+
+
+def get_window(window: Union[str, Tuple[str, float]],
+               win_length: int,
+               fftbins: bool=True,
+               dtype: str='float64') -> Tensor:
+    """Return a window of a given length and type.
+    Parameters:
+        window(str|(str,float)): the type of window to create.
+        win_length(int): the number of samples in the window.
+        fftbins(bool): If True, create a "periodic" window. Otherwise,
+            create a "symmetric" window, for use in filter design.
+        dtype(str): the datatype of the returned tensor.
+    Returns:
+        The window represented as a tensor.
+    """
+    sym = not fftbins
+
+    args = ()
+    if isinstance(window, tuple):
+        winstr = window[0]
+        if len(window) > 1:
+            args = window[1:]
+    elif isinstance(window, str):
+        if window in ['gaussian', 'exponential']:
+            raise ValueError("The '" + window + "' window needs one or "
+                             "more parameters -- pass a tuple.")
+        else:
+            winstr = window
+    else:
+        raise ValueError("%s as window type is not supported." %
+                         str(type(window)))
+
+    try:
+        winfunc = eval(winstr)
+    except NameError as e:
+        # eval() raises NameError (not KeyError) for an unknown name.
+        raise ValueError("Unknown window type.") from e
+
+    params = (win_length, ) + args
+    kwargs = {'sym': sym}
+    return winfunc(*params, dtype=dtype, **kwargs)
diff --git a/audio/paddleaudio/models/__init__.py b/paddlespeech/cls/models/__init__.py
similarity index 96%
rename from audio/paddleaudio/models/__init__.py
rename to paddlespeech/cls/models/__init__.py
index 185a92b8..4bfadda1 100644
--- a/audio/paddleaudio/models/__init__.py
+++ b/paddlespeech/cls/models/__init__.py
@@ -11,3 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
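For orientation, a minimal usage sketch of the `get_window` factory defined above, assuming this patch is installed so the module imports as `paddlespeech.cls.features.window` (the window names come from this file; the values are illustrative):

```python
# Minimal usage sketch of the window helpers added above (illustrative only).
from paddlespeech.cls.features.window import get_window

hann_win = get_window('hann', 512)                    # periodic window, e.g. for STFT
sym_win = get_window('blackman', 512, fftbins=False)  # symmetric window, for filter design
gauss_win = get_window(('gaussian', 7.0), 512)        # parameterized windows take a tuple
print(hann_win.shape, gauss_win.dtype)                # [512] paddle.float64
```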
+from .panns import * diff --git a/audio/paddleaudio/models/panns.py b/paddlespeech/cls/models/panns.py similarity index 100% rename from audio/paddleaudio/models/panns.py rename to paddlespeech/cls/models/panns.py diff --git a/audio/paddleaudio/utils/__init__.py b/paddlespeech/cls/utils/__init__.py similarity index 100% rename from audio/paddleaudio/utils/__init__.py rename to paddlespeech/cls/utils/__init__.py diff --git a/audio/paddleaudio/utils/download.py b/paddlespeech/cls/utils/download.py similarity index 100% rename from audio/paddleaudio/utils/download.py rename to paddlespeech/cls/utils/download.py diff --git a/audio/paddleaudio/utils/env.py b/paddlespeech/cls/utils/env.py similarity index 66% rename from audio/paddleaudio/utils/env.py rename to paddlespeech/cls/utils/env.py index 59c6b621..340c1e4b 100644 --- a/audio/paddleaudio/utils/env.py +++ b/paddlespeech/cls/utils/env.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. ''' -This module is used to store environmental variables in PaddleAudio. -PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the -├ default value through the PPAUDIO_HOME environment variable. +This module is used to store environmental variables in PaddleSpeech. +PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the +├ default value through the PACKAGE_HOME environment variable. ├─ MODEL_HOME --> Store model files. └─ DATA_HOME --> Store automatically downloaded datasets. ''' @@ -25,29 +25,29 @@ def _get_user_home(): return os.path.expanduser('~') -def _get_ppaudio_home(): - if 'PPAUDIO_HOME' in os.environ: - home_path = os.environ['PPAUDIO_HOME'] +def _get_package_home(): + if 'PACKAGE_HOME' in os.environ: + home_path = os.environ['PACKAGE_HOME'] if os.path.exists(home_path): if os.path.isdir(home_path): return home_path else: raise RuntimeError( - 'The environment variable PPAUDIO_HOME {} is not a directory.'. + 'The environment variable PACKAGE_HOME {} is not a directory.'. 
                    format(home_path))
         else:
             return home_path
 
-    return os.path.join(_get_user_home(), '.paddleaudio')
+    return os.path.join(_get_user_home(), '.paddlespeech')
 
 
 def _get_sub_home(directory):
-    home = os.path.join(_get_ppaudio_home(), directory)
+    home = os.path.join(_get_package_home(), directory)
     if not os.path.exists(home):
         os.makedirs(home)
     return home
 
 
 USER_HOME = _get_user_home()
-PPAUDIO_HOME = _get_ppaudio_home()
-MODEL_HOME = _get_sub_home('models')
+PACKAGE_HOME = _get_package_home()
+MODEL_HOME = _get_sub_home('pretrained_models')
 DATA_HOME = _get_sub_home('datasets')
diff --git a/audio/paddleaudio/utils/error.py b/paddlespeech/cls/utils/error.py
similarity index 100%
rename from audio/paddleaudio/utils/error.py
rename to paddlespeech/cls/utils/error.py
diff --git a/audio/paddleaudio/utils/log.py b/paddlespeech/cls/utils/log.py
similarity index 95%
rename from audio/paddleaudio/utils/log.py
rename to paddlespeech/cls/utils/log.py
index 5e7db68a..89d1e5b1 100644
--- a/audio/paddleaudio/utils/log.py
+++ b/paddlespeech/cls/utils/log.py
@@ -55,13 +55,13 @@ log_config = {
 
 class Logger(object):
     '''
-    Default logger in PaddleAudio
+    Default logger in PaddleSpeechCls
    Args:
-        name(str) : Logger name, default is 'PaddleAudio'
+        name(str) : Logger name, default is 'PaddleSpeechCls'
     '''
 
     def __init__(self, name: str=None):
-        name = 'PaddleAudio' if not name else name
+        name = 'PaddleSpeechCls' if not name else name
         self.logger = logging.getLogger(name)
 
         for key, conf in log_config.items():
diff --git a/audio/paddleaudio/utils/time.py b/paddlespeech/cls/utils/time.py
similarity index 100%
rename from audio/paddleaudio/utils/time.py
rename to paddlespeech/cls/utils/time.py
diff --git a/setup.py b/setup.py
index 310eed1e..d07db788 100644
--- a/setup.py
+++ b/setup.py
@@ -173,7 +173,7 @@ setup_info = dict(
     # Package info
     packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*',
-                                    'paddleaudio*', 'third_party*', 'tools*')),
+                                    'third_party*', 'tools*')),
     zip_safe=True,
     classifiers=[
         'Development Status :: 3 - Alpha',

From eb68b3d80079dd21eb92f9f43f2f803efebd3783 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Wed, 24 Nov 2021 20:40:28 +0800
Subject: [PATCH 2/9] Add paddlespeech.cls and esc50 example.

---
 examples/esc50/cls0/path.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh
index 867cfb5d..38a242a4 100644
--- a/examples/esc50/cls0/path.sh
+++ b/examples/esc50/cls0/path.sh
@@ -9,6 +9,3 @@ export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=deepspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

From bdb3ce23ee2a0a80418d51c072c80afc6ca85992 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 25 Nov 2021 13:32:36 +0800
Subject: [PATCH 3/9] Add paddlespeech.cls and esc50 example.
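Patch 3 below replaces the boolean `gpu_feat` switch with an explicit `feat_backend` choice. A condensed sketch of the two extraction paths it selects between, mirroring the diff that follows (the helper names and tensor layouts come from this patch; the wrapper function itself is illustrative):

```python
import numpy as np
import paddle
from paddlespeech.cls.features import LogMelSpectrogram, melspectrogram

def extract(waveform: np.ndarray, sr: int, feat_backend: str = 'numpy') -> paddle.Tensor:
    if feat_backend == 'numpy':
        # numpy path: compute the mel spectrogram on the CPU, then wrap it.
        feat = melspectrogram(waveform, sr).transpose()   # -> (time, n_mels)
        return paddle.to_tensor(np.expand_dims(feat, 0))  # -> (1, time, n_mels)
    # paddle path: a paddle.nn.Layer extractor that can run on the GPU.
    extractor = LogMelSpectrogram(sr=sr)
    feat = extractor(paddle.to_tensor(waveform).unsqueeze(0))
    return paddle.transpose(feat, [0, 2, 1])              # -> (1, time, n_mels)
```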
---
 examples/esc50/README.md              |  4 ++--
 examples/esc50/cls0/local/predict.py  | 18 +++++++++---------
 examples/esc50/cls0/local/train.py    | 25 ++++++++++++-------------
 examples/esc50/cls0/run.sh            |  8 ++++----
 paddlespeech/cls/features/spectrum.py |  8 ++++----
 paddlespeech/cls/utils/env.py         | 12 ++++++------
 paddlespeech/cls/utils/log.py         |  6 +++---
 7 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/examples/esc50/README.md b/examples/esc50/README.md
index e148efd0..6ac10b3a 100644
--- a/examples/esc50/README.md
+++ b/examples/esc50/README.md
@@ -31,7 +31,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 
 Configurable parameters of the `local/train.py` script:
 
 - `device`: Device to use for training, either cpu or gpu; defaults to gpu. When training on gpu, the gpus parameter specifies the GPU card IDs.
-- `gpu_feat`: Whether to use the gpu to accelerate audio feature extraction; defaults to False.
+- `feat_backend`: Backend used to extract features, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
 - `epochs`: Number of training epochs; defaults to 50.
 - `learning_rate`: Learning rate for fine-tuning; defaults to 5e-5.
 - `batch_size`: Batch size; adjust it according to available GPU memory, and lower it if memory runs out; defaults to 16.
@@ -69,7 +69,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
 
 - `device`: Device to use for training, either cpu or gpu; defaults to gpu. When training on gpu, the gpus parameter specifies the GPU card IDs.
 - `wav`: Audio file to run prediction on.
-- `gpu_feat`: Whether to use the gpu to accelerate audio feature extraction; defaults to False.
+- `feat_backend`: Backend used to extract features, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
 - `top_k`: Show scores of the top k predicted labels; defaults to 1.
 - `checkpoint`: Checkpoint file of model parameters.
 
diff --git a/examples/esc50/cls0/local/predict.py b/examples/esc50/cls0/local/predict.py
index 58187677..a6e38a35 100644
--- a/examples/esc50/cls0/local/predict.py
+++ b/examples/esc50/cls0/local/predict.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import ast
 
 import numpy as np
 import paddle
@@ -29,24 +28,25 @@ from paddlespeech.cls.models.panns import cnn14
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
 parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
-parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
 parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
 args = parser.parse_args()
 # yapf: enable
 
 
-def extract_features(file: str, gpu_feat: bool=False,
+def extract_features(file: str, feat_backend: str='numpy',
                      **kwargs) -> paddle.Tensor:
     waveform, sr = load_audio(file, sr=None)
-    if gpu_feat:
-        feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs)
-        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
-        feat = paddle.transpose(feat, [0, 2, 1])
-    else:
+
+    if feat_backend == 'numpy':
         feat = melspectrogram(waveform, sr, **kwargs).transpose()
         feat = np.expand_dims(feat, 0)
         feat = paddle.to_tensor(feat)
+    else:
+        feature_extractor = LogMelSpectrogram(sr=sr, **kwargs)
+        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
+        feat = paddle.transpose(feat, [0, 2, 1])
 
     return feat
 
@@ -59,7 +59,7 @@ if __name__ == '__main__':
     model.set_state_dict(paddle.load(args.checkpoint))
     model.eval()
 
-    feat = extract_features(args.wav, args.gpu_feat)
+    feat = extract_features(args.wav, args.feat_backend)
     logits = model(feat)
     probs = F.softmax(logits, axis=1).numpy()
 
diff --git a/examples/esc50/cls0/local/train.py b/examples/esc50/cls0/local/train.py
index 67215535..7a030187 100644
--- a/examples/esc50/cls0/local/train.py
+++ b/examples/esc50/cls0/local/train.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import ast
 import os
 
 import paddle
@@ -28,7 +27,7 @@ from paddlespeech.cls.utils import Timer
 parser = argparse.ArgumentParser(__doc__)
 parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
 parser.add_argument("--epochs", type=int, default=50, help="Number of epochs for fine-tuning.")
-parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
+parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
 parser.add_argument("--batch_size", type=int, default=16, help="Total number of examples in a training batch.")
 parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
@@ -52,13 +51,13 @@ if __name__ == "__main__":
         learning_rate=args.learning_rate, parameters=model.parameters())
     criterion = paddle.nn.loss.CrossEntropyLoss()
 
-    if args.gpu_feat:
-        train_ds = ESC50(mode='train')
-        dev_ds = ESC50(mode='dev')
-        feature_extractor = LogMelSpectrogram(sr=16000, hop_length=320)
-    else:
+    if args.feat_backend == 'numpy':
         train_ds = ESC50(mode='train', feat_type='melspectrogram')
         dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+    else:
+        train_ds = ESC50(mode='train')
+        dev_ds = ESC50(mode='dev')
+        feature_extractor = LogMelSpectrogram(sr=16000)
 
     train_sampler = paddle.io.DistributedBatchSampler(
         train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
@@ -80,15 +79,15 @@ if __name__ == "__main__":
         num_corrects = 0
         num_samples = 0
         for batch_idx, batch in enumerate(train_loader):
-            if args.gpu_feat:
+            if args.feat_backend == 'numpy':
+                feats, labels = batch
+            else:
                 waveforms, labels = batch
                 feats = feature_extractor(
                     waveforms
                 )  # Padding is needed when waveform lengths differ within a batch.
feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] - else: - feats, labels = batch logits = model(feats) @@ -144,12 +143,12 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - if args.gpu_feat: + if args.feat_backend == 'numpy': + feats, labels = batch + else: waveforms, labels = batch feats = feature_extractor(waveforms) feats = paddle.transpose(feats, [0, 2, 1]) - else: - feats, labels = batch logits = model(feats) diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 17f2fd99..6d3a09c6 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -16,13 +16,13 @@ num_epochs=50 batch_size=16 ckpt_dir=./checkpoint save_freq=10 -gpu_feat=True +feat_backend=numpy if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${ngpu} -gt 1 ]; then python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ --epochs ${num_epochs} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ --checkpoint_dir ${ckpt_dir} \ --save_freq ${save_freq} @@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python local/train.py \ --device ${device} \ --epochs ${num_epochs} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --batch_size ${batch_size} \ --checkpoint_dir ${ckpt_dir} \ --save_freq ${save_freq} @@ -43,7 +43,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python local/predict.py \ --device ${device} \ --wav ${audio_file} \ - --gpu_feat ${gpu_feat} \ + --feat_backend ${feat_backend} \ --top_k 10 \ --checkpoint ${ckpt} fi diff --git a/paddlespeech/cls/features/spectrum.py b/paddlespeech/cls/features/spectrum.py index d70e60fb..154b6484 100644 --- a/paddlespeech/cls/features/spectrum.py +++ b/paddlespeech/cls/features/spectrum.py @@ -201,7 +201,7 @@ def compute_fbank_matrix(sr: int, def power_to_db(magnitude: paddle.Tensor, ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> paddle.Tensor: + top_db: Optional[float]=None) -> paddle.Tensor: """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling ``10 * log10(x / ref)`` in a numerically stable way. @@ -304,7 +304,7 @@ class MelSpectrogram(nn.Layer): center: bool=True, pad_mode: str='reflect', n_mels: int=64, - f_min: float=0.0, + f_min: float=50.0, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', @@ -384,13 +384,13 @@ class LogMelSpectrogram(nn.Layer): center: bool=True, pad_mode: str='reflect', n_mels: int=64, - f_min: float=0.0, + f_min: float=50.0, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0, + top_db: Optional[float]=None, dtype: str=paddle.float32): """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, typically an audio waveform. diff --git a/paddlespeech/cls/utils/env.py b/paddlespeech/cls/utils/env.py index 340c1e4b..c455af00 100644 --- a/paddlespeech/cls/utils/env.py +++ b/paddlespeech/cls/utils/env.py @@ -13,8 +13,8 @@ # limitations under the License. ''' This module is used to store environmental variables in PaddleSpeech. -PACKAGE_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the -├ default value through the PACKAGE_HOME environment variable. 
+PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
+├ default value through the PPSPEECH_HOME environment variable.
 ├─ MODEL_HOME --> Store model files.
 └─ DATA_HOME --> Store automatically downloaded datasets.
 '''
@@ -26,14 +26,14 @@ def _get_user_home():
 
 
 def _get_package_home():
-    if 'PACKAGE_HOME' in os.environ:
-        home_path = os.environ['PACKAGE_HOME']
+    if 'PPSPEECH_HOME' in os.environ:
+        home_path = os.environ['PPSPEECH_HOME']
         if os.path.exists(home_path):
             if os.path.isdir(home_path):
                 return home_path
             else:
                 raise RuntimeError(
-                    'The environment variable PACKAGE_HOME {} is not a directory.'.
+                    'The environment variable PPSPEECH_HOME {} is not a directory.'.
                     format(home_path))
         else:
             return home_path
@@ -48,6 +48,6 @@
 
 USER_HOME = _get_user_home()
-PACKAGE_HOME = _get_package_home()
+PPSPEECH_HOME = _get_package_home()
 MODEL_HOME = _get_sub_home('pretrained_models')
 DATA_HOME = _get_sub_home('datasets')
diff --git a/paddlespeech/cls/utils/log.py b/paddlespeech/cls/utils/log.py
index 89d1e5b1..f4146c4f 100644
--- a/paddlespeech/cls/utils/log.py
+++ b/paddlespeech/cls/utils/log.py
@@ -55,13 +55,13 @@ log_config = {
 
 class Logger(object):
     '''
-    Default logger in PaddleSpeechCls
+    Default logger in PaddleSpeech
    Args:
-        name(str) : Logger name, default is 'PaddleSpeechCls'
+        name(str) : Logger name, default is 'PaddleSpeech'
     '''
 
     def __init__(self, name: str=None):
-        name = 'PaddleSpeechCls' if not name else name
+        name = 'PaddleSpeech' if not name else name
         self.logger = logging.getLogger(name)
 
         for key, conf in log_config.items():

From b12ae34ef1f56ad6d8e292a08e693becb3c25703 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 25 Nov 2021 14:08:34 +0800
Subject: [PATCH 4/9] Update requirements.txt.

---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 99e485f8..c6cb556c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+colorlog
 ConfigArgParse
 coverage
 distro
@@ -21,6 +22,7 @@ numba
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
+pathos
 phkit
 Pillow
 praatio~=4.1

From 2c531d78ac7825c52ba045318de49b7bc23f7860 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 25 Nov 2021 16:45:01 +0800
Subject: [PATCH 5/9] Add paddlespeech.cls and esc50 example.
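Patch 5 below splits `paddleaudio` back out into a standalone package, and the cache root reverts from `PPSPEECH_HOME` to `PPAUDIO_HOME`. For orientation, a small sketch of overriding the default `~/.paddleaudio` location (the path is illustrative; the variable must be set before the module is imported, since it is read at import time):

```python
# Illustrative: point the PaddleAudio cache at a custom directory.
import os

os.environ['PPAUDIO_HOME'] = '/data/paddleaudio_cache'  # assumption: any writable path

from paddleaudio.utils.env import DATA_HOME, MODEL_HOME

print(MODEL_HOME)  # /data/paddleaudio_cache/models
print(DATA_HOME)   # /data/paddleaudio_cache/datasets
```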
--- examples/esc50/cls0/local/infer.sh | 13 ++++++ examples/esc50/cls0/local/train.sh | 27 ++++++++++++ examples/esc50/cls0/path.sh | 4 +- examples/esc50/cls0/run.sh | 32 ++------------- paddleaudio/__init__.py | 15 +++++++ .../cls => paddleaudio}/backends/__init__.py | 0 .../cls => paddleaudio}/backends/audio.py | 0 .../cls => paddleaudio}/datasets/__init__.py | 0 .../cls => paddleaudio}/datasets/dataset.py | 0 .../cls => paddleaudio}/datasets/esc50.py | 0 .../cls => paddleaudio}/datasets/gtzan.py | 0 .../cls => paddleaudio}/datasets/tess.py | 0 .../datasets/urban_sound.py | 0 .../cls => paddleaudio}/features/__init__.py | 0 .../cls => paddleaudio}/features/augment.py | 0 .../cls => paddleaudio}/features/core.py | 0 .../cls => paddleaudio}/features/spectrum.py | 0 .../cls => paddleaudio}/features/window.py | 0 .../cls => paddleaudio}/utils/__init__.py | 0 .../cls => paddleaudio}/utils/download.py | 20 +++------ .../cls => paddleaudio}/utils/env.py | 22 +++++----- .../cls => paddleaudio}/utils/error.py | 0 .../cls => paddleaudio}/utils/log.py | 6 +-- .../cls => paddleaudio}/utils/time.py | 0 paddlespeech/cls/__init__.py | 2 - .../cls/{models => exps/PANNs}/__init__.py | 0 .../cls/exps/PANNs}/deploy/python/predict.py | 0 .../cls/exps/PANNs}/export_model.py | 6 +-- .../cls/exps/PANNs}/model.py | 0 .../cls/{models => exps/PANNs}/panns.py | 4 +- .../cls/exps/PANNs}/predict.py | 10 ++--- .../cls/exps/PANNs}/train.py | 10 ++--- paddlespeech/cls/exps/__init__.py | 13 ++++++ requirements.txt | 3 +- setup.py | 2 +- setup_audio.py | 41 +++++++++++++++++++ 36 files changed, 152 insertions(+), 78 deletions(-) create mode 100755 examples/esc50/cls0/local/infer.sh create mode 100755 examples/esc50/cls0/local/train.sh create mode 100644 paddleaudio/__init__.py rename {paddlespeech/cls => paddleaudio}/backends/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/backends/audio.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/dataset.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/esc50.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/gtzan.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/tess.py (100%) rename {paddlespeech/cls => paddleaudio}/datasets/urban_sound.py (100%) rename {paddlespeech/cls => paddleaudio}/features/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/features/augment.py (100%) rename {paddlespeech/cls => paddleaudio}/features/core.py (100%) rename {paddlespeech/cls => paddleaudio}/features/spectrum.py (100%) rename {paddlespeech/cls => paddleaudio}/features/window.py (100%) rename {paddlespeech/cls => paddleaudio}/utils/__init__.py (100%) rename {paddlespeech/cls => paddleaudio}/utils/download.py (65%) rename {paddlespeech/cls => paddleaudio}/utils/env.py (64%) rename {paddlespeech/cls => paddleaudio}/utils/error.py (100%) rename {paddlespeech/cls => paddleaudio}/utils/log.py (96%) rename {paddlespeech/cls => paddleaudio}/utils/time.py (100%) rename paddlespeech/cls/{models => exps/PANNs}/__init__.py (100%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/deploy/python/predict.py (100%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/export_model.py (92%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/model.py (100%) rename paddlespeech/cls/{models => exps/PANNs}/panns.py (99%) rename {examples/esc50/cls0/local => paddlespeech/cls/exps/PANNs}/predict.py (90%) rename {examples/esc50/cls0/local => 
paddlespeech/cls/exps/PANNs}/train.py (96%) create mode 100644 paddlespeech/cls/exps/__init__.py create mode 100644 setup_audio.py diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh new file mode 100755 index 00000000..57fc157a --- /dev/null +++ b/examples/esc50/cls0/local/infer.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +device=$1 +audio_file=$2 +ckpt_dir=$3 +feat_backend=$4 + +python3 ${BIN_DIR}/predict.py \ +--device ${device} \ +--wav ${audio_file} \ +--feat_backend ${feat_backend} \ +--top_k 10 \ +--checkpoint ${ckpt_dir}/model.pdparams \ No newline at end of file diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh new file mode 100755 index 00000000..19490472 --- /dev/null +++ b/examples/esc50/cls0/local/train.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +ngpu=$1 +device=$2 +feat_backend=$3 + +num_epochs=50 +batch_size=16 +ckpt_dir=./checkpoint +save_freq=10 + +if [ ${ngpu} -gt 1 ]; then + python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +else + python3 ${BIN_DIR}/train.py \ + --device ${device} \ + --epochs ${num_epochs} \ + --feat_backend ${feat_backend} \ + --batch_size ${batch_size} \ + --checkpoint_dir ${ckpt_dir} \ + --save_freq ${save_freq} +fi diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh index 38a242a4..2cc73e27 100644 --- a/examples/esc50/cls0/path.sh +++ b/examples/esc50/cls0/path.sh @@ -1,3 +1,4 @@ +#!/bin/bash export MAIN_ROOT=`realpath ${PWD}/../../../` export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} @@ -8,4 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1 export PYTHONIOENCODING=UTF-8 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ +MODEL=PANNs +export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL} \ No newline at end of file diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 6d3a09c6..e75ad517 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -11,41 +11,17 @@ fi stage=$1 stop_stage=100 - -num_epochs=50 -batch_size=16 -ckpt_dir=./checkpoint -save_freq=10 feat_backend=numpy if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - if [ ${ngpu} -gt 1 ]; then - python -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES local/train.py \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} - else - python local/train.py \ - --device ${device} \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} - fi + ./local/train.sh ${ngpu} ${device} ${feat_backend} || exit -1 fi audio_file=~/cat.wav -ckpt=./checkpoint/epoch_50/model.pdparams +ckpt_dir=./checkpoint/epoch_50 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - python local/predict.py \ - --device ${device} \ - --wav ${audio_file} \ - --feat_backend ${feat_backend} \ - --top_k 10 \ - --checkpoint ${ckpt} + ./local/infer.sh ${device} ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 fi + exit 0 \ No newline at end of file diff --git a/paddleaudio/__init__.py b/paddleaudio/__init__.py new file mode 100644 index 00000000..2685cf57 --- /dev/null +++ b/paddleaudio/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .backends import * +from .features import * diff --git a/paddlespeech/cls/backends/__init__.py b/paddleaudio/backends/__init__.py similarity index 100% rename from paddlespeech/cls/backends/__init__.py rename to paddleaudio/backends/__init__.py diff --git a/paddlespeech/cls/backends/audio.py b/paddleaudio/backends/audio.py similarity index 100% rename from paddlespeech/cls/backends/audio.py rename to paddleaudio/backends/audio.py diff --git a/paddlespeech/cls/datasets/__init__.py b/paddleaudio/datasets/__init__.py similarity index 100% rename from paddlespeech/cls/datasets/__init__.py rename to paddleaudio/datasets/__init__.py diff --git a/paddlespeech/cls/datasets/dataset.py b/paddleaudio/datasets/dataset.py similarity index 100% rename from paddlespeech/cls/datasets/dataset.py rename to paddleaudio/datasets/dataset.py diff --git a/paddlespeech/cls/datasets/esc50.py b/paddleaudio/datasets/esc50.py similarity index 100% rename from paddlespeech/cls/datasets/esc50.py rename to paddleaudio/datasets/esc50.py diff --git a/paddlespeech/cls/datasets/gtzan.py b/paddleaudio/datasets/gtzan.py similarity index 100% rename from paddlespeech/cls/datasets/gtzan.py rename to paddleaudio/datasets/gtzan.py diff --git a/paddlespeech/cls/datasets/tess.py b/paddleaudio/datasets/tess.py similarity index 100% rename from paddlespeech/cls/datasets/tess.py rename to paddleaudio/datasets/tess.py diff --git a/paddlespeech/cls/datasets/urban_sound.py b/paddleaudio/datasets/urban_sound.py similarity index 100% rename from paddlespeech/cls/datasets/urban_sound.py rename to paddleaudio/datasets/urban_sound.py diff --git a/paddlespeech/cls/features/__init__.py b/paddleaudio/features/__init__.py similarity index 100% rename from paddlespeech/cls/features/__init__.py rename to paddleaudio/features/__init__.py diff --git a/paddlespeech/cls/features/augment.py b/paddleaudio/features/augment.py similarity index 100% rename from paddlespeech/cls/features/augment.py rename to paddleaudio/features/augment.py diff --git a/paddlespeech/cls/features/core.py b/paddleaudio/features/core.py similarity index 100% rename from paddlespeech/cls/features/core.py rename to paddleaudio/features/core.py diff --git a/paddlespeech/cls/features/spectrum.py b/paddleaudio/features/spectrum.py similarity index 100% rename from paddlespeech/cls/features/spectrum.py rename to paddleaudio/features/spectrum.py diff --git a/paddlespeech/cls/features/window.py b/paddleaudio/features/window.py similarity index 100% rename from paddlespeech/cls/features/window.py rename to paddleaudio/features/window.py diff --git a/paddlespeech/cls/utils/__init__.py b/paddleaudio/utils/__init__.py similarity index 100% rename from paddlespeech/cls/utils/__init__.py rename to paddleaudio/utils/__init__.py diff --git a/paddlespeech/cls/utils/download.py b/paddleaudio/utils/download.py similarity index 65% rename from paddlespeech/cls/utils/download.py rename to paddleaudio/utils/download.py index 
0a36f29b..45a8e57b 100644
--- a/paddlespeech/cls/utils/download.py
+++ b/paddleaudio/utils/download.py
@@ -17,7 +17,6 @@ from typing import List
 
 from paddle.framework import load as load_state_dict
 from paddle.utils import download
-from pathos.multiprocessing import ProcessPool
 
 from .log import logger
 
@@ -32,27 +31,18 @@ def decompress(file: str):
     download._decompress(file)
 
 
-def download_and_decompress(archives: List[Dict[str, str]],
-                            path: str,
-                            n_workers: int=0):
+def download_and_decompress(archives: List[Dict[str, str]], path: str):
     """
     Download archives and decompress them to a specific path.
     """
     if not os.path.isdir(path):
         os.makedirs(path)
 
-    if n_workers <= 0:
-        for archive in archives:
-            assert 'url' in archive and 'md5' in archive, \
-                f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
+    for archive in archives:
+        assert 'url' in archive and 'md5' in archive, \
+            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
 
-            download.get_path_from_url(archive['url'], path, archive['md5'])
-    else:
-        pool = ProcessPool(nodes=n_workers)
-        pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
-                  [path] * len(archives), [_['md5'] for _ in archives])
-        pool.close()
-        pool.join()
+        download.get_path_from_url(archive['url'], path, archive['md5'])
 
 
 def load_state_dict_from_url(url: str, path: str, md5: str=None):
diff --git a/paddlespeech/cls/utils/env.py b/paddleaudio/utils/env.py
similarity index 64%
rename from paddlespeech/cls/utils/env.py
rename to paddleaudio/utils/env.py
index c455af00..59c6b621 100644
--- a/paddlespeech/cls/utils/env.py
+++ b/paddleaudio/utils/env.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 '''
-This module is used to store environmental variables in PaddleSpeech.
-PPSPEECH_HOME --> the root directory for storing PaddleSpeech related data. Default to ~/.paddlespeech. Users can change the
-├ default value through the PPSPEECH_HOME environment variable.
+This module is used to store environmental variables in PaddleAudio.
+PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
+├ default value through the PPAUDIO_HOME environment variable.
 ├─ MODEL_HOME --> Store model files.
 └─ DATA_HOME --> Store automatically downloaded datasets.
 '''
@@ -25,29 +25,29 @@ def _get_user_home():
     return os.path.expanduser('~')
 
 
-def _get_package_home():
-    if 'PPSPEECH_HOME' in os.environ:
-        home_path = os.environ['PPSPEECH_HOME']
+def _get_ppaudio_home():
+    if 'PPAUDIO_HOME' in os.environ:
+        home_path = os.environ['PPAUDIO_HOME']
         if os.path.exists(home_path):
             if os.path.isdir(home_path):
                 return home_path
             else:
                 raise RuntimeError(
-                    'The environment variable PPSPEECH_HOME {} is not a directory.'.
+                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
                    format(home_path))
         else:
             return home_path
 
-    return os.path.join(_get_user_home(), '.paddlespeech')
+    return os.path.join(_get_user_home(), '.paddleaudio')
 
 
 def _get_sub_home(directory):
-    home = os.path.join(_get_package_home(), directory)
+    home = os.path.join(_get_ppaudio_home(), directory)
     if not os.path.exists(home):
         os.makedirs(home)
     return home
 
 
 USER_HOME = _get_user_home()
-PPSPEECH_HOME = _get_package_home()
-MODEL_HOME = _get_sub_home('pretrained_models')
+PPAUDIO_HOME = _get_ppaudio_home()
+MODEL_HOME = _get_sub_home('models')
 DATA_HOME = _get_sub_home('datasets')
diff --git a/paddlespeech/cls/utils/error.py b/paddleaudio/utils/error.py
similarity index 100%
rename from paddlespeech/cls/utils/error.py
rename to paddleaudio/utils/error.py
diff --git a/paddlespeech/cls/utils/log.py b/paddleaudio/utils/log.py
similarity index 96%
rename from paddlespeech/cls/utils/log.py
rename to paddleaudio/utils/log.py
index f4146c4f..5e7db68a 100644
--- a/paddlespeech/cls/utils/log.py
+++ b/paddleaudio/utils/log.py
@@ -55,13 +55,13 @@ log_config = {
 
 class Logger(object):
     '''
-    Default logger in PaddleSpeech
+    Default logger in PaddleAudio
    Args:
-        name(str) : Logger name, default is 'PaddleSpeech'
+        name(str) : Logger name, default is 'PaddleAudio'
     '''
 
     def __init__(self, name: str=None):
-        name = 'PaddleSpeech' if not name else name
+        name = 'PaddleAudio' if not name else name
         self.logger = logging.getLogger(name)
 
         for key, conf in log_config.items():
diff --git a/paddlespeech/cls/utils/time.py b/paddleaudio/utils/time.py
similarity index 100%
rename from paddlespeech/cls/utils/time.py
rename to paddleaudio/utils/time.py
diff --git a/paddlespeech/cls/__init__.py b/paddlespeech/cls/__init__.py
index 2685cf57..185a92b8 100644
--- a/paddlespeech/cls/__init__.py
+++ b/paddlespeech/cls/__init__.py
@@ -11,5 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
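The `Logger` surface is unchanged by these renames; a minimal sketch of how the training script in this series consumes it (the `processing` context manager appears verbatim in `train.py`'s evaluation loop; `run_evaluation` is a hypothetical placeholder):

```python
from paddleaudio.utils import logger  # module-level Logger instance

# Wrap a long-running step with a status message, as train.py does.
with logger.processing('Evaluation on validation dataset'):
    run_evaluation()  # hypothetical stand-in for the eval loop in train.py
```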
-from .backends import * -from .features import * diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/exps/PANNs/__init__.py similarity index 100% rename from paddlespeech/cls/models/__init__.py rename to paddlespeech/cls/exps/PANNs/__init__.py diff --git a/examples/esc50/cls0/local/deploy/python/predict.py b/paddlespeech/cls/exps/PANNs/deploy/python/predict.py similarity index 100% rename from examples/esc50/cls0/local/deploy/python/predict.py rename to paddlespeech/cls/exps/PANNs/deploy/python/predict.py diff --git a/examples/esc50/cls0/local/export_model.py b/paddlespeech/cls/exps/PANNs/export_model.py similarity index 92% rename from examples/esc50/cls0/local/export_model.py rename to paddlespeech/cls/exps/PANNs/export_model.py index 87dd527c..4dac5237 100644 --- a/examples/esc50/cls0/local/export_model.py +++ b/paddlespeech/cls/exps/PANNs/export_model.py @@ -15,10 +15,10 @@ import argparse import os import paddle -from model import SoundClassifier -from paddlespeech.cls.datasets import ESC50 -from paddlespeech.cls.models.panns import cnn14 +from .model import SoundClassifier +from .panns import cnn14 +from paddleaudio.datasets import ESC50 # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/examples/esc50/cls0/local/model.py b/paddlespeech/cls/exps/PANNs/model.py similarity index 100% rename from examples/esc50/cls0/local/model.py rename to paddlespeech/cls/exps/PANNs/model.py diff --git a/paddlespeech/cls/models/panns.py b/paddlespeech/cls/exps/PANNs/panns.py similarity index 99% rename from paddlespeech/cls/models/panns.py rename to paddlespeech/cls/exps/PANNs/panns.py index 1c68f06f..6d2dac56 100644 --- a/paddlespeech/cls/models/panns.py +++ b/paddlespeech/cls/exps/PANNs/panns.py @@ -16,8 +16,8 @@ import os import paddle.nn as nn import paddle.nn.functional as F -from ..utils.download import load_state_dict_from_url -from ..utils.env import MODEL_HOME +from paddleaudio.utils.download import load_state_dict_from_url +from paddleaudio.utils.env import MODEL_HOME __all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6'] diff --git a/examples/esc50/cls0/local/predict.py b/paddlespeech/cls/exps/PANNs/predict.py similarity index 90% rename from examples/esc50/cls0/local/predict.py rename to paddlespeech/cls/exps/PANNs/predict.py index a6e38a35..2d97ab1b 100644 --- a/examples/esc50/cls0/local/predict.py +++ b/paddlespeech/cls/exps/PANNs/predict.py @@ -17,12 +17,12 @@ import numpy as np import paddle import paddle.nn.functional as F from model import SoundClassifier +from panns import cnn14 -from paddlespeech.cls.backends import load as load_audio -from paddlespeech.cls.datasets import ESC50 -from paddlespeech.cls.features import LogMelSpectrogram -from paddlespeech.cls.features import melspectrogram -from paddlespeech.cls.models.panns import cnn14 +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.features import melspectrogram # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/examples/esc50/cls0/local/train.py b/paddlespeech/cls/exps/PANNs/train.py similarity index 96% rename from examples/esc50/cls0/local/train.py rename to paddlespeech/cls/exps/PANNs/train.py index 7a030187..a3fb01ef 100644 --- a/examples/esc50/cls0/local/train.py +++ b/paddlespeech/cls/exps/PANNs/train.py @@ -16,12 +16,12 @@ import os import paddle from model import SoundClassifier +from panns import cnn14 -from paddlespeech.cls.datasets import 
ESC50 -from paddlespeech.cls.features import LogMelSpectrogram -from paddlespeech.cls.models.panns import cnn14 -from paddlespeech.cls.utils import logger -from paddlespeech.cls.utils import Timer +from paddleaudio.datasets import ESC50 +from paddleaudio.features import LogMelSpectrogram +from paddleaudio.utils import logger +from paddleaudio.utils import Timer # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/__init__.py b/paddlespeech/cls/exps/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/cls/exps/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/requirements.txt b/requirements.txt index c6cb556c..4456ccc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -colorlog ConfigArgParse coverage distro @@ -19,10 +18,10 @@ matplotlib nara_wpe nltk numba +paddleaudio paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos phkit Pillow praatio~=4.1 diff --git a/setup.py b/setup.py index d07db788..310eed1e 100644 --- a/setup.py +++ b/setup.py @@ -173,7 +173,7 @@ setup_info = dict( # Package info packages=find_packages(exclude=('utils', 'tests', 'tests.*', 'examples*', - 'third_party*', 'tools*')), + 'paddleaudio*', 'third_party*', 'tools*')), zip_safe=True, classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/setup_audio.py b/setup_audio.py new file mode 100644 index 00000000..24c9bb9b --- /dev/null +++ b/setup_audio.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import setuptools + +# set the version here +version = '0.1.0a' + +setuptools.setup( + name="paddleaudio", + version=version, + author="", + author_email="", + description="PaddleAudio, in development", + long_description="", + long_description_content_type="text/markdown", + url="", + packages=setuptools.find_packages(include=['paddleaudio*']), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', + install_requires=[ + 'numpy >= 1.15.0', + 'scipy >= 1.0.0', + 'resampy >= 0.2.2', + 'soundfile >= 0.9.0', + 'colorlog', + ], ) From dfdc19fb49df4e1a88d035b6e3229c4ae4dc13dd Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 25 Nov 2021 16:48:50 +0800 Subject: [PATCH 6/9] Add paddlespeech.cls and esc50 example. 
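Patch 6 below only updates README paths, but it leans on the `setup_audio.py` added in patch 5. For orientation, a plausible way to build and install the standalone package (standard setuptools usage, not commands taken from this series):

```shell
# Illustrative: build and install a wheel for the standalone paddleaudio package.
python3 setup_audio.py bdist_wheel
pip install dist/paddleaudio-*.whl
```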
---
 examples/esc50/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/esc50/README.md b/examples/esc50/README.md
index 6ac10b3a..3cf93259 100644
--- a/examples/esc50/README.md
+++ b/examples/esc50/README.md
@@ -28,7 +28,7 @@ PaddleAudio provides pretrained CNN14, CNN10 and CNN6 PANNs models that users
 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 ```
 
-Configurable parameters of the `local/train.py` script:
+Configurable parameters of the `paddlespeech/cls/exps/PANNs/train.py` script:
 
 - `device`: Device to use for training, either cpu or gpu; defaults to gpu. When training on gpu, the gpus parameter specifies the GPU card IDs.
 - `feat_backend`: Backend used to extract features, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
@@ -65,7 +65,7 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
 ```
 
-Configurable parameters of the `local/predict.py` script:
+Configurable parameters of the `paddlespeech/cls/exps/PANNs/predict.py` script:
 
 - `device`: Device to use for training, either cpu or gpu; defaults to gpu. When training on gpu, the gpus parameter specifies the GPU card IDs.
 - `wav`: Audio file to run prediction on.

From 476f05c424ae461f593951d9bfb4c1ff6e7c84c9 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 25 Nov 2021 20:14:06 +0800
Subject: [PATCH 7/9] Update requirements.txt

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 4456ccc2..99e485f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,6 @@ matplotlib
 nara_wpe
 nltk
 numba
-paddleaudio
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas

From 33f0e7622ca250d3e520bcf316ddd2d0c9a04cc0 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Thu, 25 Nov 2021 22:24:30 +0800
Subject: [PATCH 8/9] Add paddlespeech.cls and esc50 example.

---
 examples/esc50/README.md                      | 24 ++++++++++++-------
 examples/esc50/cls0/local/export.sh           |  8 +++++++
 .../esc50/cls0/local/static_model_infer.sh    | 11 +++++++++
 examples/esc50/cls0/run.sh                    | 12 +++++++++-
 paddlespeech/cls/exps/PANNs/__init__.py       |  1 -
 .../cls/exps/PANNs/deploy/__init__.py         | 13 ++++++++++
 .../exps/PANNs/deploy/{python => }/predict.py | 14 +++++------
 paddlespeech/cls/exps/PANNs/export_model.py   |  4 ++--
 paddlespeech/cls/exps/PANNs/predict.py        |  4 ++--
 paddlespeech/cls/exps/PANNs/train.py          |  4 ++--
 paddlespeech/cls/models/PANNs/__init__.py     | 15 ++++++++++++
 .../model.py => models/PANNs/classifier.py}   |  0
 .../cls/{exps => models}/PANNs/panns.py       |  0
 paddlespeech/cls/models/__init__.py           | 14 +++++++++++
 14 files changed, 100 insertions(+), 24 deletions(-)
 create mode 100755 examples/esc50/cls0/local/export.sh
 create mode 100755 examples/esc50/cls0/local/static_model_infer.sh
 create mode 100644 paddlespeech/cls/exps/PANNs/deploy/__init__.py
 rename paddlespeech/cls/exps/PANNs/deploy/{python => }/predict.py (94%)
 create mode 100644 paddlespeech/cls/models/PANNs/__init__.py
 rename paddlespeech/cls/{exps/PANNs/model.py => models/PANNs/classifier.py} (100%)
 rename paddlespeech/cls/{exps => models}/PANNs/panns.py (100%)
 create mode 100644 paddlespeech/cls/models/__init__.py

diff --git a/examples/esc50/README.md b/examples/esc50/README.md
index 3cf93259..aa283845 100644
--- a/examples/esc50/README.md
+++ b/examples/esc50/README.md
@@ -30,7 +30,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 
 Configurable parameters of the `paddlespeech/cls/exps/PANNs/train.py` script:
 
-- `device`: Device to use for training, either cpu or gpu; defaults to gpu. When training on gpu, the gpus parameter specifies the GPU card IDs.
+- `device`: Device used for model prediction.
 - `feat_backend`: Backend used to extract features, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
 - `epochs`: Number of training epochs; defaults to 50.
@@ -42,8 +42,8 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
 The pretrained model used in the example code is `CNN14`; to switch to another pretrained model, do the following:
 
 ```python
-from model import SoundClassifier
-from paddlespeech.cls.datasets import ESC50
+from paddleaudio.datasets import ESC50
+from paddlespeech.cls.models import SoundClassifier
 from paddlespeech.cls.models import cnn14, cnn10, cnn6
 
 # CNN14
@@ -67,7 +67,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2
 
 Configurable parameters of the `paddlespeech/cls/exps/PANNs/predict.py` script:
 
-- `device`: Device to use for training, either cpu or gpu; defaults to gpu. When training on gpu, the gpus parameter specifies the GPU card IDs.
+- `device`: Device used for model prediction.
 - `wav`: Audio file to run prediction on.
 - `feat_backend`: Backend used to extract features, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
 - `top_k`: Show scores of the top k predicted labels; defaults to 1.
@@ -88,10 +88,10 @@ Cat: 6.579841738130199e-06
 After training finishes, the saved dynamic-graph parameters can be exported as a static-graph model and parameters for static-graph deployment.
 
 ```shell
-python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3
 ```
 
-Configurable parameters:
+Configurable parameters of the `paddlespeech/cls/exps/PANNs/export_model.py` script:
 
 - `checkpoint`: Checkpoint file of model parameters.
 - `output_dir`: Directory where the exported static-graph model and parameter files are saved.
@@ -106,8 +106,16 @@ export
 
 #### 2. Model deployment and prediction
 
-The `deploy/python/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment:
+The `paddlespeech/cls/exps/PANNs/deploy/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment:
+```shell
+$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3
+```
 
 ```sh
-python deploy/python/predict.py --model_dir ./export --device gpu
+python paddlespeech/cls/exps/PANNs/deploy/predict.py --model_dir ./export --device gpu
 ```
+
+Main configurable parameters of the `paddlespeech/cls/exps/PANNs/deploy/predict.py` script:
+- `device`: Device used for model prediction.
+- `model_dir`: Directory where the exported static-graph model and parameter files are saved.
+- `wav`: Audio file to run prediction on.
diff --git a/examples/esc50/cls0/local/export.sh b/examples/esc50/cls0/local/export.sh
new file mode 100755
index 00000000..160dc743
--- /dev/null
+++ b/examples/esc50/cls0/local/export.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+ckpt_dir=$1
+output_dir=$2
+
+python3 ${BIN_DIR}/export_model.py \
+--checkpoint ${ckpt_dir}/model.pdparams \
+--output_dir ${output_dir}
diff --git a/examples/esc50/cls0/local/static_model_infer.sh b/examples/esc50/cls0/local/static_model_infer.sh
new file mode 100755
index 00000000..ba4eeda4
--- /dev/null
+++ b/examples/esc50/cls0/local/static_model_infer.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+device=$1
+model_dir=$2
+audio_file=$3
+
+python3 ${BIN_DIR}/deploy/predict.py \
+--device ${device} \
+--model_dir ${model_dir} \
+--wav ${audio_file}
+
diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh
index e75ad517..63ba99f4 100755
--- a/examples/esc50/cls0/run.sh
+++ b/examples/esc50/cls0/run.sh
@@ -15,13 +15,23 @@ feat_backend=numpy
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     ./local/train.sh ${ngpu} ${device} ${feat_backend} || exit -1
+    exit 0
 fi
 
 audio_file=~/cat.wav
 ckpt_dir=./checkpoint/epoch_50
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     ./local/infer.sh ${device} ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1
+    exit 0
 fi
 
+output_dir=./export
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1
+    exit 0
+fi
 
-exit 0
\ No newline at end of file
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    ./local/static_model_infer.sh ${device} ${output_dir} ${audio_file} || exit -1
+    exit 0
+fi
diff --git a/paddlespeech/cls/exps/PANNs/__init__.py b/paddlespeech/cls/exps/PANNs/__init__.py
index 4bfadda1..185a92b8 100644
--- a/paddlespeech/cls/exps/PANNs/__init__.py
+++ b/paddlespeech/cls/exps/PANNs/__init__.py
@@ -11,4 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
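`export_model.py`'s body is not shown in this series, only its imports. For orientation, a static-graph export of the classifier typically looks roughly like the sketch below; this is a sketch under assumptions, not the script's exact code. The checkpoint path follows `run.sh`, and the input shape follows the `(N, time, n_mels=64)` layout used elsewhere in the patch, with batch and time left dynamic:

```python
import os

import paddle

from paddleaudio.datasets import ESC50
from paddlespeech.cls.models import SoundClassifier, cnn14

model = SoundClassifier(
    backbone=cnn14(pretrained=False, extract_embedding=True),
    num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load('./checkpoint/epoch_50/model.pdparams'))
model.eval()

# Trace the dynamic graph to a static one; None keeps batch and time dynamic.
static_model = paddle.jit.to_static(
    model,
    input_spec=[paddle.static.InputSpec(shape=[None, None, 64], dtype='float32')])
paddle.jit.save(static_model, os.path.join('./export', 'inference'))
```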
-from .panns import * diff --git a/paddlespeech/cls/exps/PANNs/deploy/__init__.py b/paddlespeech/cls/exps/PANNs/deploy/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/paddlespeech/cls/exps/PANNs/deploy/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/cls/exps/PANNs/deploy/python/predict.py b/paddlespeech/cls/exps/PANNs/deploy/predict.py similarity index 94% rename from paddlespeech/cls/exps/PANNs/deploy/python/predict.py rename to paddlespeech/cls/exps/PANNs/deploy/predict.py index 13730acd..d4e5c22f 100644 --- a/paddlespeech/cls/exps/PANNs/deploy/python/predict.py +++ b/paddlespeech/cls/exps/PANNs/deploy/predict.py @@ -18,15 +18,16 @@ import numpy as np from paddle import inference from scipy.special import softmax -from paddlespeech.cls.backends import load as load_audio -from paddlespeech.cls.datasets import ESC50 -from paddlespeech.cls.features import melspectrogram +from paddleaudio.backends import load as load_audio +from paddleaudio.datasets import ESC50 +from paddleaudio.features import melspectrogram # yapf: disable parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") -parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.") parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.") +parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument("--batch_size", type=int, default=1, help="Batch size per GPU/CPU for training.") parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.') parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.') parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.') @@ -132,10 +133,7 @@ if __name__ == "__main__": args.use_tensorrt, args.precision, args.cpu_threads, args.enable_mkldnn) - wavs = [ - '~/audio_demo_resource/cat.wav', - '~/audio_demo_resource/dog.wav', - ] + wavs = [args.wav] for i in range(len(wavs)): wavs[i] = os.path.abspath(os.path.expanduser(wavs[i])) diff --git a/paddlespeech/cls/exps/PANNs/export_model.py b/paddlespeech/cls/exps/PANNs/export_model.py index 4dac5237..c295c6a3 100644 --- a/paddlespeech/cls/exps/PANNs/export_model.py +++ b/paddlespeech/cls/exps/PANNs/export_model.py @@ -16,9 +16,9 @@ import os import paddle -from .model import SoundClassifier -from .panns import cnn14 from paddleaudio.datasets import ESC50 +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/PANNs/predict.py 
b/paddlespeech/cls/exps/PANNs/predict.py index 2d97ab1b..717b35ed 100644 --- a/paddlespeech/cls/exps/PANNs/predict.py +++ b/paddlespeech/cls/exps/PANNs/predict.py @@ -16,13 +16,13 @@ import argparse import numpy as np import paddle import paddle.nn.functional as F -from model import SoundClassifier -from panns import cnn14 from paddleaudio.backends import load as load_audio from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.features import melspectrogram +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/exps/PANNs/train.py b/paddlespeech/cls/exps/PANNs/train.py index a3fb01ef..e66724b8 100644 --- a/paddlespeech/cls/exps/PANNs/train.py +++ b/paddlespeech/cls/exps/PANNs/train.py @@ -15,13 +15,13 @@ import argparse import os import paddle -from model import SoundClassifier -from panns import cnn14 from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram from paddleaudio.utils import logger from paddleaudio.utils import Timer +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/paddlespeech/cls/models/PANNs/__init__.py b/paddlespeech/cls/models/PANNs/__init__.py new file mode 100644 index 00000000..638f772f --- /dev/null +++ b/paddlespeech/cls/models/PANNs/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .classifier import * +from .panns import * diff --git a/paddlespeech/cls/exps/PANNs/model.py b/paddlespeech/cls/models/PANNs/classifier.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/model.py rename to paddlespeech/cls/models/PANNs/classifier.py diff --git a/paddlespeech/cls/exps/PANNs/panns.py b/paddlespeech/cls/models/PANNs/panns.py similarity index 100% rename from paddlespeech/cls/exps/PANNs/panns.py rename to paddlespeech/cls/models/PANNs/panns.py diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py new file mode 100644 index 00000000..66030b72 --- /dev/null +++ b/paddlespeech/cls/models/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
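For reference, the core `paddle.inference` flow that `deploy/predict.py` wraps, as a condensed sketch rather than the script's exact code (the `./export/inference.*` file names assume the prefix produced by the export step, and the feature array is a random placeholder):

```python
import numpy as np
from paddle import inference

config = inference.Config('./export/inference.pdmodel',
                          './export/inference.pdiparams')
config.enable_use_gpu(100, 0)  # memory pool size in MB, GPU id
predictor = inference.create_predictor(config)

feats = np.random.rand(1, 250, 64).astype('float32')  # placeholder (N, time, n_mels)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(feats)
predictor.run()
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
logits = output_handle.copy_to_cpu()  # (N, num_class) class scores
```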
+from .PANNs import * From 6e1ac1cc159cd4ee3ffcb5c7861b858cf854623d Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 26 Nov 2021 17:06:57 +0800 Subject: [PATCH 9/9] Add paddlespeech.cls and esc50 example. --- examples/esc50/README.md | 15 ++++++--------- examples/esc50/cls0/local/infer.sh | 10 ++++------ .../esc50/cls0/local/static_model_infer.sh | 1 - examples/esc50/cls0/local/train.sh | 6 ++---- examples/esc50/cls0/path.sh | 2 +- examples/esc50/cls0/run.sh | 18 +++++++----------- .../cls/exps/{PANNs => panns}/__init__.py | 0 .../exps/{PANNs => panns}/deploy/__init__.py | 0 .../exps/{PANNs => panns}/deploy/predict.py | 0 .../cls/exps/{PANNs => panns}/export_model.py | 0 .../cls/exps/{PANNs => panns}/predict.py | 2 -- .../cls/exps/{PANNs => panns}/train.py | 2 -- paddlespeech/cls/models/__init__.py | 2 +- .../cls/models/{PANNs => panns}/__init__.py | 0 .../cls/models/{PANNs => panns}/classifier.py | 0 .../cls/models/{PANNs => panns}/panns.py | 0 16 files changed, 21 insertions(+), 37 deletions(-) rename paddlespeech/cls/exps/{PANNs => panns}/__init__.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/deploy/__init__.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/deploy/predict.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/export_model.py (100%) rename paddlespeech/cls/exps/{PANNs => panns}/predict.py (94%) rename paddlespeech/cls/exps/{PANNs => panns}/train.py (97%) rename paddlespeech/cls/models/{PANNs => panns}/__init__.py (100%) rename paddlespeech/cls/models/{PANNs => panns}/classifier.py (100%) rename paddlespeech/cls/models/{PANNs => panns}/panns.py (100%) diff --git a/examples/esc50/README.md b/examples/esc50/README.md index aa283845..66409754 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -28,7 +28,7 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 ``` -`paddlespeech/cls/exps/PANNs/train.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数: - `device`: 指定模型预测时使用的设备。 - `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 @@ -65,7 +65,7 @@ model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 ``` -`paddlespeech/cls/exps/PANNs/predict.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数: - `device`: 指定模型预测时使用的设备。 - `wav`: 指定预测的音频文件。 @@ -91,7 +91,7 @@ Cat: 6.579841738130199e-06 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ``` -`paddlespeech/cls/exps/PANNs/export_model.py` 脚本中可支持配置的参数: +`paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数: - `checkpoint`: 模型参数checkpoint文件。 - `output_dir`: 导出静态图模型和参数文件的保存目录。 @@ -106,16 +106,13 @@ export #### 2. 
diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh
index 57fc157a..bc03d681 100755
--- a/examples/esc50/cls0/local/infer.sh
+++ b/examples/esc50/cls0/local/infer.sh
@@ -1,13 +1,11 @@
 #!/bin/bash
 
-device=$1
-audio_file=$2
-ckpt_dir=$3
-feat_backend=$4
+audio_file=$1
+ckpt_dir=$2
+feat_backend=$3
 
 python3 ${BIN_DIR}/predict.py \
---device ${device} \
 --wav ${audio_file} \
 --feat_backend ${feat_backend} \
 --top_k 10 \
---checkpoint ${ckpt_dir}/model.pdparams
\ No newline at end of file
+--checkpoint ${ckpt_dir}/model.pdparams
diff --git a/examples/esc50/cls0/local/static_model_infer.sh b/examples/esc50/cls0/local/static_model_infer.sh
index ba4eeda4..9b3abb5d 100755
--- a/examples/esc50/cls0/local/static_model_infer.sh
+++ b/examples/esc50/cls0/local/static_model_infer.sh
@@ -8,4 +8,3 @@ python3 ${BIN_DIR}/deploy/predict.py \
 --device ${device} \
 --model_dir ${model_dir} \
 --wav ${audio_file}
-
diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh
index 19490472..0f0f3d09 100755
--- a/examples/esc50/cls0/local/train.sh
+++ b/examples/esc50/cls0/local/train.sh
@@ -1,15 +1,14 @@
 #!/bin/bash
 
 ngpu=$1
-device=$2
-feat_backend=$3
+feat_backend=$2
 
 num_epochs=50
 batch_size=16
 ckpt_dir=./checkpoint
 save_freq=10
 
-if [ ${ngpu} -gt 1 ]; then
+if [ ${ngpu} -gt 0 ]; then
     python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
     --epochs ${num_epochs} \
     --feat_backend ${feat_backend} \
@@ -18,7 +17,6 @@ if [ ${ngpu} -gt 1 ]; then
     --save_freq ${save_freq}
 else
     python3 ${BIN_DIR}/train.py \
-    --device ${device} \
     --epochs ${num_epochs} \
     --feat_backend ${feat_backend} \
     --batch_size ${batch_size} \
diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh
index 2cc73e27..3eff28e4 100644
--- a/examples/esc50/cls0/path.sh
+++ b/examples/esc50/cls0/path.sh
@@ -9,5 +9,5 @@ export PYTHONDONTWRITEBYTECODE=1
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 
-MODEL=PANNs
+MODEL=panns
 export BIN_DIR=${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh
index 63ba99f4..7283aa8d 100755
--- a/examples/esc50/cls0/run.sh
+++ b/examples/esc50/cls0/run.sh
@@ -3,35 +3,31 @@ set -e
 source path.sh
 
 ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-if [ ${ngpu} == 0 ];then
-    device=cpu
-else
-    device=gpu
-fi
 
 stage=$1
 stop_stage=100
 feat_backend=numpy
+audio_file=~/cat.wav
+ckpt_dir=./checkpoint/epoch_50
+output_dir=./export
+infer_device=cpu
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    ./local/train.sh ${ngpu} ${device} ${feat_backend} || exit -1
+    ./local/train.sh ${ngpu} ${feat_backend} || exit -1
     exit 0
 fi
 
-audio_file=~/cat.wav
-ckpt_dir=./checkpoint/epoch_50
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    ./local/infer.sh ${device} ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1
+    ./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1
     exit 0
 fi
 
-output_dir=./export
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1
     exit 0
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    ./local/static_model_infer.sh ${device} ${output_dir} ${audio_file} || exit -1
+    ./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1
     exit 0
 fi
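After this change, only the static-graph inference stage still passes a device; the other `local/*.sh` scripts no longer do, so the Python entry points fall back to Paddle's default device selection. A quick sketch for checking or overriding that default, assuming Paddle 2.x's `paddle.device` API:

```python
import paddle

# Reports the current default, e.g. 'gpu:0' when CUDA devices are visible,
# otherwise 'cpu'.
print(paddle.device.get_device())

# Explicit override, equivalent in effect to the removed --device flag.
paddle.device.set_device("cpu")
```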
diff --git a/paddlespeech/cls/exps/PANNs/__init__.py b/paddlespeech/cls/exps/panns/__init__.py
similarity index 100%
rename from paddlespeech/cls/exps/PANNs/__init__.py
rename to paddlespeech/cls/exps/panns/__init__.py
diff --git a/paddlespeech/cls/exps/PANNs/deploy/__init__.py b/paddlespeech/cls/exps/panns/deploy/__init__.py
similarity index 100%
rename from paddlespeech/cls/exps/PANNs/deploy/__init__.py
rename to paddlespeech/cls/exps/panns/deploy/__init__.py
diff --git a/paddlespeech/cls/exps/PANNs/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py
similarity index 100%
rename from paddlespeech/cls/exps/PANNs/deploy/predict.py
rename to paddlespeech/cls/exps/panns/deploy/predict.py
diff --git a/paddlespeech/cls/exps/PANNs/export_model.py b/paddlespeech/cls/exps/panns/export_model.py
similarity index 100%
rename from paddlespeech/cls/exps/PANNs/export_model.py
rename to paddlespeech/cls/exps/panns/export_model.py
diff --git a/paddlespeech/cls/exps/PANNs/predict.py b/paddlespeech/cls/exps/panns/predict.py
similarity index 94%
rename from paddlespeech/cls/exps/PANNs/predict.py
rename to paddlespeech/cls/exps/panns/predict.py
index 717b35ed..9cfd8b6c 100644
--- a/paddlespeech/cls/exps/PANNs/predict.py
+++ b/paddlespeech/cls/exps/panns/predict.py
@@ -26,7 +26,6 @@ from paddlespeech.cls.models import SoundClassifier
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
 parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
 parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
@@ -51,7 +50,6 @@ def extract_features(file: str, feat_backend: str='numpy',
 
 
 if __name__ == '__main__':
-    paddle.set_device(args.device)
 
     model = SoundClassifier(
         backbone=cnn14(pretrained=False, extract_embedding=True),
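The `--top_k` flag kept above prints the k most probable labels; the README's sample output shows lines like `Cat: 6.579841738130199e-06`. A self-contained sketch of that post-processing — the stand-in logits and label names are illustrative, only `ESC50.label_list` exists in the real script:

```python
import numpy as np
import paddle
import paddle.nn.functional as F

logits = paddle.randn([1, 50])                  # stand-in for SoundClassifier output
probs = F.softmax(logits, axis=1).numpy()[0]    # class probabilities
label_list = [f"class_{i}" for i in range(50)]  # ESC50.label_list in predict.py

top_k = 3
for idx in np.argsort(probs)[::-1][:top_k]:     # indices of the k largest probs
    print(f"{label_list[idx]}: {probs[idx]}")
```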
diff --git a/paddlespeech/cls/exps/PANNs/train.py b/paddlespeech/cls/exps/panns/train.py
similarity index 97%
rename from paddlespeech/cls/exps/PANNs/train.py
rename to paddlespeech/cls/exps/panns/train.py
index e66724b8..12130978 100644
--- a/paddlespeech/cls/exps/PANNs/train.py
+++ b/paddlespeech/cls/exps/panns/train.py
@@ -25,7 +25,6 @@ from paddlespeech.cls.models import SoundClassifier
 
 # yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
 parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
 parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.")
 parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
@@ -38,7 +37,6 @@ args = parser.parse_args()
 # yapf: enable
 
 if __name__ == "__main__":
-    paddle.set_device(args.device)
     nranks = paddle.distributed.get_world_size()
     if paddle.distributed.get_world_size() > 1:
         paddle.distributed.init_parallel_env()
diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py
index 66030b72..4bfadda1 100644
--- a/paddlespeech/cls/models/__init__.py
+++ b/paddlespeech/cls/models/__init__.py
@@ -11,4 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .PANNs import *
+from .panns import *
diff --git a/paddlespeech/cls/models/PANNs/__init__.py b/paddlespeech/cls/models/panns/__init__.py
similarity index 100%
rename from paddlespeech/cls/models/PANNs/__init__.py
rename to paddlespeech/cls/models/panns/__init__.py
diff --git a/paddlespeech/cls/models/PANNs/classifier.py b/paddlespeech/cls/models/panns/classifier.py
similarity index 100%
rename from paddlespeech/cls/models/PANNs/classifier.py
rename to paddlespeech/cls/models/panns/classifier.py
diff --git a/paddlespeech/cls/models/PANNs/panns.py b/paddlespeech/cls/models/panns/panns.py
similarity index 100%
rename from paddlespeech/cls/models/PANNs/panns.py
rename to paddlespeech/cls/models/panns/panns.py
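With `--device` and `paddle.set_device` removed, train.py derives its parallel context entirely from the launcher, as the train.py hunk above shows. A runnable sketch of that pattern; the `DataParallel` wrap is an assumption about the surrounding training loop, not shown in the diff:

```python
import paddle

nranks = paddle.distributed.get_world_size()  # >1 only under paddle.distributed.launch
if nranks > 1:
    paddle.distributed.init_parallel_env()
    # model = paddle.DataParallel(model)  # typical follow-up for multi-GPU training

print(f"running with {nranks} process(es)")
```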