Merge pull request #938 from KPatr1ck/develop
[audio] Merge PaddleAudio into PaddleSpeech.pull/942/head
commit
b3ebf5d4c4
@ -0,0 +1,7 @@
|
|||||||
|
.ipynb_checkpoints/**
|
||||||
|
*.ipynb
|
||||||
|
nohup.out
|
||||||
|
__pycache__/
|
||||||
|
*.wav
|
||||||
|
*.m4a
|
||||||
|
obsolete/**
|
@ -0,0 +1,45 @@
|
|||||||
|
repos:
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: yapf
|
||||||
|
name: yapf
|
||||||
|
entry: yapf
|
||||||
|
language: system
|
||||||
|
args: [-i, --style .style.yapf]
|
||||||
|
files: \.py$
|
||||||
|
|
||||||
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
|
rev: a11d9314b22d8f8c7556443875b731ef05965464
|
||||||
|
hooks:
|
||||||
|
- id: check-merge-conflict
|
||||||
|
- id: check-symlinks
|
||||||
|
- id: end-of-file-fixer
|
||||||
|
- id: trailing-whitespace
|
||||||
|
- id: detect-private-key
|
||||||
|
- id: check-symlinks
|
||||||
|
- id: check-added-large-files
|
||||||
|
|
||||||
|
- repo: https://github.com/pycqa/isort
|
||||||
|
rev: 5.8.0
|
||||||
|
hooks:
|
||||||
|
- id: isort
|
||||||
|
name: isort (python)
|
||||||
|
- id: isort
|
||||||
|
name: isort (cython)
|
||||||
|
types: [cython]
|
||||||
|
- id: isort
|
||||||
|
name: isort (pyi)
|
||||||
|
types: [pyi]
|
||||||
|
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: flake8
|
||||||
|
name: flake8
|
||||||
|
entry: flake8
|
||||||
|
language: system
|
||||||
|
args:
|
||||||
|
- --count
|
||||||
|
- --select=E9,F63,F7,F82
|
||||||
|
- --show-source
|
||||||
|
- --statistics
|
||||||
|
files: \.py$
|
@ -0,0 +1,3 @@
|
|||||||
|
[style]
|
||||||
|
based_on_style = pep8
|
||||||
|
column_limit = 80
|
@ -0,0 +1,201 @@
|
|||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the
|
||||||
|
origin of the Work and reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer,
|
||||||
|
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||||
|
or other liability obligations and/or rights consistent with this
|
||||||
|
License. However, in accepting such obligations, You may act only
|
||||||
|
on Your own behalf and on Your sole responsibility, not on behalf
|
||||||
|
of any other Contributor, and only if You agree to indemnify,
|
||||||
|
defend, and hold each Contributor harmless for any liability
|
||||||
|
incurred by, or claims asserted against, such Contributor by reason
|
||||||
|
of your accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
@ -0,0 +1,37 @@
|
|||||||
|
# PaddleAudio: The audio library for PaddlePaddle
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, state-of-the-art pre-trained models in sound tagging/classification and anomaly sound detection. More models and features are on the roadmap.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Features
|
||||||
|
- Spectrogram and related features are compatible with librosa.
|
||||||
|
- State-of-the-art models in sound tagging on Audioset, sound classification on esc50, and more to come.
|
||||||
|
- Ready-to-use audio embedding with a line of code, includes sound embedding and more on the roadmap.
|
||||||
|
- Data loading supports for common open source audio in multiple languages including English, Mandarin and so on.
|
||||||
|
|
||||||
|
|
||||||
|
## Install
|
||||||
|
```
|
||||||
|
git clone https://github.com/PaddlePaddle/models
|
||||||
|
cd models/PaddleAudio
|
||||||
|
pip install .
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
### Audio loading and feature extraction
|
||||||
|
```
|
||||||
|
import paddleaudio as pa
|
||||||
|
s,r = pa.load(f)
|
||||||
|
mel_spect = pa.melspectrogram(s,sr=r)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
We provide a set of examples to help you get started in using PaddleAudio quickly.
|
||||||
|
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
|
||||||
|
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
|
||||||
|
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
|
||||||
|
|
||||||
|
Please refer to [example directory](./examples) for more details.
|
@ -0,0 +1,527 @@
|
|||||||
|
Speech
|
||||||
|
Male speech, man speaking
|
||||||
|
Female speech, woman speaking
|
||||||
|
Child speech, kid speaking
|
||||||
|
Conversation
|
||||||
|
Narration, monologue
|
||||||
|
Babbling
|
||||||
|
Speech synthesizer
|
||||||
|
Shout
|
||||||
|
Bellow
|
||||||
|
Whoop
|
||||||
|
Yell
|
||||||
|
Battle cry
|
||||||
|
Children shouting
|
||||||
|
Screaming
|
||||||
|
Whispering
|
||||||
|
Laughter
|
||||||
|
Baby laughter
|
||||||
|
Giggle
|
||||||
|
Snicker
|
||||||
|
Belly laugh
|
||||||
|
Chuckle, chortle
|
||||||
|
Crying, sobbing
|
||||||
|
Baby cry, infant cry
|
||||||
|
Whimper
|
||||||
|
Wail, moan
|
||||||
|
Sigh
|
||||||
|
Singing
|
||||||
|
Choir
|
||||||
|
Yodeling
|
||||||
|
Chant
|
||||||
|
Mantra
|
||||||
|
Male singing
|
||||||
|
Female singing
|
||||||
|
Child singing
|
||||||
|
Synthetic singing
|
||||||
|
Rapping
|
||||||
|
Humming
|
||||||
|
Groan
|
||||||
|
Grunt
|
||||||
|
Whistling
|
||||||
|
Breathing
|
||||||
|
Wheeze
|
||||||
|
Snoring
|
||||||
|
Gasp
|
||||||
|
Pant
|
||||||
|
Snort
|
||||||
|
Cough
|
||||||
|
Throat clearing
|
||||||
|
Sneeze
|
||||||
|
Sniff
|
||||||
|
Run
|
||||||
|
Shuffle
|
||||||
|
Walk, footsteps
|
||||||
|
Chewing, mastication
|
||||||
|
Biting
|
||||||
|
Gargling
|
||||||
|
Stomach rumble
|
||||||
|
Burping, eructation
|
||||||
|
Hiccup
|
||||||
|
Fart
|
||||||
|
Hands
|
||||||
|
Finger snapping
|
||||||
|
Clapping
|
||||||
|
Heart sounds, heartbeat
|
||||||
|
Heart murmur
|
||||||
|
Cheering
|
||||||
|
Applause
|
||||||
|
Chatter
|
||||||
|
Crowd
|
||||||
|
Hubbub, speech noise, speech babble
|
||||||
|
Children playing
|
||||||
|
Animal
|
||||||
|
Domestic animals, pets
|
||||||
|
Dog
|
||||||
|
Bark
|
||||||
|
Yip
|
||||||
|
Howl
|
||||||
|
Bow-wow
|
||||||
|
Growling
|
||||||
|
Whimper (dog)
|
||||||
|
Cat
|
||||||
|
Purr
|
||||||
|
Meow
|
||||||
|
Hiss
|
||||||
|
Caterwaul
|
||||||
|
Livestock, farm animals, working animals
|
||||||
|
Horse
|
||||||
|
Clip-clop
|
||||||
|
Neigh, whinny
|
||||||
|
Cattle, bovinae
|
||||||
|
Moo
|
||||||
|
Cowbell
|
||||||
|
Pig
|
||||||
|
Oink
|
||||||
|
Goat
|
||||||
|
Bleat
|
||||||
|
Sheep
|
||||||
|
Fowl
|
||||||
|
Chicken, rooster
|
||||||
|
Cluck
|
||||||
|
Crowing, cock-a-doodle-doo
|
||||||
|
Turkey
|
||||||
|
Gobble
|
||||||
|
Duck
|
||||||
|
Quack
|
||||||
|
Goose
|
||||||
|
Honk
|
||||||
|
Wild animals
|
||||||
|
Roaring cats (lions, tigers)
|
||||||
|
Roar
|
||||||
|
Bird
|
||||||
|
Bird vocalization, bird call, bird song
|
||||||
|
Chirp, tweet
|
||||||
|
Squawk
|
||||||
|
Pigeon, dove
|
||||||
|
Coo
|
||||||
|
Crow
|
||||||
|
Caw
|
||||||
|
Owl
|
||||||
|
Hoot
|
||||||
|
Bird flight, flapping wings
|
||||||
|
Canidae, dogs, wolves
|
||||||
|
Rodents, rats, mice
|
||||||
|
Mouse
|
||||||
|
Patter
|
||||||
|
Insect
|
||||||
|
Cricket
|
||||||
|
Mosquito
|
||||||
|
Fly, housefly
|
||||||
|
Buzz
|
||||||
|
Bee, wasp, etc.
|
||||||
|
Frog
|
||||||
|
Croak
|
||||||
|
Snake
|
||||||
|
Rattle
|
||||||
|
Whale vocalization
|
||||||
|
Music
|
||||||
|
Musical instrument
|
||||||
|
Plucked string instrument
|
||||||
|
Guitar
|
||||||
|
Electric guitar
|
||||||
|
Bass guitar
|
||||||
|
Acoustic guitar
|
||||||
|
Steel guitar, slide guitar
|
||||||
|
Tapping (guitar technique)
|
||||||
|
Strum
|
||||||
|
Banjo
|
||||||
|
Sitar
|
||||||
|
Mandolin
|
||||||
|
Zither
|
||||||
|
Ukulele
|
||||||
|
Keyboard (musical)
|
||||||
|
Piano
|
||||||
|
Electric piano
|
||||||
|
Organ
|
||||||
|
Electronic organ
|
||||||
|
Hammond organ
|
||||||
|
Synthesizer
|
||||||
|
Sampler
|
||||||
|
Harpsichord
|
||||||
|
Percussion
|
||||||
|
Drum kit
|
||||||
|
Drum machine
|
||||||
|
Drum
|
||||||
|
Snare drum
|
||||||
|
Rimshot
|
||||||
|
Drum roll
|
||||||
|
Bass drum
|
||||||
|
Timpani
|
||||||
|
Tabla
|
||||||
|
Cymbal
|
||||||
|
Hi-hat
|
||||||
|
Wood block
|
||||||
|
Tambourine
|
||||||
|
Rattle (instrument)
|
||||||
|
Maraca
|
||||||
|
Gong
|
||||||
|
Tubular bells
|
||||||
|
Mallet percussion
|
||||||
|
Marimba, xylophone
|
||||||
|
Glockenspiel
|
||||||
|
Vibraphone
|
||||||
|
Steelpan
|
||||||
|
Orchestra
|
||||||
|
Brass instrument
|
||||||
|
French horn
|
||||||
|
Trumpet
|
||||||
|
Trombone
|
||||||
|
Bowed string instrument
|
||||||
|
String section
|
||||||
|
Violin, fiddle
|
||||||
|
Pizzicato
|
||||||
|
Cello
|
||||||
|
Double bass
|
||||||
|
Wind instrument, woodwind instrument
|
||||||
|
Flute
|
||||||
|
Saxophone
|
||||||
|
Clarinet
|
||||||
|
Harp
|
||||||
|
Bell
|
||||||
|
Church bell
|
||||||
|
Jingle bell
|
||||||
|
Bicycle bell
|
||||||
|
Tuning fork
|
||||||
|
Chime
|
||||||
|
Wind chime
|
||||||
|
Change ringing (campanology)
|
||||||
|
Harmonica
|
||||||
|
Accordion
|
||||||
|
Bagpipes
|
||||||
|
Didgeridoo
|
||||||
|
Shofar
|
||||||
|
Theremin
|
||||||
|
Singing bowl
|
||||||
|
Scratching (performance technique)
|
||||||
|
Pop music
|
||||||
|
Hip hop music
|
||||||
|
Beatboxing
|
||||||
|
Rock music
|
||||||
|
Heavy metal
|
||||||
|
Punk rock
|
||||||
|
Grunge
|
||||||
|
Progressive rock
|
||||||
|
Rock and roll
|
||||||
|
Psychedelic rock
|
||||||
|
Rhythm and blues
|
||||||
|
Soul music
|
||||||
|
Reggae
|
||||||
|
Country
|
||||||
|
Swing music
|
||||||
|
Bluegrass
|
||||||
|
Funk
|
||||||
|
Folk music
|
||||||
|
Middle Eastern music
|
||||||
|
Jazz
|
||||||
|
Disco
|
||||||
|
Classical music
|
||||||
|
Opera
|
||||||
|
Electronic music
|
||||||
|
House music
|
||||||
|
Techno
|
||||||
|
Dubstep
|
||||||
|
Drum and bass
|
||||||
|
Electronica
|
||||||
|
Electronic dance music
|
||||||
|
Ambient music
|
||||||
|
Trance music
|
||||||
|
Music of Latin America
|
||||||
|
Salsa music
|
||||||
|
Flamenco
|
||||||
|
Blues
|
||||||
|
Music for children
|
||||||
|
New-age music
|
||||||
|
Vocal music
|
||||||
|
A capella
|
||||||
|
Music of Africa
|
||||||
|
Afrobeat
|
||||||
|
Christian music
|
||||||
|
Gospel music
|
||||||
|
Music of Asia
|
||||||
|
Carnatic music
|
||||||
|
Music of Bollywood
|
||||||
|
Ska
|
||||||
|
Traditional music
|
||||||
|
Independent music
|
||||||
|
Song
|
||||||
|
Background music
|
||||||
|
Theme music
|
||||||
|
Jingle (music)
|
||||||
|
Soundtrack music
|
||||||
|
Lullaby
|
||||||
|
Video game music
|
||||||
|
Christmas music
|
||||||
|
Dance music
|
||||||
|
Wedding music
|
||||||
|
Happy music
|
||||||
|
Funny music
|
||||||
|
Sad music
|
||||||
|
Tender music
|
||||||
|
Exciting music
|
||||||
|
Angry music
|
||||||
|
Scary music
|
||||||
|
Wind
|
||||||
|
Rustling leaves
|
||||||
|
Wind noise (microphone)
|
||||||
|
Thunderstorm
|
||||||
|
Thunder
|
||||||
|
Water
|
||||||
|
Rain
|
||||||
|
Raindrop
|
||||||
|
Rain on surface
|
||||||
|
Stream
|
||||||
|
Waterfall
|
||||||
|
Ocean
|
||||||
|
Waves, surf
|
||||||
|
Steam
|
||||||
|
Gurgling
|
||||||
|
Fire
|
||||||
|
Crackle
|
||||||
|
Vehicle
|
||||||
|
Boat, Water vehicle
|
||||||
|
Sailboat, sailing ship
|
||||||
|
Rowboat, canoe, kayak
|
||||||
|
Motorboat, speedboat
|
||||||
|
Ship
|
||||||
|
Motor vehicle (road)
|
||||||
|
Car
|
||||||
|
Vehicle horn, car horn, honking
|
||||||
|
Toot
|
||||||
|
Car alarm
|
||||||
|
Power windows, electric windows
|
||||||
|
Skidding
|
||||||
|
Tire squeal
|
||||||
|
Car passing by
|
||||||
|
Race car, auto racing
|
||||||
|
Truck
|
||||||
|
Air brake
|
||||||
|
Air horn, truck horn
|
||||||
|
Reversing beeps
|
||||||
|
Ice cream truck, ice cream van
|
||||||
|
Bus
|
||||||
|
Emergency vehicle
|
||||||
|
Police car (siren)
|
||||||
|
Ambulance (siren)
|
||||||
|
Fire engine, fire truck (siren)
|
||||||
|
Motorcycle
|
||||||
|
Traffic noise, roadway noise
|
||||||
|
Rail transport
|
||||||
|
Train
|
||||||
|
Train whistle
|
||||||
|
Train horn
|
||||||
|
Railroad car, train wagon
|
||||||
|
Train wheels squealing
|
||||||
|
Subway, metro, underground
|
||||||
|
Aircraft
|
||||||
|
Aircraft engine
|
||||||
|
Jet engine
|
||||||
|
Propeller, airscrew
|
||||||
|
Helicopter
|
||||||
|
Fixed-wing aircraft, airplane
|
||||||
|
Bicycle
|
||||||
|
Skateboard
|
||||||
|
Engine
|
||||||
|
Light engine (high frequency)
|
||||||
|
Dental drill, dentist's drill
|
||||||
|
Lawn mower
|
||||||
|
Chainsaw
|
||||||
|
Medium engine (mid frequency)
|
||||||
|
Heavy engine (low frequency)
|
||||||
|
Engine knocking
|
||||||
|
Engine starting
|
||||||
|
Idling
|
||||||
|
Accelerating, revving, vroom
|
||||||
|
Door
|
||||||
|
Doorbell
|
||||||
|
Ding-dong
|
||||||
|
Sliding door
|
||||||
|
Slam
|
||||||
|
Knock
|
||||||
|
Tap
|
||||||
|
Squeak
|
||||||
|
Cupboard open or close
|
||||||
|
Drawer open or close
|
||||||
|
Dishes, pots, and pans
|
||||||
|
Cutlery, silverware
|
||||||
|
Chopping (food)
|
||||||
|
Frying (food)
|
||||||
|
Microwave oven
|
||||||
|
Blender
|
||||||
|
Water tap, faucet
|
||||||
|
Sink (filling or washing)
|
||||||
|
Bathtub (filling or washing)
|
||||||
|
Hair dryer
|
||||||
|
Toilet flush
|
||||||
|
Toothbrush
|
||||||
|
Electric toothbrush
|
||||||
|
Vacuum cleaner
|
||||||
|
Zipper (clothing)
|
||||||
|
Keys jangling
|
||||||
|
Coin (dropping)
|
||||||
|
Scissors
|
||||||
|
Electric shaver, electric razor
|
||||||
|
Shuffling cards
|
||||||
|
Typing
|
||||||
|
Typewriter
|
||||||
|
Computer keyboard
|
||||||
|
Writing
|
||||||
|
Alarm
|
||||||
|
Telephone
|
||||||
|
Telephone bell ringing
|
||||||
|
Ringtone
|
||||||
|
Telephone dialing, DTMF
|
||||||
|
Dial tone
|
||||||
|
Busy signal
|
||||||
|
Alarm clock
|
||||||
|
Siren
|
||||||
|
Civil defense siren
|
||||||
|
Buzzer
|
||||||
|
Smoke detector, smoke alarm
|
||||||
|
Fire alarm
|
||||||
|
Foghorn
|
||||||
|
Whistle
|
||||||
|
Steam whistle
|
||||||
|
Mechanisms
|
||||||
|
Ratchet, pawl
|
||||||
|
Clock
|
||||||
|
Tick
|
||||||
|
Tick-tock
|
||||||
|
Gears
|
||||||
|
Pulleys
|
||||||
|
Sewing machine
|
||||||
|
Mechanical fan
|
||||||
|
Air conditioning
|
||||||
|
Cash register
|
||||||
|
Printer
|
||||||
|
Camera
|
||||||
|
Single-lens reflex camera
|
||||||
|
Tools
|
||||||
|
Hammer
|
||||||
|
Jackhammer
|
||||||
|
Sawing
|
||||||
|
Filing (rasp)
|
||||||
|
Sanding
|
||||||
|
Power tool
|
||||||
|
Drill
|
||||||
|
Explosion
|
||||||
|
Gunshot, gunfire
|
||||||
|
Machine gun
|
||||||
|
Fusillade
|
||||||
|
Artillery fire
|
||||||
|
Cap gun
|
||||||
|
Fireworks
|
||||||
|
Firecracker
|
||||||
|
Burst, pop
|
||||||
|
Eruption
|
||||||
|
Boom
|
||||||
|
Wood
|
||||||
|
Chop
|
||||||
|
Splinter
|
||||||
|
Crack
|
||||||
|
Glass
|
||||||
|
Chink, clink
|
||||||
|
Shatter
|
||||||
|
Liquid
|
||||||
|
Splash, splatter
|
||||||
|
Slosh
|
||||||
|
Squish
|
||||||
|
Drip
|
||||||
|
Pour
|
||||||
|
Trickle, dribble
|
||||||
|
Gush
|
||||||
|
Fill (with liquid)
|
||||||
|
Spray
|
||||||
|
Pump (liquid)
|
||||||
|
Stir
|
||||||
|
Boiling
|
||||||
|
Sonar
|
||||||
|
Arrow
|
||||||
|
Whoosh, swoosh, swish
|
||||||
|
Thump, thud
|
||||||
|
Thunk
|
||||||
|
Electronic tuner
|
||||||
|
Effects unit
|
||||||
|
Chorus effect
|
||||||
|
Basketball bounce
|
||||||
|
Bang
|
||||||
|
Slap, smack
|
||||||
|
Whack, thwack
|
||||||
|
Smash, crash
|
||||||
|
Breaking
|
||||||
|
Bouncing
|
||||||
|
Whip
|
||||||
|
Flap
|
||||||
|
Scratch
|
||||||
|
Scrape
|
||||||
|
Rub
|
||||||
|
Roll
|
||||||
|
Crushing
|
||||||
|
Crumpling, crinkling
|
||||||
|
Tearing
|
||||||
|
Beep, bleep
|
||||||
|
Ping
|
||||||
|
Ding
|
||||||
|
Clang
|
||||||
|
Squeal
|
||||||
|
Creak
|
||||||
|
Rustle
|
||||||
|
Whir
|
||||||
|
Clatter
|
||||||
|
Sizzle
|
||||||
|
Clicking
|
||||||
|
Clickety-clack
|
||||||
|
Rumble
|
||||||
|
Plop
|
||||||
|
Jingle, tinkle
|
||||||
|
Hum
|
||||||
|
Zing
|
||||||
|
Boing
|
||||||
|
Crunch
|
||||||
|
Silence
|
||||||
|
Sine wave
|
||||||
|
Harmonic
|
||||||
|
Chirp tone
|
||||||
|
Sound effect
|
||||||
|
Pulse
|
||||||
|
Inside, small room
|
||||||
|
Inside, large room or hall
|
||||||
|
Inside, public space
|
||||||
|
Outside, urban or manmade
|
||||||
|
Outside, rural or natural
|
||||||
|
Reverberation
|
||||||
|
Echo
|
||||||
|
Noise
|
||||||
|
Environmental noise
|
||||||
|
Static
|
||||||
|
Mains hum
|
||||||
|
Distortion
|
||||||
|
Sidetone
|
||||||
|
Cacophony
|
||||||
|
White noise
|
||||||
|
Pink noise
|
||||||
|
Throbbing
|
||||||
|
Vibration
|
||||||
|
Television
|
||||||
|
Radio
|
||||||
|
Field recording
|
@ -0,0 +1,112 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from paddleaudio.backends import load as load_audio
|
||||||
|
from paddleaudio.features import melspectrogram
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
from paddleaudio.utils import logger
|
||||||
|
|
||||||
|
# yapf: disable
# Command-line interface for the audio-tagging demo.
parser = argparse.ArgumentParser(__doc__)
parser.add_argument(
    '--device',
    choices=['cpu', 'gpu'],
    default='gpu',
    help='Select which device to predict, defaults to gpu.')
parser.add_argument(
    '--wav',
    type=str,
    required=True,
    help='Audio file to infer.')
parser.add_argument(
    '--sample_duration',
    type=float,
    default=2.0,
    help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument(
    '--hop_duration',
    type=float,
    default=0.3,
    help='Duration(in seconds) between two samples.')
parser.add_argument(
    '--output_dir',
    type=str,
    default='./output_dir',
    help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def split(waveform: np.ndarray, win_size: int, hop_size: int):
    """Split a waveform into fixed-size windows.

    Windows start every ``hop_size`` samples and are ``win_size`` samples
    long; the final window is zero-padded on the right when the waveform
    runs out. The number of windows is therefore decided by ``win_size``
    and ``hop_size``.

    Args:
        waveform: 1-D array of audio samples.
        win_size: Length (in samples) of each window.
        hop_size: Step (in samples) between consecutive window starts.

    Returns:
        A pair ``(offsets, segments)`` where ``offsets[k]`` is the start of
        segment ``k`` as a fraction of the full waveform length and
        ``segments[k]`` is the (possibly padded) window itself.
    """
    assert isinstance(waveform, np.ndarray)
    total_len = len(waveform)
    offsets = []
    segments = []
    for start in range(0, total_len, hop_size):
        window = waveform[start:start + win_size]
        if len(window) < win_size:
            # Zero-pad the tail so every segment has exactly win_size samples.
            window = np.pad(window, (0, win_size - len(window)))
        segments.append(window)
        offsets.append(start / total_len)
    return offsets, segments
|
||||||
|
|
||||||
|
|
||||||
|
def batchify(data: List[List[float]],
             sample_rate: int,
             batch_size: int,
             **kwargs):
    """Extract features from waveforms and create batches.

    Each waveform is converted to a (num_frames, num_melbins) mel-spectrogram
    (extra keyword arguments are forwarded to ``melspectrogram``), then the
    examples are yielded in groups of ``batch_size``; the final group may be
    smaller.
    """
    examples = [
        melspectrogram(waveform, sample_rate, **kwargs).transpose()
        for waveform in data
    ]

    # Separate the examples into batches of batch_size.
    batch = []
    for feats in examples:
        batch.append(feats)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # Flush the remainder, if any.
    if batch:
        yield batch
|
||||||
|
|
||||||
|
|
||||||
|
def predict(model, data: List[List[float]], sample_rate: int,
            batch_size: int=1):
    """Use pretrained model to make predictions.

    Args:
        model: Classifier in eval mode; called as ``model(feats)``.
        data: List of fixed-length waveforms (see ``split()``).
        sample_rate: Sample rate used for feature extraction.
        batch_size: Number of samples per forward pass.

    Returns:
        np.ndarray with one score row per input waveform, concatenated
        across batches.
    """
    batches = batchify(data, sample_rate, batch_size)
    results = None
    model.eval()
    for batch in batches:
        # FIX: the comment used to hang off a backslash line continuation,
        # which is fragile (a comment on a continuation line); keep it above.
        # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
        feats = paddle.to_tensor(batch).unsqueeze(1)
        audioset_scores = model(feats)
        if results is None:
            results = audioset_scores.numpy()
        else:
            results = np.concatenate((results, audioset_scores.numpy()))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    paddle.set_device(args.device)
    # Pretrained AudioSet tagger (cnn14); classification head included.
    model = cnn14(pretrained=True, extract_embedding=False)
    # Load at native sample rate (sr=None means no resampling).
    waveform, sr = load_audio(args.wav, sr=None)
    # Slice the waveform into overlapping tagging samples.
    time, data = split(waveform,
                       int(args.sample_duration * sr),
                       int(args.hop_duration * sr))
    results = predict(model, data, sr, batch_size=8)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Relative start position of each sample within the waveform.
    # NOTE(review): this recomputes what split() already returned in `time`;
    # the two should agree -- confirm.
    time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
    np.savez(output_file, time=time, scores=results)
    logger.info(f'Saved tagging results to {output_file}')
|
@ -0,0 +1,84 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import ast
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from paddleaudio.utils import logger
|
||||||
|
|
||||||
|
# Command-line interface for the top-k label parsing script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
# NOTE(review): help text is empty for the required --tagging_file flag.
parser.add_argument('--tagging_file', type=str, required=True, help='')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def smooth(results: np.ndarray, win_size: int):
    """Apply a trailing moving average to ``results`` in place.

    Row ``i`` is replaced by the mean of rows ``max(0, i+1-win_size)..i``.
    Iterating from the last row backwards guarantees that every window
    reads original (not yet smoothed) values.
    """
    for idx in reversed(range(len(results))):
        lo = max(0, idx + 1 - win_size)
        results[idx] = np.sum(results[lo:idx + 1], axis=0) / (idx - lo + 1)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
    """Format the k highest-scoring labels, one ``label: score`` per line."""
    scores = np.asarray(result)
    top_indices = np.argsort(-scores)[:k]

    lines = [f'{label_map[i]}: {scores[i]}\n' for i in top_indices]
    return ''.join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Map line index -> label text read from the AudioSet label file.
    label_map = {}
    with open(args.label_file, 'r') as f:
        for i, l in enumerate(f.readlines()):
            label_map[i] = l.strip()

    # Load the .npz produced by the tagging script: per-sample times + scores.
    results = np.load(args.tagging_file, allow_pickle=True)
    times, scores = results['time'], results['scores']

    if args.smooth:
        logger.info('Posterior smoothing...')
        # In-place trailing moving average over the score rows.
        smooth(scores, win_size=args.smooth_size)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Output file shares the tagging file's base name, with a .txt suffix.
    output_file = os.path.join(
        args.output_dir,
        os.path.basename(args.tagging_file).split('.')[0] + '.txt')
    with open(output_file, 'w') as f:
        # One block per sample: the timestamp line, then top-k labels.
        for time, score in zip(times, scores):
            f.write(f'{time}\n')
            f.write(generate_topk_label(args.top_k, label_map, score) + '\n')

    logger.info(f'Saved tagging labels to {output_file}')
|
@ -0,0 +1,147 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from paddle import inference
|
||||||
|
from scipy.special import softmax
|
||||||
|
|
||||||
|
from paddleaudio.backends import load as load_audio
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.features import melspectrogram
|
||||||
|
|
||||||
|
# Command-line interface for the static-graph inference demo.
# NOTE(review): type=eval executes arbitrary command-line input; consider
# ast.literal_eval instead.
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(files, **kwargs):
    """Load audio files and return a batch of mel-spectrogram features.

    Args:
        files: Iterable of audio file paths (the original annotation said
            ``str``, but the function iterates over multiple paths).
        **kwargs: Forwarded to melspectrogram().

    Returns:
        np.ndarray of shape (len(files), num_frames, num_melbins); shorter
        waveforms are zero-padded on the right to the longest one.
    """
    waveforms = []
    srs = []
    max_length = 0  # length (in samples) of the longest waveform
    for file in files:
        waveform, sr = load_audio(file, sr=None)
        max_length = max(max_length, len(waveform))
        waveforms.append(waveform)
        srs.append(sr)

    feats = []
    for i in range(len(waveforms)):
        # Zero-pad on the right so every feature has the same num_frames.
        if len(waveforms[i]) < max_length:
            pad_width = max_length - len(waveforms[i])
            waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))

        # BUG FIX: use each file's own sample rate (srs[i]); the original
        # reused the leaked loop variable `sr` of the LAST file loaded,
        # producing wrong features when inputs have different sample rates.
        feat = melspectrogram(waveforms[i], srs[i], **kwargs).transpose()
        feats.append(feat)

    return np.stack(feats, axis=0)
|
||||||
|
|
||||||
|
|
||||||
|
class Predictor(object):
    """Paddle-Inference wrapper around the exported sound classifier.

    Builds an inference.Config for the requested device (gpu / cpu / xpu)
    and exposes predict() for classifying a list of wave files.
    """

    def __init__(self,
                 model_dir,
                 device="gpu",
                 batch_size=1,
                 use_tensorrt=False,
                 precision="fp32",
                 cpu_threads=10,
                 enable_mkldnn=False):
        """Create the inference predictor.

        Args:
            model_dir: Directory holding inference.pdmodel / .pdiparams.
            device: One of 'gpu', 'cpu', 'xpu'.
            batch_size: Max batch size (also passed to TensorRT).
            use_tensorrt: Enable the TensorRT engine (GPU only).
            precision: 'fp32' or 'fp16' (TensorRT precision).
            cpu_threads: Math-library threads when running on CPU.
            enable_mkldnn: Enable MKL-DNN acceleration on CPU.
        """
        self.batch_size = batch_size

        model_file = os.path.join(model_dir, "inference.pdmodel")
        params_file = os.path.join(model_dir, "inference.pdiparams")

        assert os.path.isfile(model_file) and os.path.isfile(
            params_file), 'Please check model and parameter files.'

        config = inference.Config(model_file, params_file)
        if device == "gpu":
            # set GPU configs accordingly,
            # such as initialize the gpu memory, enable tensorrt
            config.enable_use_gpu(100, 0)
            precision_map = {
                "fp16": inference.PrecisionType.Half,
                "fp32": inference.PrecisionType.Float32,
            }
            precision_mode = precision_map[precision]

            if use_tensorrt:
                config.enable_tensorrt_engine(
                    max_batch_size=batch_size,
                    min_subgraph_size=30,
                    precision_mode=precision_mode)
        elif device == "cpu":
            # set CPU configs accordingly,
            # such as enable_mkldnn, set_cpu_math_library_num_threads
            config.disable_gpu()
            if enable_mkldnn:
                # cache 10 different shapes for mkldnn to avoid memory leak
                config.set_mkldnn_cache_capacity(10)
                config.enable_mkldnn()
            config.set_cpu_math_library_num_threads(cpu_threads)
        elif device == "xpu":
            # set XPU configs accordingly
            config.enable_xpu(100)

        # Zero-copy input/output handles instead of feed/fetch ops.
        config.switch_use_feed_fetch_ops(False)
        self.predictor = inference.create_predictor(config)
        self.input_handles = [
            self.predictor.get_input_handle(name)
            for name in self.predictor.get_input_names()
        ]
        # Only the first output (logits) is consumed.
        self.output_handle = self.predictor.get_output_handle(
            self.predictor.get_output_names()[0])

    def predict(self, wavs):
        """Classify wave files; returns predicted class indices (np.ndarray)."""
        feats = extract_features(wavs)

        self.input_handles[0].copy_from_cpu(feats)
        self.predictor.run()
        logits = self.output_handle.copy_to_cpu()
        # softmax then argmax over the class axis.
        probs = softmax(logits, axis=1)
        indices = np.argmax(probs, axis=1)

        return indices
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Define predictor to do prediction.
    predictor = Predictor(args.model_dir, args.device, args.batch_size,
                          args.use_tensorrt, args.precision, args.cpu_threads,
                          args.enable_mkldnn)

    # Demo inputs; expanded to absolute paths below.
    wavs = [
        '~/audio_demo_resource/cat.wav',
        '~/audio_demo_resource/dog.wav',
    ]

    for i in range(len(wavs)):
        wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
        assert os.path.isfile(
            wavs[i]), f'Please check input wave file: {wavs[i]}'

    results = predictor.predict(wavs)
    for idx, wav in enumerate(wavs):
        # Map each predicted class index to its ESC50 label name.
        print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')
|
@ -0,0 +1,45 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from model import SoundClassifier
|
||||||
|
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
|
||||||
|
# Command-line interface for the static-model export script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Rebuild the fine-tuned classifier and load trained weights.
    model = SoundClassifier(
        backbone=cnn14(pretrained=False, extract_embedding=True),
        num_class=len(ESC50.label_list))
    model.set_state_dict(paddle.load(args.checkpoint))
    model.eval()

    # Trace to a static graph. The input is (batch, num_frames, 64) mel
    # features with dynamic batch and frame dimensions.
    model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(
                shape=[None, None, 64], dtype=paddle.float32)
        ])

    # Save in static graph model.
    paddle.jit.save(model, os.path.join(args.output_dir, "inference"))
|
@ -0,0 +1,36 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import paddle.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
class SoundClassifier(nn.Layer):
    """Sound-classification head on top of a pretrained PANNs backbone.

    The backbone turns single-channel mel features into an embedding; a
    dropout layer plus one linear layer maps the embedding to per-class
    logits.
    """

    def __init__(self, backbone, num_class, dropout=0.1):
        super(SoundClassifier, self).__init__()
        self.backbone = backbone
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.backbone.emb_size, num_class)

    def forward(self, x):
        """Map (batch, num_frames, num_melbins) features to class logits."""
        # Insert the channel axis expected by the backbone:
        # (batch, num_frames, num_melbins) -> (batch, 1, num_frames, num_melbins)
        embedding = self.backbone(x.unsqueeze(1))
        return self.fc(self.dropout(embedding))
|
@ -0,0 +1,61 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import paddle.nn.functional as F
|
||||||
|
from model import SoundClassifier
|
||||||
|
|
||||||
|
from paddleaudio.backends import load as load_audio
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.features import melspectrogram
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
|
||||||
|
# Command-line interface for the single-file prediction script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features(file: str, **kwargs):
    """Load one audio file and return its (num_frames, num_melbins) mel feature."""
    samples, sample_rate = load_audio(file, sr=None)
    return melspectrogram(samples, sample_rate, **kwargs).transpose()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    paddle.set_device(args.device)

    # Rebuild the classifier and load fine-tuned weights from the checkpoint.
    model = SoundClassifier(
        backbone=cnn14(pretrained=False, extract_embedding=True),
        num_class=len(ESC50.label_list))
    model.set_state_dict(paddle.load(args.checkpoint))
    model.eval()

    # Add a leading batch axis: (num_frames, num_melbins) -> (1, ...).
    feat = np.expand_dims(extract_features(args.wav), 0)
    feat = paddle.to_tensor(feat)
    logits = model(feat)
    probs = F.softmax(logits, axis=1).numpy()

    # Class indices sorted by descending probability.
    sorted_indices = (-probs[0]).argsort()

    msg = f'[{args.wav}]\n'
    for idx in sorted_indices[:args.top_k]:
        msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
    print(msg)
|
@ -0,0 +1,149 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from model import SoundClassifier
|
||||||
|
|
||||||
|
from paddleaudio.datasets import ESC50
|
||||||
|
from paddleaudio.models.panns import cnn14
|
||||||
|
from paddleaudio.utils import logger
|
||||||
|
from paddleaudio.utils import Timer
|
||||||
|
|
||||||
|
# Command-line interface for the ESC50 fine-tuning script.
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.")
parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.")
args = parser.parse_args()
# yapf: enable
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    paddle.set_device(args.device)
    nranks = paddle.distributed.get_world_size()
    # Multi-GPU: initialize the parallel environment once per process.
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    local_rank = paddle.distributed.get_rank()

    # Pretrained PANNs backbone + fresh ESC50 classification head.
    backbone = cnn14(pretrained=True, extract_embedding=True)
    model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
    model = paddle.DataParallel(model)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.learning_rate, parameters=model.parameters())
    criterion = paddle.nn.loss.CrossEntropyLoss()

    train_ds = ESC50(mode='train', feat_type='melspectrogram')
    dev_ds = ESC50(mode='dev', feat_type='melspectrogram')

    # Distributed sampler so each rank sees a distinct shard per epoch.
    train_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
    train_loader = paddle.io.DataLoader(
        train_ds,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        return_list=True,
        use_buffer_reader=True, )

    steps_per_epoch = len(train_sampler)
    timer = Timer(steps_per_epoch * args.epochs)
    timer.start()

    for epoch in range(1, args.epochs + 1):
        model.train()

        # Running statistics, reset after every log_freq steps.
        avg_loss = 0
        num_corrects = 0
        num_samples = 0
        for batch_idx, batch in enumerate(train_loader):
            feats, labels = batch
            logits = model(feats)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            # Step the LR scheduler only when one is actually configured.
            if isinstance(optimizer._learning_rate,
                          paddle.optimizer.lr.LRScheduler):
                optimizer._learning_rate.step()
            optimizer.clear_grad()

            # Calculate loss
            avg_loss += loss.numpy()[0]

            # Calculate metrics
            preds = paddle.argmax(logits, axis=1)
            num_corrects += (preds == labels).numpy().sum()
            num_samples += feats.shape[0]

            timer.count()

            # Periodic training log on rank 0 only.
            if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0:
                lr = optimizer.get_lr()
                avg_loss /= args.log_freq
                avg_acc = num_corrects / num_samples

                print_msg = 'Epoch={}/{}, Step={}/{}'.format(
                    epoch, args.epochs, batch_idx + 1, steps_per_epoch)
                print_msg += ' loss={:.4f}'.format(avg_loss)
                print_msg += ' acc={:.4f}'.format(avg_acc)
                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
                    lr, timer.timing, timer.eta)
                logger.train(print_msg)

                avg_loss = 0
                num_corrects = 0
                num_samples = 0

            # Evaluate + checkpoint at the LAST step of every save_freq-th
            # epoch, on rank 0 only.
            if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
                dev_sampler = paddle.io.BatchSampler(
                    dev_ds,
                    batch_size=args.batch_size,
                    shuffle=False,
                    drop_last=False)
                dev_loader = paddle.io.DataLoader(
                    dev_ds,
                    batch_sampler=dev_sampler,
                    num_workers=args.num_workers,
                    return_list=True, )

                model.eval()
                num_corrects = 0
                num_samples = 0
                with logger.processing('Evaluation on validation dataset'):
                    for batch_idx, batch in enumerate(dev_loader):
                        feats, labels = batch
                        logits = model(feats)

                        preds = paddle.argmax(logits, axis=1)
                        num_corrects += (preds == labels).numpy().sum()
                        num_samples += feats.shape[0]

                print_msg = '[Evaluation result]'
                print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)

                logger.eval(print_msg)

                # Save model
                save_dir = os.path.join(args.checkpoint_dir,
                                        'epoch_{}'.format(epoch))
                logger.info('Saving model checkpoint to {}'.format(save_dir))
                paddle.save(model.state_dict(),
                            os.path.join(save_dir, 'model.pdparams'))
                paddle.save(optimizer.state_dict(),
                            os.path.join(save_dir, 'model.pdopt'))
|
@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .backends import *
|
||||||
|
from .features import *
|
@ -0,0 +1,14 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .audio import *
|
@ -0,0 +1,303 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import warnings
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Tuple
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import resampy
|
||||||
|
import soundfile as sf
|
||||||
|
from numpy import ndarray as array
|
||||||
|
from scipy.io import wavfile
|
||||||
|
|
||||||
|
from ..utils import ParameterError
|
||||||
|
|
||||||
|
# Public API of this audio-backend module.
__all__ = [
    'resample',
    'to_mono',
    'depth_convert',
    'normalize',
    'save_wav',
    'load',
]
# Supported normalization strategies.
# NOTE(review): the name carries a historical typo ("NORMALMIZE").
NORMALMIZE_TYPES = ['linear', 'gaussian']
# Strategies for merging stereo channels into mono (see to_mono()).
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
# Resampy filter names accepted by resample().
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
# Small constant guarding divisions and exact full-scale scaling.
EPS = 1e-8
|
||||||
|
|
||||||
|
|
||||||
|
def resample(y: array, src_sr: int, target_sr: int,
             mode: str='kaiser_fast') -> array:
    """ Audio resampling

    This function is the same as using resampy.resample().

    Args:
        y: Input waveform as a numpy array.
        src_sr: Source sample rate.
        target_sr: Target sample rate.
        mode: Resampy filter; one of RESAMPLE_MODES.

    Notes:
        The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_best'

    Raises:
        ParameterError: If y is not a numpy array or mode is unsupported.
    """

    if mode == 'kaiser_best':
        warnings.warn(
            f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
we recommend the mode kaiser_fast in large scale audio trainning')

    if not isinstance(y, np.ndarray):
        # BUG FIX: the message was missing the f-prefix, so "{type(y)}" was
        # emitted literally instead of the offending type.
        raise ParameterError(
            f'Only support numpy array, but received y in {type(y)}')

    if mode not in RESAMPLE_MODES:
        raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')

    return resampy.resample(y, src_sr, target_sr, filter=mode)
|
||||||
|
|
||||||
|
|
||||||
|
def to_mono(y: array, merge_type: str='average') -> array:
    """Merge a stereo waveform into mono.

    Args:
        y: Mono (1-D) or stereo (2-D, channel-first) waveform.
        merge_type: One of MERGE_TYPES -- 'ch0'/'ch1' pick one channel,
            'random' picks a channel at random, 'average' mixes both.

    Returns:
        1-D waveform with the same dtype as the input.

    Raises:
        ParameterError: On unknown merge_type, y.ndim > 2, or (for
            'average') an unsupported dtype.
    """
    if merge_type not in MERGE_TYPES:
        raise ParameterError(
            f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
        )
    if y.ndim > 2:
        raise ParameterError(
            f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
    if y.ndim == 1:  # nothing to merge
        return y

    if merge_type == 'ch0':
        return y[0]
    if merge_type == 'ch1':
        return y[1]
    if merge_type == 'random':
        return y[np.random.randint(0, 2)]

    # Averaging must respect the dtype to avoid integer overflow.
    if y.dtype in ('float32', 'float64'):
        # GENERALIZED: float64 input is now averaged the same way as
        # float32 (previously it fell through to the error branch).
        y_out = (y[0] + y[1]) * 0.5
    elif y.dtype == 'int16':
        # Widen to int32 for the sum, then clip back to int16.
        y_out = y.astype('int32')
        y_out = (y_out[0] + y_out[1]) // 2
        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
                        np.iinfo(y.dtype).max).astype(y.dtype)

    elif y.dtype == 'int8':
        # Widen to int16 for the sum, then clip back to int8.
        y_out = y.astype('int16')
        y_out = (y_out[0] + y_out[1]) // 2
        y_out = np.clip(y_out, np.iinfo(y.dtype).min,
                        np.iinfo(y.dtype).max).astype(y.dtype)
    else:
        raise ParameterError(f'Unsupported dtype: {y.dtype}')
    return y_out
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_cast(y: array, dtype: Union[type, str]) -> array:
|
||||||
|
""" data type casting in a safe way, i.e., prevent overflow or underflow
|
||||||
|
|
||||||
|
This function is used internally.
|
||||||
|
"""
|
||||||
|
return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
|
||||||
|
|
||||||
|
|
||||||
|
def depth_convert(y: array, dtype: Union[type, str],
                  dithering: bool=True) -> array:
    """Convert audio array to target dtype safely

    This function convert audio waveform to a target dtype, with addition steps of
    preventing overflow/underflow and preserving audio range.

    NOTE(review): the `dithering` flag is accepted but never used by this
    implementation -- confirm whether dithering was intended.
    """

    SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
    if y.dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype not in SUPPORT_DTYPE:
        raise ParameterError(
            'Unsupported audio dtype, '
            f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')

    if dtype == y.dtype:
        return y

    # float <-> float: clip-and-cast only.
    # NOTE(review): _safe_cast() as written calls np.iinfo, which raises
    # ValueError for float dtypes -- these two branches look broken; confirm.
    if dtype == 'float64' and y.dtype == 'float32':
        return _safe_cast(y, dtype)
    if dtype == 'float32' and y.dtype == 'float64':
        return _safe_cast(y, dtype)

    if dtype == 'int16' or dtype == 'int8':
        if y.dtype in ['float64', 'float32']:
            # float -> int: scale [-1, 1] up to integer full scale, clip, cast.
            factor = np.iinfo(dtype).max
            y = np.clip(y * factor, np.iinfo(dtype).min,
                        np.iinfo(dtype).max).astype(dtype)
            y = y.astype(dtype)
        else:
            if dtype == 'int16' and y.dtype == 'int8':
                # int8 -> int16: scale by the ratio of full-scale maxima
                # (minus EPS so full-scale input does not overflow).
                factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
                y = y.astype('float32') * factor
                y = y.astype('int16')

            else:  # dtype == 'int8' and y.dtype=='int16':
                # int16 -> int8: scale down via int32 to avoid overflow.
                y = y.astype('int32') * np.iinfo('int8').max / \
                    np.iinfo('int16').max
                y = y.astype('int8')

    if dtype in ['float32', 'float64']:
        # int -> float: normalize by the source integer full-scale maximum.
        org_dtype = y.dtype
        y = y.astype(dtype) / np.iinfo(org_dtype).max
    return y
|
||||||
|
|
||||||
|
|
||||||
|
def sound_file_load(file: str,
                    offset: Optional[float]=None,
                    dtype: str='int16',
                    duration: Optional[int]=None) -> Tuple[array, int]:
    """Load audio from disk with the soundfile (libsndfile) backend.

    Seeks to `offset` seconds when given and reads `duration` seconds
    (the whole remainder when None), returning the transposed samples
    together with the file's sample rate.

    Reference:
        http://www.mega-nerd.com/libsndfile/#Features
    """
    with sf.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            sf_desc.seek(int(offset * sr_native))
        frame_duration = -1 if duration is None else int(duration * sr_native)
        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T

    return y, sf_desc.samplerate
|
||||||
|
|
||||||
|
|
||||||
|
def audio_file_load():
    """Load audio using the audiofile library.

    Placeholder for an audiofile-based backend; not implemented yet.

    Reference:
        https://audiofile.68k.org/
    """
    raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
def sox_file_load():
    """Load audio using the sox library.

    Placeholder for a sox-based backend; not implemented yet.

    Reference:
        http://sox.sourceforge.net/
    """
    raise NotImplementedError()
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(y: array, norm_type: str='linear',
              mul_factor: float=1.0) -> array:
    """Normalize an audio waveform and apply an extra multiplier.

    'linear' scales by the peak absolute amplitude; 'gaussian' standardizes
    to zero mean and unit standard deviation. EPS guards against division
    by zero in both modes.
    """
    if norm_type == 'linear':
        peak = np.max(np.abs(y))
        scale = 1.0 / (peak + EPS)
        return y * scale * mul_factor
    if norm_type == 'gaussian':
        centered = y - np.mean(y)
        spread = max(np.std(y), EPS)
        return mul_factor * centered / spread
    raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
|
||||||
|
|
||||||
|
|
||||||
|
def save_wav(y: array, sr: int, file: str) -> None:
    """Save audio file to disk.

    This function saves audio to disk using scipy.io.wavfile, with an
    additional step to convert the input waveform to int16 unless it
    already is an integer type.

    Parameters:
        y: Waveform to write; non-integer input is converted to int16.
        sr: Sample rate in Hz; must be positive.
        file: Destination path; must end with '.wav'.

    Raises:
        ParameterError: If the file name is not a .wav path or sr <= 0.

    Notes:
        It only supports raw wav format.
    """
    if not file.endswith('.wav'):
        raise ParameterError(
            f'only .wav file supported, but dst file name is: {file}')

    if sr <= 0:
        # Fixed typo in the error message: "recieved" -> "received".
        raise ParameterError(
            f'Sample rate should be larger than 0, received sr = {sr}')

    if y.dtype not in ['int16', 'int8']:
        warnings.warn(
            f'input data type is {y.dtype}, will convert data to int16 format before saving'
        )
        y_out = depth_convert(y, 'int16')
    else:
        y_out = y

    wavfile.write(file, sr, y_out)
|
||||||
|
|
||||||
|
|
||||||
|
def load(
        file: str,
        sr: Optional[int]=None,
        mono: bool=True,
        merge_type: str='average',  # ch0,ch1,random,average
        normal: bool=True,
        norm_type: str='linear',
        norm_mul_factor: float=1.0,
        offset: float=0.0,
        duration: Optional[int]=None,
        dtype: str='float32',
        resample_mode: str='kaiser_fast') -> Tuple[array, int]:
    """Load audio file from disk.

    This function loads audio from disk using an audio backend
    (currently sound_file_load / libsndfile).

    Parameters:
        file: Path of the audio file to read.
        sr: Target sample rate; when given and different from the file's
            native rate, the signal is resampled.
        mono: If True, merge multi-channel audio into a single channel.
        merge_type: Channel-merge strategy forwarded to to_mono
            (ch0, ch1, random, average).
        normal: If True, normalize the waveform via normalize().
        norm_type: Normalization method forwarded to normalize().
        norm_mul_factor: Extra multiplier forwarded to normalize().
        offset: Start position in seconds.
        duration: Amount of audio to read in seconds; None reads to the end.
        dtype: Output dtype; integer targets are normalized before the
            depth conversion.
        resample_mode: Filter mode forwarded to resample().

    Returns:
        Tuple of (waveform, sample_rate).

    Raises:
        ParameterError: If the loaded audio looks empty.
    """

    y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)

    # Reject empty signals: 1-D mono with no samples, or 2-D
    # multi-channel whose first channel has no samples.
    if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
        raise ParameterError(f'audio file {file} looks empty')

    if mono:
        y = to_mono(y, merge_type)

    # Resample only when a different target rate was requested.
    if sr is not None and sr != r:
        y = resample(y, r, sr, mode=resample_mode)
        r = sr

    if normal:
        y = normalize(y, norm_type, norm_mul_factor)
    elif dtype in ['int8', 'int16']:
        # Integer targets still require normalization into [-1, 1]
        # before depth conversion, even when `normal` is False.
        y = normalize(y, 'linear', 1.0)

    y = depth_convert(y, dtype)
    return y, r
|
@ -0,0 +1,34 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .aishell import AISHELL1
|
||||||
|
from .dcase import UrbanAcousticScenes
|
||||||
|
from .dcase import UrbanAudioVisualScenes
|
||||||
|
from .esc50 import ESC50
|
||||||
|
from .gtzan import GTZAN
|
||||||
|
from .librispeech import LIBRISPEECH
|
||||||
|
from .ravdess import RAVDESS
|
||||||
|
from .tess import TESS
|
||||||
|
from .urban_sound import UrbanSound8K
|
||||||
|
|
||||||
|
# Public dataset classes re-exported at the package level.
__all__ = [
    'AISHELL1',
    'LIBRISPEECH',
    'ESC50',
    'UrbanSound8K',
    'GTZAN',
    'UrbanAcousticScenes',
    'UrbanAudioVisualScenes',
    'RAVDESS',
    'TESS',
]
|
@ -0,0 +1,154 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import codecs
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..backends import load as load_audio
|
||||||
|
from ..utils.download import decompress
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from ..utils.log import logger
|
||||||
|
from .dataset import feat_funcs
|
||||||
|
|
||||||
|
__all__ = ['AISHELL1']
|
||||||
|
|
||||||
|
|
||||||
|
class AISHELL1(Dataset):
    """
    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
    It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including
    smart home, autonomous driving, and industrial production. The whole recording was
    put in quiet indoor environment, using 3 different devices at the same time: high
    fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit),
    iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled
    to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas
    in China were invited to participate in the recording. The manual transcription
    accuracy rate is above 95%, through professional speech annotation and strict
    quality inspection. The corpus is divided into training, development and testing
    sets.

    Reference:
        AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
        https://arxiv.org/abs/1709.05522
    """

    # Download sources. NOTE: 'archieves' is a historical misspelling kept
    # because the download utilities use this attribute name.
    archieves = [
        {
            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
            'md5': '2f494334227864a8a8fec932999db9d8',
        },
    ]
    # Paths below are relative to DATA_HOME.
    text_meta = os.path.join('data_aishell', 'transcript',
                             'aishell_transcript_v0.8.txt')
    # One record per utterance: audio path, utterance id, transcript text.
    utt_info = collections.namedtuple('META_INFO',
                                      ('file_path', 'utt_id', 'text'))
    audio_path = os.path.join('data_aishell', 'wav')
    manifest_path = os.path.join('data_aishell', 'manifest')
    subset = ['train', 'dev', 'test']

    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
        """Create the dataset for one subset ('train', 'dev' or 'test').

        feat_type selects the extractor from feat_funcs ('raw' keeps the
        waveform); extra keyword arguments are forwarded to it.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(AISHELL1, self).__init__()

    def _get_text_info(self) -> Dict[str, str]:
        # Build a mapping of utt_id -> transcript with all whitespace
        # removed from the transcript text.
        ret = {}
        with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
            # NOTE(review): [1:] skips the first transcript line as if it
            # were a header — confirm the transcript file actually has one.
            for line in rf.readlines()[1:]:
                utt_id, text = map(str.strip, line.split(' ',
                                                         1))  # utt_id, text
                ret.update({utt_id: ''.join(text.split())})
        return ret

    def _get_data(self):
        # Download and decompress on first use, then pair each .wav file
        # of the selected subset with its transcript.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
            download_and_decompress(self.archieves, DATA_HOME)
            # Extract *wav from *.tar.gz.
            for root, _, files in os.walk(
                    os.path.join(DATA_HOME, self.audio_path)):
                for file in files:
                    if file.endswith('.tar.gz'):
                        decompress(os.path.join(root, file))
                        os.remove(os.path.join(root, file))

        text_info = self._get_text_info()

        data = []
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.wav'):
                    utt_id = os.path.splitext(file)[0]
                    if utt_id not in text_info:  # There are some utt_id that without label
                        continue
                    text = text_info[utt_id]
                    file_path = os.path.join(root, file)
                    data.append(self.utt_info(file_path, utt_id, text))

        return data

    def _convert_to_record(self, idx: int):
        # Expand one namedtuple sample into a dict and attach the loaded
        # feature plus the clip duration in seconds.
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(
            sample[0])  # The first element of sample is file path
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sample_rate=sr,
            **self.feat_config) if feat_func else waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """Write one JSON line per utterance to '<prefix>.<subset>'."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))

        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
                                     f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text']
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        # Returns the record values in insertion order:
        # (file_path, utt_id, text, feat, duration).
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        return len(self._data)
|
@ -0,0 +1,82 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
|
||||||
|
from ..backends import load as load_audio
|
||||||
|
from ..features import melspectrogram
|
||||||
|
from ..features import mfcc
|
||||||
|
|
||||||
|
# Maps a feat_type name to its feature-extraction callable; 'raw' means
# no extraction (the waveform itself is used as the feature).
feat_funcs = {
    'raw': None,
    'melspectrogram': melspectrogram,
    'mfcc': mfcc,
}
|
||||||
|
|
||||||
|
|
||||||
|
class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Subclasses provide the (files, labels) pairs; this class handles audio
    loading and per-sample feature extraction.
    """

    def __init__(self,
                 files: List[str],
                 labels: List[int],
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.

        Raises:
            RuntimeError: If feat_type is not a key of feat_funcs.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in feat_funcs.keys():
            raise RuntimeError(
                f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
            )

        self.files = files
        self.labels = labels

        self.feat_type = feat_type
        self.feat_config = kwargs  # Pass keyword arguments to customize feature config

    def _get_data(self, input_file: str):
        # Subclasses must build the (files, labels) lists from their metadata.
        raise NotImplementedError

    def _convert_to_record(self, idx):
        # Load one audio file and compute its feature ('raw' keeps the
        # waveform unchanged).
        file, label = self.files[idx], self.labels[idx]

        waveform, sample_rate = load_audio(file)
        feat_func = feat_funcs[self.feat_type]

        record = {}
        record['feat'] = feat_func(
            waveform, sample_rate,
            **self.feat_config) if feat_func else waveform
        record['label'] = label
        return record

    def __getitem__(self, idx):
        # Returns (feature, label) as numpy arrays; the feature is
        # transposed — NOTE(review): the resulting axis order depends on
        # what feat_func emits; confirm against the chosen feature.
        record = self._convert_to_record(idx)
        return np.array(record['feat']).transpose(), np.array(
            record['label'], dtype=np.int64)

    def __len__(self):
        return len(self.files)
|
@ -0,0 +1,298 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanAcousticScenes(AudioClassificationDataset):
    """
    TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
    12 European cities in 10 different acoustic scenes using 4 different devices.
    Additionally, synthetic data for 11 mobile devices was created based on the original
    recordings. Of the 12 cities, two are present only in the evaluation set.

    Reference:
        A multi-device dataset for urban acoustic scene classification
        https://arxiv.org/abs/1807.09840
    """

    source_url = 'https://zenodo.org/record/3819968/files/'
    base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
    # Download sources (meta + 16 audio archives). NOTE: 'archieves' is a
    # historical misspelling kept for the download utilities.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': 'ed38956c4246abb56190c1e9b602b7b8',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '97ab8560056b6816808dedc044dcc023',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'fbf856a3a86fff7520549c899dc94372',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '0dbffe7b6e45564da649378723284062',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
        },
        {
            'url': source_url + base_name + '.audio.9.zip',
            'md5': 'a65596a5372eab10c78e08a0de797c9e',
        },
        {
            'url': source_url + base_name + '.audio.10.zip',
            'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
        },
        {
            'url': source_url + base_name + '.audio.11.zip',
            'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
        },
        {
            'url': source_url + base_name + '.audio.12.zip',
            'md5': 'e5f4400c6b9697295fab4cf507155a2f',
        },
        {
            'url': source_url + base_name + '.audio.13.zip',
            'md5': '8855ab9f9896422746ab4c5d89d8da2f',
        },
        {
            'url': source_url + base_name + '.audio.14.zip',
            'md5': '092ad744452cd3e7de78f988a3d13020',
        },
        {
            'url': source_url + base_name + '.audio.15.zip',
            'md5': '4b5eb85f6592aebf846088d9df76b420',
        },
        {
            'url': source_url + base_name + '.audio.16.zip',
            'md5': '2e0a89723e58a3836be019e6996ae460',
        },
    ]
    # Scene classes; a sample's label is its index in this list.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]

    # Metadata files (relative to DATA_HOME) and their row layouts.
    meta = os.path.join(base_name, 'meta.csv')
    meta_info = collections.namedtuple('META_INFO', (
        'filename', 'scene_label', 'identifier', 'source_label'))
    subset_meta = {
        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
        'dev':
        os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
                                              ('filename', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode)
        super(UrbanAcousticScenes, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        # Parse a tab-separated metadata file into namedtuples; with no
        # subset given, the full meta.csv (4 columns) is used.
        if subset is None:
            meta_file = self.meta
            meta_info = self.meta_info
        else:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            meta_info = self.subset_meta_info

        ret = []
        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()[1:] if skip_header else rf.readlines()
            for line in lines:
                ret.append(meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        # Download on first use, then map each metadata row to an absolute
        # audio path and the index of its scene label.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info(subset=mode, skip_header=True)

        files = []
        labels = []
        for sample in meta_info:
            filename, label = sample[:2]
            filename = os.path.basename(filename)
            target = self.label_list.index(label)

            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
            labels.append(int(target))

        return files, labels
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanAudioVisualScenes(AudioClassificationDataset):
    """
    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
    and video recordings from 12 European cities in 10 different scenes.
    This dataset consists of 10-seconds audio and video segments from 10
    acoustic scenes. The total amount of audio in the development set is 34 hours.

    Reference:
        A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
        https://arxiv.org/abs/2011.00030
    """

    source_url = 'https://zenodo.org/record/4477542/files/'
    base_name = 'TAU-urban-audio-visual-scenes-2021-development'

    # Download sources (meta + 8 audio archives). NOTE: 'archieves' is a
    # historical misspelling kept for the download utilities.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '76e3d7ed5291b118372e06379cb2b490',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '7fd6bb63127f5785874a55aba4e77aa5',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': '61396bede29d7c8c89729a01a6f6b2e2',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '6ddac89717fcf9c92c451868eed77fe1',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'af4820756cdf1a7d4bd6037dc034d384',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '2be39a76aeed704d5929d020a2909efd',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': '972d8afe0874720fc2f28086e7cb22a9',
        },
    ]
    # Scene classes; a sample's label is its index in this list.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]

    # Metadata files (relative to DATA_HOME) and their row layouts; rows
    # carry both audio and video filenames.
    meta_base_path = os.path.join(base_name, base_name + '.meta')
    meta = os.path.join(meta_base_path, 'meta.csv')
    meta_info = collections.namedtuple('META_INFO', (
        'filename_audio', 'filename_video', 'scene_label', 'identifier'))
    subset_meta = {
        'train':
        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
        'dev':
        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
        'test':
        os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
        'filename_audio', 'filename_video', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode)
        super(UrbanAudioVisualScenes, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        # Parse a tab-separated metadata file into namedtuples; with no
        # subset given, the full meta.csv (4 columns) is used.
        if subset is None:
            meta_file = self.meta
            meta_info = self.meta_info
        else:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            meta_info = self.subset_meta_info

        ret = []
        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()[1:] if skip_header else rf.readlines()
            for line in lines:
                ret.append(meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        # Download on first use (note: into a base_name subdirectory, unlike
        # the acoustic-scenes dataset), then map each metadata row to an
        # absolute audio path and the index of its scene label.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves,
                                    os.path.join(DATA_HOME, self.base_name))

        meta_info = self._get_meta_info(subset=mode, skip_header=True)

        files = []
        labels = []
        for sample in meta_info:
            filename, _, label = sample[:3]  # Video filename is ignored.
            filename = os.path.basename(filename)
            target = self.label_list.index(label)

            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
            labels.append(int(target))

        return files, labels
|
@ -0,0 +1,152 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['ESC50']
|
||||||
|
|
||||||
|
|
||||||
|
class ESC50(AudioClassificationDataset):
    """
    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
    suitable for benchmarking methods of environmental sound classification. The dataset
    consists of 5-second-long recordings organized into 50 semantical classes (with
    40 examples per class)

    Reference:
        ESC: Dataset for Environmental Sound Classification
        http://dx.doi.org/10.1145/2733373.2806390
    """

    # Download source. NOTE: 'archieves' is a historical misspelling kept
    # for the download utilities.
    archieves = [
        {
            'url':
            'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
            'md5': '7771e4b9d86d0945acce719c7a59305a',
        },
    ]
    # The 50 classes; a sample's label is its index in this list, which
    # matches the 'target' column of esc50.csv.
    label_list = [
        # Animals
        'Dog',
        'Rooster',
        'Pig',
        'Cow',
        'Frog',
        'Cat',
        'Hen',
        'Insects (flying)',
        'Sheep',
        'Crow',
        # Natural soundscapes & water sounds
        'Rain',
        'Sea waves',
        'Crackling fire',
        'Crickets',
        'Chirping birds',
        'Water drops',
        'Wind',
        'Pouring water',
        'Toilet flush',
        'Thunderstorm',
        # Human, non-speech sounds
        'Crying baby',
        'Sneezing',
        'Clapping',
        'Breathing',
        'Coughing',
        'Footsteps',
        'Laughing',
        'Brushing teeth',
        'Snoring',
        'Drinking, sipping',
        # Interior/domestic sounds
        'Door knock',
        'Mouse click',
        'Keyboard typing',
        'Door, wood creaks',
        'Can opening',
        'Washing machine',
        'Vacuum cleaner',
        'Clock alarm',
        'Clock tick',
        'Glass breaking',
        # Exterior/urban noises
        'Helicopter',
        'Chainsaw',
        'Siren',
        'Car horn',
        'Engine',
        'Train',
        'Church bells',
        'Airplane',
        'Fireworks',
        'Hand saw',
    ]
    # Metadata file (relative to DATA_HOME) and its comma-separated row layout.
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold used as the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        # Parse esc50.csv (skipping the header row) into namedtuples.
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines()[1:]:
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        # Download on first use, then do a cross-validation split by fold:
        # train = every fold except `split`; dev/test = fold == `split`.
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()

        files = []
        labels = []
        for sample in meta_info:
            filename, fold, target, _, _, _, _ = sample
            if mode == 'train' and int(fold) != split:
                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
                labels.append(int(target))

            if mode != 'train' and int(fold) == split:
                files.append(os.path.join(DATA_HOME, self.audio_path, filename))
                labels.append(int(target))

        return files, labels
|
@ -0,0 +1,115 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['GTZAN']
|
||||||
|
|
||||||
|
|
||||||
|
class GTZAN(AudioClassificationDataset):
    """
    The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
    each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
    in machine listening research for music genre recognition (MGR).

    Reference:
        Musical genre classification of audio signals
        https://ieeexplore.ieee.org/document/1021072/
    """

    archieves = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = [
        'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
        'pop', 'reggae', 'rock'
    ]
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse the GTZAN meta file (tab-separated) into META_INFO records."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines():
                ret.append(self.meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect audio file paths and integer labels for the mode/split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()
        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            meta_info
        )  # make sure using the same seed to create train and dev dataset

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            file_path, label = sample
            filename = os.path.basename(file_path)
            target = self.label_list.index(label)
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

        return files, labels
|
@ -0,0 +1,199 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import codecs
|
||||||
|
import collections
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
from paddle.io import Dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from ..backends import load as load_audio
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from ..utils.log import logger
|
||||||
|
from .dataset import feat_funcs
|
||||||
|
|
||||||
|
__all__ = ['LIBRISPEECH']
|
||||||
|
|
||||||
|
|
||||||
|
class LIBRISPEECH(Dataset):
    """
    LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
    prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
    derived from read audiobooks from the LibriVox project, and has been carefully
    segmented and aligned.

    Reference:
        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
        https://arxiv.org/abs/1709.05522
    """

    source_url = 'http://www.openslr.org/resources/12/'
    archieves = [
        {
            'url': source_url + 'train-clean-100.tar.gz',
            'md5': '2a93770f6d5c6c964bc36631d331a522',
        },
        {
            'url': source_url + 'train-clean-360.tar.gz',
            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
        },
        {
            'url': source_url + 'train-other-500.tar.gz',
            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
        },
        {
            'url': source_url + 'dev-clean.tar.gz',
            'md5': '42e2234ba48799c1f50f24a7926300a1',
        },
        {
            'url': source_url + 'dev-other.tar.gz',
            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
        },
        {
            'url': source_url + 'test-clean.tar.gz',
            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
        },
        {
            'url': source_url + 'test-other.tar.gz',
            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
        },
    ]
    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
    utt_info = collections.namedtuple('META_INFO', (
        'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
    audio_path = 'LibriSpeech'
    manifest_path = os.path.join('LibriSpeech', 'manifest')
    # Valid subsets; names must match the archive names above.
    # 'train-other-500' was previously misspelled as 'train-clean-500',
    # which made that subset impossible to select.
    subset = [
        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
        'dev-other', 'test-clean', 'test-other'
    ]

    def __init__(self,
                 subset: str='train-clean-100',
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            subset (:obj:`str`, `optional`, defaults to `train-clean-100`):
                Which LibriSpeech subset to load; must be one of ``self.subset``.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(LIBRISPEECH, self).__init__()

    def _get_speaker_info(self) -> Dict[str, str]:
        """Parse SPEAKERS.TXT into a mapping of speaker id -> gender."""
        ret = {}
        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
            for line in rf.readlines():
                if ';' in line:  # Skip dataset abstract
                    continue
                spk_id, gender = map(str.strip,
                                     line.split('|')[:2])  # spk_id, gender
                ret.update({spk_id: gender})
        return ret

    def _get_text_info(self, trans_file) -> Dict[str, str]:
        """Parse one *.trans.txt file into a mapping of utterance id -> text."""
        ret = {}
        with open(trans_file, 'r') as rf:
            for line in rf.readlines():
                utt_id, text = map(str.strip, line.split(' ',
                                                         1))  # utt_id, text
                ret.update({utt_id: text})
        return ret

    def _get_data(self):
        """Walk the subset directory and build the list of utt_info records."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
            download_and_decompress(self.archieves, DATA_HOME,
                                    len(self.archieves))

        # Speaker info
        speaker_info = self._get_speaker_info()

        # Text info
        text_info = {}
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.trans.txt'):
                    text_info.update(
                        self._get_text_info(os.path.join(root, file)))

        data = []
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.flac'):
                    utt_id = os.path.splitext(file)[0]
                    spk_id = utt_id.split('-')[0]
                    if utt_id not in text_info \
                            or spk_id not in speaker_info:  # Skip samples with incomplete data
                        continue
                    file_path = os.path.join(root, file)
                    text = text_info[utt_id]
                    spk_gender = speaker_info[spk_id]
                    data.append(
                        self.utt_info(file_path, utt_id, text, spk_id,
                                      spk_gender))

        return data

    def _convert_to_record(self, idx: int):
        """Load the idx-th sample's audio and return it as a feature record dict."""
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(
            sample[0])  # The first element of sample is file path
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sample_rate=sr,
            **self.feat_config) if feat_func else waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """Write one JSON line per sample to `<manifest_path>/<prefix>.<subset>`."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
            os.makedirs(os.path.join(DATA_HOME, self.manifest_path))

        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
                                     f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text'],
                        'spk': record['spk_id'],
                        'gender': record['spk_gender'],
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        return len(self._data)
|
@ -0,0 +1,136 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['RAVDESS']
|
||||||
|
|
||||||
|
|
||||||
|
class RAVDESS(AudioClassificationDataset):
    """
    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
    lexically-matched statements in a neutral North American accent. Speech emotions
    includes calm, happy, sad, angry, fearful, surprise, and disgust expressions.
    Each expression is produced at two levels of emotional intensity (normal, strong),
    with an additional neutral expression.

    Reference:
        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
        A dynamic, multimodal set of facial and vocal expressions in North American English
        https://doi.org/10.1371/journal.pone.0196391
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
            'md5':
            '5411230427d67a21e18aa4d466e6d1b9',
        },
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
            'md5':
            'bc696df654c87fed845eb13823edef8a',
        },
    ]
    label_list = [
        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
        'surprised'
    ]
    # NOTE: field name 'repitition' (sic) is kept as-is for backward
    # compatibility with callers accessing namedtuple fields by name.
    meta_info = collections.namedtuple(
        'META_INFO', ('modality', 'vocal_channel', 'emotion',
                      'emotion_intensity', 'statement', 'repitition', 'actor'))
    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(RAVDESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Decode the '-'-separated metadata encoded in each wav filename."""
        ret = []
        for file in files:
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('-')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect wav file paths and emotion labels for the mode/split."""
        if not os.path.isdir(self.speech_path) and not os.path.isdir(
                self.song_path):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(self.speech_path):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        for root, _, files in os.walk(self.song_path):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            _, _, emotion, _, _, _, _ = sample
            target = int(emotion) - 1  # emotion codes are 1-based
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(wav_files[idx])
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
@ -0,0 +1,126 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['TESS']
|
||||||
|
|
||||||
|
|
||||||
|
class TESS(AudioClassificationDataset):
    """
    TESS is a set of 200 target words were spoken in the carrier phrase
    "Say the word _____' by two actresses (aged 26 and 64 years) and
    recordings were made of the set portraying each of seven emotions(anger,
    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
    There are 2800 stimuli in total.

    Reference:
        Toronto emotional speech set (TESS)
        https://doi.org/10.5683/SP2/E8H2MF
    """

    archieves = [
        {
            'url':
            'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
            'md5':
            '1465311b24d1de704c4c63e4ccc470c7',
        },
    ]
    label_list = [
        'angry',
        'disgust',
        'fear',
        'happy',
        'neutral',
        'ps',  # pleasant surprise
        'sad',
    ]
    meta_info = collections.namedtuple('META_INFO',
                                       ('speaker', 'word', 'emotion'))
    audio_path = 'TESS_Toronto_emotional_speech_set'

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(TESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Decode the '_'-separated metadata encoded in each wav filename."""
        ret = []
        for file in files:
            basename_without_extend = os.path.basename(file)[:-4]
            ret.append(self.meta_info(*basename_without_extend.split('_')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect wav file paths and emotion labels for the mode/split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
            for file in files:
                if file.endswith('.wav'):
                    wav_files.append(os.path.join(root, file))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            _, _, emotion = sample
            target = self.label_list.index(emotion)
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(wav_files[idx])
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
@ -0,0 +1,104 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import collections
|
||||||
|
import os
|
||||||
|
from typing import List
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from ..utils.download import download_and_decompress
|
||||||
|
from ..utils.env import DATA_HOME
|
||||||
|
from .dataset import AudioClassificationDataset
|
||||||
|
|
||||||
|
__all__ = ['UrbanSound8K']
|
||||||
|
|
||||||
|
|
||||||
|
class UrbanSound8K(AudioClassificationDataset):
    """
    UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
    sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
    drilling, enginge_idling, gun_shot, jackhammer, siren, and street_music. The
    classes are drawn from the urban sound taxonomy.

    Reference:
        A Dataset and Taxonomy for Urban Sound Research
        https://dl.acm.org/doi/10.1145/2647868.2655045
    """

    archieves = [
        {
            'url':
            'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
            'md5': '9aa69802bbf37fb986f71ec1483a196e',
        },
    ]
    label_list = [
        "air_conditioner", "car_horn", "children_playing", "dog_bark",
        "drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
        "street_music"
    ]
    meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
    meta_info = collections.namedtuple(
        'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
                      'class_id', 'label'))
    audio_path = os.path.join('UrbanSound8K', 'audio')

    def __init__(self,
                 mode: str='train',
                 split: int=1,
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        # NOTE: the docstring above was previously placed AFTER the super()
        # call, making it a dead string expression instead of a docstring.
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self):
        """Parse the UrbanSound8K meta csv into a list of META_INFO records."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines()[1:]:  # skip the csv header row
                ret.append(self.meta_info(*line.strip().split(',')))
        return ret

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Collect audio file paths and integer labels for the mode/split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()

        files = []
        labels = []
        for sample in meta_info:
            filename, _, _, _, _, fold, target, _ = sample
            if mode == 'train' and int(fold) != split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

            if mode != 'train' and int(fold) == split:
                files.append(
                    os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
                                 filename))
                labels.append(int(target))

        return files, labels
|
@ -0,0 +1,15 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .augment import *
|
||||||
|
from .core import *
|
@ -0,0 +1,170 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from numpy import ndarray as array
|
||||||
|
|
||||||
|
from paddleaudio.backends import depth_convert
|
||||||
|
from paddleaudio.utils import ParameterError
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'depth_augment',
|
||||||
|
'spect_augment',
|
||||||
|
'random_crop1d',
|
||||||
|
'random_crop2d',
|
||||||
|
'adaptive_spect_augment',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def randint(high: int) -> int:
    """Return a uniformly sampled integer from the interval [0, high).

    Helper used by the random data-augmentation routines in this module.
    """
    return int(np.random.randint(high))
|
||||||
|
|
||||||
|
|
||||||
|
def rand() -> float:
    """Return one floating-point number sampled uniformly from [0, 1).

    Helper for random data augmentation.
    """
    # Fix: np.random.rand() returns a scalar directly; the previous
    # float(np.random.rand(1)) converted a size-1 array, which is
    # deprecated in recent NumPy versions.
    return float(np.random.rand())
|
||||||
|
|
||||||
|
|
||||||
|
def depth_augment(y: array,
                  choices: List=['int8', 'int16'],
                  probs: List[float]=[0.5, 0.5]) -> array:
    """Audio bit-depth augmentation.

    Simulates quantization distortion by converting the signal to a
    randomly chosen sample depth and then back to its original dtype.

    Parameters:
        y: input waveform array.
        choices: candidate dtypes to quantize to.
        probs: sampling probability for each entry of ``choices``.
    """
    assert len(probs) == len(
        choices
    ), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))
    target_depth = np.random.choice(choices, p=probs)
    original_depth = y.dtype
    # Round-trip through the lower depth to introduce quantization noise.
    quantized = depth_convert(y, target_depth)
    restored = depth_convert(quantized, original_depth)
    return restored
|
||||||
|
|
||||||
|
|
||||||
|
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
                           level: float=0.1) -> array:
    """Do adaptive spectrogram augmentation (SpecAugment-style masking).

    The strength of the augmentation is governed by the parameter
    ``level``, ranging from 0 to 1, with 0 representing no augmentation.
    Mask widths and mask counts are derived from ``level`` and the
    spectrogram size, so larger inputs get proportionally wider masks.

    Parameters:
        spect: 2D numpy array, the spectrogram to augment.
        tempo_axis: which axis (0 or 1) is the time axis.
        level: augmentation strength in [0, 1].

    Returns:
        The augmented spectrogram. NOTE: ``spect`` is modified in place;
        the returned array is the same object as the input.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    # Mask geometry scales with both the level and the spectrogram size.
    time_mask_width = int(nt * level * 0.5)
    freq_mask_width = int(nf * level * 0.5)

    num_time_mask = int(10 * level)
    num_freq_mask = int(10 * level)

    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0

    return spect
|
||||||
|
|
||||||
|
|
||||||
|
def spect_augment(spect: array,
                  tempo_axis: int=0,
                  max_time_mask: int=3,
                  max_freq_mask: int=3,
                  max_time_mask_width: int=30,
                  max_freq_mask_width: int=20) -> array:
    """Do spectrogram augmentation in both time and freq axis.

    The number of masks and each mask's width are drawn uniformly at
    random, bounded (exclusively) by the ``max_*`` parameters.

    Parameters:
        spect: 2D numpy array, the spectrogram to augment.
        tempo_axis: which axis (0 or 1) is the time axis.
        max_time_mask / max_freq_mask: exclusive upper bounds on how many
            masks are applied along each axis.
        max_time_mask_width / max_freq_mask_width: exclusive upper bounds
            on each mask's width.

    Returns:
        The augmented spectrogram. NOTE: ``spect`` is modified in place;
        the returned array is the same object as the input.
    """
    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
    if tempo_axis == 0:
        nt, nf = spect.shape
    else:
        nf, nt = spect.shape

    num_time_mask = randint(max_time_mask)
    num_freq_mask = randint(max_freq_mask)

    time_mask_width = randint(max_time_mask_width)
    freq_mask_width = randint(max_freq_mask_width)

    if tempo_axis == 0:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[start:start + time_mask_width, :] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[:, start:start + freq_mask_width] = 0
    else:
        for _ in range(num_time_mask):
            start = randint(nt - time_mask_width)
            spect[:, start:start + time_mask_width] = 0
        for _ in range(num_freq_mask):
            start = randint(nf - freq_mask_width)
            spect[start:start + freq_mask_width, :] = 0

    return spect
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop1d(y: array, crop_len: int) -> array:
    """Randomly crop a contiguous segment from a 1d input signal.

    The input is a 1d signal, typically a sound waveform.

    Parameters:
        y: 1d numpy array of samples.
        crop_len: length of the cropped segment; must not exceed len(y).

    Returns:
        A view of ``y`` of length ``crop_len`` starting at a random offset.

    Raises:
        ParameterError: if ``y`` is not 1-dimensional.
    """
    if y.ndim != 1:
        # Bug fix: the original branch contained only a bare string
        # literal (a no-op); the intent was clearly to reject the input.
        raise ParameterError('only accept 1d tensor or numpy array')
    n = len(y)
    idx = randint(n - crop_len)
    return y[idx:idx + crop_len]
|
||||||
|
|
||||||
|
|
||||||
|
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
    """Randomly crop a 2D array, typically a spectrogram.

    The crop is taken along the temporal axis of the time-freq input.
    """
    if tempo_axis >= s.ndim:
        raise ParameterError('axis out of range')

    total_len = s.shape[tempo_axis]
    start = randint(high=total_len - crop_len)
    # Build a full-slice selector, then narrow only the temporal axis.
    selector = [slice(None)] * s.ndim
    selector[tempo_axis] = slice(start, start + crop_len)
    return s[tuple(selector)]
|
@ -0,0 +1,576 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import warnings
|
||||||
|
from typing import List
|
||||||
|
from typing import Optional
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import scipy
|
||||||
|
from numpy import ndarray as array
|
||||||
|
from numpy.lib.stride_tricks import as_strided
|
||||||
|
from scipy.signal import get_window
|
||||||
|
|
||||||
|
from paddleaudio.utils import ParameterError
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
'stft',
|
||||||
|
'mfcc',
|
||||||
|
'hz_to_mel',
|
||||||
|
'mel_to_hz',
|
||||||
|
'split_frames',
|
||||||
|
'mel_frequencies',
|
||||||
|
'power_to_db',
|
||||||
|
'compute_fbank_matrix',
|
||||||
|
'melspectrogram',
|
||||||
|
'spectrogram',
|
||||||
|
'mu_encode',
|
||||||
|
'mu_decode',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
    """Pad an array to a target length along a target axis.

    This differs from `np.pad` by centering the data prior to padding,
    analogous to `str.center`.

    Parameters:
        data: array to pad.
        size: target length along ``axis``; must be >= data.shape[axis].
        axis: axis along which to pad.
        **kwargs: forwarded to ``np.pad``; ``mode`` defaults to "constant".

    Raises:
        ParameterError: if ``size`` is smaller than the input length.
    """

    kwargs.setdefault("mode", "constant")
    n = data.shape[axis]
    lpad = int((size - n) // 2)
    lengths = [(0, 0)] * data.ndim
    lengths[axis] = (lpad, int(size - n - lpad))

    if lpad < 0:
        # Bug fix: the original message lacked the f-prefix, so the
        # {size:d} / {n:d} placeholders were emitted literally.
        raise ParameterError(f"Target size ({size:d}) must be "
                             f"at least input size ({n:d})")

    return np.pad(data, lengths, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def split_frames(x: array, frame_length: int, hop_length: int,
                 axis: int=-1) -> array:
    """Slice a data array into (overlapping) frames.

    This function is aligned with librosa.frame.

    Parameters:
        x: input ndarray; frames are taken along ``axis``.
        frame_length: length of each frame.
        hop_length: number of samples advanced between frames.
        axis: -1 to frame the last axis, 0 to frame the first axis.

    Returns:
        A strided (zero-copy, unless a contiguity copy was needed) view
        of ``x`` containing the frames.

    Raises:
        ParameterError: on non-ndarray input, too-short input, invalid
            hop_length, or an axis other than 0 / -1.
    """

    if not isinstance(x, np.ndarray):
        raise ParameterError(
            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")

    if x.shape[axis] < frame_length:
        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
                             f" for frame_length={frame_length:d}")

    if hop_length < 1:
        raise ParameterError(f"Invalid hop_length: {hop_length:d}")

    # as_strided below needs the framed axis to be memory-contiguous;
    # fall back to a copy (with a warning) when it is not.
    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
        warnings.warn(f"librosa.util.frame called with axis={axis} "
                      "on a non-contiguous input. This will result in a copy.")
        x = np.asfortranarray(x)
    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
        warnings.warn(f"librosa.util.frame called with axis={axis} "
                      "on a non-contiguous input. This will result in a copy.")
        x = np.ascontiguousarray(x)

    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
    strides = np.asarray(x.strides)

    # Stride (in bytes) of a single sample along the framed axis.
    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize

    if axis == -1:
        shape = list(x.shape)[:-1] + [frame_length, n_frames]
        strides = list(strides) + [hop_length * new_stride]

    elif axis == 0:
        shape = [n_frames, frame_length] + list(x.shape)[1:]
        strides = [hop_length * new_stride] + list(strides)

    else:
        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")

    return as_strided(x, shape=shape, strides=strides)
|
||||||
|
|
||||||
|
|
||||||
|
def _check_audio(y, mono=True) -> bool:
|
||||||
|
"""Determine whether a variable contains valid audio data.
|
||||||
|
|
||||||
|
The audio y must be a np.ndarray, ether 1-channel or two channel
|
||||||
|
"""
|
||||||
|
if not isinstance(y, np.ndarray):
|
||||||
|
raise ParameterError("Audio data must be of type numpy.ndarray")
|
||||||
|
if y.ndim > 2:
|
||||||
|
raise ParameterError(
|
||||||
|
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if mono and y.ndim == 2:
|
||||||
|
raise ParameterError(
|
||||||
|
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if (mono and len(y) == 0) or (not mono and y.shape[1] < 0):
|
||||||
|
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
|
||||||
|
|
||||||
|
if not np.issubdtype(y.dtype, np.floating):
|
||||||
|
raise ParameterError("Audio data must be floating-point")
|
||||||
|
|
||||||
|
if not np.isfinite(y).all():
|
||||||
|
raise ParameterError("Audio buffer is not finite everywhere")
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def hz_to_mel(frequencies: Union[float, List[float], array],
              htk: bool=False) -> array:
    """Convert frequencies in Hz to the mel scale.

    Aligned with librosa: uses either the HTK formula or the Slaney
    formula (linear below 1 kHz, logarithmic above).
    """
    freq = np.asanyarray(frequencies)

    if htk:
        # HTK mel scale.
        return 2595.0 * np.log10(1.0 + freq / 700.0)

    # Slaney scale: linear region below min_log_hz.
    f_min = 0.0
    f_sp = 200.0 / 3
    mels = (freq - f_min) / f_sp

    # Parameters of the logarithmic region.
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same point, in mels
    logstep = np.log(6.4) / 27.0  # step size within the log region

    if freq.ndim:
        # Vectorized: overwrite entries that fall in the log region.
        in_log = freq >= min_log_hz
        mels[in_log] = min_log_mel + np.log(freq[in_log] / min_log_hz) / logstep
    elif freq >= min_log_hz:
        # Scalar input located in the log region: check directly.
        mels = min_log_mel + np.log(freq / min_log_hz) / logstep

    return mels
|
||||||
|
|
||||||
|
|
||||||
|
def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
    """Convert mel bin numbers back to frequencies in Hz.

    Inverse of ``hz_to_mel``; aligned with librosa.
    """
    mel_array = np.asanyarray(mels)

    if htk:
        # Inverse of the HTK formula.
        return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)

    # Linear region below min_log_mel.
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mel_array

    # Parameters of the logarithmic region.
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same point, in mels
    logstep = np.log(6.4) / 27.0  # step size within the log region

    if mel_array.ndim:
        # Vectorized: overwrite entries that fall in the log region.
        in_log = mel_array >= min_log_mel
        freqs[in_log] = min_log_hz * np.exp(logstep *
                                            (mel_array[in_log] - min_log_mel))
    elif mel_array >= min_log_mel:
        # Scalar input located in the log region: check directly.
        freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))

    return freqs
|
||||||
|
|
||||||
|
|
||||||
|
def mel_frequencies(n_mels: int=128,
                    fmin: float=0.0,
                    fmax: float=11025.0,
                    htk: bool=False) -> array:
    """Compute ``n_mels`` center frequencies, uniformly spaced on the mel scale.

    This function is aligned with librosa; both endpoints are included.
    """
    # Convert the Hz limits to mels, sample uniformly, convert back.
    lower = hz_to_mel(fmin, htk=htk)
    upper = hz_to_mel(fmax, htk=htk)
    return mel_to_hz(np.linspace(lower, upper, n_mels), htk=htk)
|
||||||
|
|
||||||
|
|
||||||
|
def fft_frequencies(sr: int, n_fft: int) -> array:
    """Return the center frequencies (Hz) of the rfft bins for ``n_fft``.

    This function is aligned with librosa: ``1 + n_fft // 2`` points
    from 0 up to (and including) the Nyquist frequency.
    """
    nyquist = float(sr) / 2
    n_bins = int(1 + n_fft // 2)
    return np.linspace(0, nyquist, n_bins, endpoint=True)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_matrix(sr: int,
                         n_fft: int,
                         n_mels: int=128,
                         fmin: float=0.0,
                         fmax: Optional[float]=None,
                         htk: bool=False,
                         norm: str="slaney",
                         dtype: type=np.float32):
    """Compute a mel filter-bank matrix of shape (n_mels, 1 + n_fft // 2).

    This function is aligned with librosa.filters.mel; only the
    "slaney" normalization is supported.

    Parameters:
        sr: sample rate of the signal the filter bank will be applied to.
        n_fft: FFT size; determines the number of frequency bins.
        n_mels: number of mel bands.
        fmin / fmax: frequency range of the filter bank; fmax defaults
            to sr / 2 (Nyquist).
        htk: use the HTK mel scale instead of Slaney's.
        norm: must be "slaney".
        dtype: dtype of the returned weight matrix.

    Raises:
        ParameterError: if ``norm`` is not "slaney".
    """
    if norm != "slaney":
        raise ParameterError('norm must set to slaney')

    if fmax is None:
        fmax = float(sr) / 2

    # Initialize the weights
    n_mels = int(n_mels)
    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)

    # Center freqs of each FFT bin
    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)

    # 'Center freqs' of mel bands - uniformly spaced between limits
    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)

    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fftfreqs)

    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]

        # .. then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper))

    if norm == "slaney":
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm[:, np.newaxis]

    # Only check weights if f_mel[0] is positive
    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
        # This means we have an empty channel somewhere
        warnings.warn("Empty filters detected in mel frequency basis. "
                      "Some channels will produce empty responses. "
                      "Try increasing your sampling rate (and fmax) or "
                      "reducing n_mels.")

    return weights
|
||||||
|
|
||||||
|
|
||||||
|
def stft(x: array,
         n_fft: int=2048,
         hop_length: Optional[int]=None,
         win_length: Optional[int]=None,
         window: str="hann",
         center: bool=True,
         dtype: type=np.complex64,
         pad_mode: str="reflect") -> array:
    """Short-time Fourier transform (STFT).

    This function is aligned with librosa.stft.

    Parameters:
        x: mono input waveform (1d float ndarray).
        n_fft: FFT size; also the frame length.
        hop_length: hop between frames; defaults to win_length // 4.
        win_length: window length; defaults to n_fft.
        window: window name understood by scipy.signal.get_window.
        center: if True, pad ``x`` by n_fft // 2 on both sides so frames
            are centered on their timestamps.
        dtype: complex dtype of the output matrix.
        pad_mode: numpy pad mode used when ``center`` is True.

    Returns:
        Complex STFT matrix of shape (1 + n_fft // 2, n_frames).

    Raises:
        ParameterError: on invalid audio, or (when center=False) when
            the signal is shorter than n_fft.
    """
    _check_audio(x)
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size
    fft_window = pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Pad the time series so that frames are centered
    if center:
        if n_fft > x.shape[-1]:
            warnings.warn(
                f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
            )
        x = np.pad(x, int(n_fft // 2), mode=pad_mode)

    elif n_fft > x.shape[-1]:
        raise ParameterError(
            f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
        )

    # Window the time series.
    x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
    # Pre-allocate the STFT matrix
    stft_matrix = np.empty(
        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
    fft = np.fft  # use numpy fft as default
    # Constrain STFT block sizes to 256 KB
    MAX_MEM_BLOCK = 2**8 * 2**10
    # how many columns can we fit within MAX_MEM_BLOCK?
    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
    n_columns = max(n_columns, 1)

    # Process the frames block-by-block to bound peak memory use.
    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.rfft(
            fft_window * x_frames[:, bl_s:bl_t], axis=0)

    return stft_matrix
|
||||||
|
|
||||||
|
|
||||||
|
def power_to_db(spect: array,
                ref: float=1.0,
                amin: float=1e-10,
                top_db: Optional[float]=80.0) -> array:
    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.

    Computes the scaling ``10 * log10(spect / ref)`` in a numerically
    stable way. Aligned with librosa.

    Parameters:
        spect: input power spectrogram.
        ref: reference power, or a callable deriving it from the data.
        amin: floor applied before taking the log; must be > 0.
        top_db: if not None, clip the output to ``max - top_db``.
    """
    spect = np.asarray(spect)

    if amin <= 0:
        raise ParameterError("amin must be strictly positive")

    if np.issubdtype(spect.dtype, np.complexfloating):
        warnings.warn("power_to_db was called on complex input so phase "
                      "information will be discarded. To suppress this warning, "
                      "call power_to_db(np.abs(D)**2) instead.")
        magnitude = np.abs(spect)
    else:
        magnitude = spect

    # ``ref`` may be a callable that computes the reference from the data.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is not None:
        if top_db < 0:
            raise ParameterError("top_db must be non-negative")
        log_spec = np.maximum(log_spec, log_spec.max() - top_db)

    return log_spec
|
||||||
|
|
||||||
|
|
||||||
|
def mfcc(x,
         sr: int=16000,
         spect: Optional[array]=None,
         n_mfcc: int=20,
         dct_type: int=2,
         norm: str="ortho",
         lifter: int=0,
         **kwargs) -> array:
    """Mel-frequency cepstral coefficients (MFCCs).

    This function is NOT strictly aligned with librosa: the spectrogram
    is obtained from this module's ``melspectrogram`` (pass
    ``to_db=False`` in ``kwargs`` to match librosa's mel spectrogram),
    after which the DCT step agrees with ``librosa.feature.mfcc`` to
    within numerical precision.

    Parameters:
        x: input waveform; ignored when ``spect`` is given.
        sr: sample rate, forwarded to ``melspectrogram``.
        spect: optional precomputed (mel-)spectrogram.
        n_mfcc: number of cepstral coefficients to keep.
        dct_type / norm: DCT variant and normalization
            (see scipy.fftpack.dct).
        lifter: sinusoidal liftering strength; 0 disables liftering.
        **kwargs: forwarded to ``melspectrogram`` when ``spect`` is None.

    Raises:
        ParameterError: if ``lifter`` is negative.
    """
    if spect is None:
        spect = melspectrogram(x, sr=sr, **kwargs)

    coeffs = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]

    if lifter > 0:
        # Sinusoidal liftering re-weights the cepstral coefficients.
        lift = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=coeffs.dtype) /
                      lifter)
        return coeffs * lift[:, np.newaxis]
    if lifter == 0:
        return coeffs
    raise ParameterError(f"MFCC lifter={lifter} must be a non-negative number")
|
||||||
|
|
||||||
|
|
||||||
|
def melspectrogram(x: array,
                   sr: int=16000,
                   window_size: int=512,
                   hop_length: int=320,
                   n_mels: int=64,
                   fmin: int=50,
                   fmax: Optional[float]=None,
                   window: str='hann',
                   center: bool=True,
                   pad_mode: str='reflect',
                   power: float=2.0,
                   to_db: bool=True,
                   ref: float=1.0,
                   amin: float=1e-10,
                   top_db: Optional[float]=None) -> array:
    """Compute mel-spectrogram.

    Parameters:
        x: numpy.ndarray
            The input waveform is a numpy array [shape=(n,)]

        window_size: int, typically 512, 1024, 2048, etc.
            The window size for framing, also used as n_fft for stft

        hop_length, window, center, pad_mode, power: forwarded to stft /
            the magnitude computation; n_mels, fmin, fmax configure the
            mel filter bank; ref, amin, top_db are forwarded to
            power_to_db when to_db is True.

    Returns:
        The mel-spectrogram in power scale or db scale (default)

    Raises:
        ParameterError: on invalid/empty audio, or when fmin/fmax are
            inconsistent (the check enforces 0 <= fmin < fmax).

    Notes:
        1. sr defaults to 16000, which is commonly used in speech/speaker processing.
        2. when fmax is None, it is set to sr // 2.
        3. this function converts the mel spectrogram to db scale by
           default, which differs from librosa.
    """
    _check_audio(x, mono=True)
    if len(x) <= 0:
        raise ParameterError('The input waveform is empty')

    if fmax is None:
        fmax = sr // 2
    if fmin < 0 or fmin >= fmax:
        raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')

    s = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)

    spect_power = np.abs(s)**power
    fb_matrix = compute_fbank_matrix(
        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
    mel_spect = np.matmul(fb_matrix, spect_power)
    if to_db:
        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
    else:
        return mel_spect
|
||||||
|
|
||||||
|
|
||||||
|
def spectrogram(x: array,
                sr: int=16000,
                window_size: int=512,
                hop_length: int=320,
                window: str='hann',
                center: bool=True,
                pad_mode: str='reflect',
                power: float=2.0) -> array:
    """Compute a (power) spectrogram from an input waveform.

    Thin wrapper around ``stft`` that additionally takes the magnitude
    of the complex spectrogram, raised to ``power``.

    NOTE(review): ``sr`` is accepted for API symmetry with the other
    feature functions but is not used here.
    """
    complex_spect = stft(
        x,
        n_fft=window_size,
        hop_length=hop_length,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode)
    return np.abs(complex_spect)**power
|
||||||
|
|
||||||
|
|
||||||
|
def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
    """Mu-law encode a signal in range [-1, 1].

    When ``quantized`` is True, the result is quantized to integer
    values in [0, mu]. Otherwise, the companded signal stays in [-1, 1].

    Reference:
        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
    """
    # Bug fix: the previous implementation re-assigned ``mu = 255``,
    # silently ignoring the caller-supplied value.
    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
    if quantized:
        y = np.floor((y + 1) / 2 * mu + 0.5)  # map [-1, 1] -> [0, mu]
    return y
|
||||||
|
|
||||||
|
|
||||||
|
def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
    """Mu-law decoding, the inverse of ``mu_encode``.

    Assumes the input ``y`` is in range [0, mu] when ``quantized`` is
    True and in [-1, 1] otherwise.

    Reference:
        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm

    Raises:
        ParameterError: if ``mu`` < 1.
    """
    if mu < 1:
        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')

    # Bug fix: the previous implementation decremented ``mu`` before
    # decoding, which did not invert ``mu_encode`` (that uses ``mu``
    # as-is), introducing a systematic round-trip error.
    if quantized:  # undo the quantization: map [0, mu] back to [-1, 1]
        y = y * 2 / mu - 1
    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
    return x
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,18 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from .download import *
|
||||||
|
from .env import *
|
||||||
|
from .error import *
|
||||||
|
from .log import *
|
||||||
|
from .time import *
|
@ -0,0 +1,66 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import os
|
||||||
|
from typing import Dict
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from paddle.framework import load as load_state_dict
|
||||||
|
from paddle.utils import download
|
||||||
|
from pathos.multiprocessing import ProcessPool
|
||||||
|
|
||||||
|
from .log import logger
|
||||||
|
|
||||||
|
download.logger = logger
|
||||||
|
|
||||||
|
|
||||||
|
def decompress(file: str):
    """
    Extracts all files from a compressed file.

    ``file`` must be a path to an existing archive; the actual
    extraction is delegated to paddle's ``download._decompress`` helper.
    """
    assert os.path.isfile(file), "File: {} not exists.".format(file)
    download._decompress(file)
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_decompress(archives: List[Dict[str, str]],
                            path: str,
                            n_workers: int=0):
    """
    Download archives and decompress them into a specific path.

    Parameters:
        archives: list of dicts, each carrying an 'url' and an 'md5' key.
        path: destination directory; created when missing.
        n_workers: if > 0, download in parallel with this many processes;
            otherwise download sequentially.
    """
    if not os.path.isdir(path):
        os.makedirs(path)

    if n_workers <= 0:
        for archive in archives:
            # Bug fix: the message was not an f-string (the placeholder
            # was emitted literally) and referenced a misspelled
            # variable name ('archieve').
            assert 'url' in archive and 'md5' in archive, \
                f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'

            download.get_path_from_url(archive['url'], path, archive['md5'])
    else:
        pool = ProcessPool(nodes=n_workers)
        pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
                  [path] * len(archives), [_['md5'] for _ in archives])
        pool.close()
        pool.join()
|
||||||
|
|
||||||
|
|
||||||
|
def load_state_dict_from_url(url: str, path: str, md5: str=None):
    """
    Download a checkpoint from ``url`` into ``path`` (verifying ``md5``
    when given) and load it as a paddle state dict.
    """
    if not os.path.isdir(path):
        os.makedirs(path)

    download.get_path_from_url(url, path, md5)
    local_file = os.path.join(path, os.path.basename(url))
    return load_state_dict(local_file)
|
@ -0,0 +1,53 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
'''
|
||||||
|
This module is used to store environmental variables in PaddleAudio.
|
||||||
|
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
|
||||||
|
├ default value through the PPAUDIO_HOME environment variable.
|
||||||
|
├─ MODEL_HOME --> Store model files.
|
||||||
|
└─ DATA_HOME --> Store automatically downloaded datasets.
|
||||||
|
'''
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def _get_user_home():
|
||||||
|
return os.path.expanduser('~')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_ppaudio_home():
|
||||||
|
if 'PPAUDIO_HOME' in os.environ:
|
||||||
|
home_path = os.environ['PPAUDIO_HOME']
|
||||||
|
if os.path.exists(home_path):
|
||||||
|
if os.path.isdir(home_path):
|
||||||
|
return home_path
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
'The environment variable PPAUDIO_HOME {} is not a directory.'.
|
||||||
|
format(home_path))
|
||||||
|
else:
|
||||||
|
return home_path
|
||||||
|
return os.path.join(_get_user_home(), '.paddleaudio')
|
||||||
|
|
||||||
|
|
||||||
|
def _get_sub_home(directory):
    """Return (creating it if needed) a subdirectory of the PaddleAudio home.

    Args:
        directory(str): Name of the subdirectory (e.g. 'models').

    Returns:
        str: Absolute path of the subdirectory.
    """
    home = os.path.join(_get_ppaudio_home(), directory)
    # exist_ok avoids the check-then-create race of the previous
    # `if not os.path.exists(home): os.makedirs(home)` pattern.
    os.makedirs(home, exist_ok=True)
    return home
|
||||||
|
|
||||||
|
|
||||||
|
# Resolve commonly used locations once at import time so other modules
# can simply import these constants. Note the directories for models
# and datasets are created here as a side effect of _get_sub_home.
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
|
@ -0,0 +1,20 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
__all__ = ['ParameterError']


class ParameterError(Exception):
    """Exception raised when an argument fails parameter validation."""
    pass
|
@ -0,0 +1,136 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import contextlib
|
||||||
|
import functools
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
import colorlog
|
||||||
|
|
||||||
|
# Registry placeholder for created loggers (not populated in this module).
loggers = {}

# Level name -> numeric `logging` level and colorlog display color.
# TRAIN (21) and EVAL (22) are custom levels slotted between the standard
# INFO (20) and WARNING (30) levels.
log_config = {
    'DEBUG': {
        'level': 10,
        'color': 'purple'
    },
    'INFO': {
        'level': 20,
        'color': 'green'
    },
    'TRAIN': {
        'level': 21,
        'color': 'cyan'
    },
    'EVAL': {
        'level': 22,
        'color': 'blue'
    },
    'WARNING': {
        'level': 30,
        'color': 'yellow'
    },
    'ERROR': {
        'level': 40,
        'color': 'red'
    },
    'CRITICAL': {
        'level': 50,
        'color': 'bold_red'
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
class Logger(object):
    '''
    Default logger in PaddleAudio.

    One logging method is exposed per entry in `log_config`, in both
    upper- and lower-case spellings (e.g. `logger.INFO(...)` and
    `logger.info(...)`, including the custom TRAIN/EVAL levels).

    Args:
        name(str) : Logger name, default is 'PaddleAudio'
    '''

    def __init__(self, name: str=None):
        name = 'PaddleAudio' if not name else name
        self.logger = logging.getLogger(name)

        # Register every configured level with the logging module and
        # bind a partial of __call__ per level so `self.INFO(msg)` /
        # `self.info(msg)` both work.
        for key, conf in log_config.items():
            logging.addLevelName(conf['level'], key)
            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
            self.__dict__[key.lower()] = functools.partial(self.__call__,
                                                           conf['level'])

        # Colorized output; per-level colors come from `log_config`.
        self.format = colorlog.ColoredFormatter(
            '%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
            log_colors={key: conf['color']
                        for key, conf in log_config.items()})

        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)

        self.logger.addHandler(self.handler)
        self.logLevel = 'DEBUG'
        self.logger.setLevel(logging.DEBUG)
        # Avoid duplicate output through ancestor loggers.
        self.logger.propagate = False
        self._is_enable = True

    def disable(self):
        # Mute all output until `enable` is called.
        self._is_enable = False

    def enable(self):
        self._is_enable = True

    @property
    def is_enable(self) -> bool:
        # Whether messages are currently emitted.
        return self._is_enable

    def __call__(self, log_level: str, msg: str):
        # Drop the message silently when logging is disabled.
        if not self.is_enable:
            return

        self.logger.log(log_level, msg)

    @contextlib.contextmanager
    def use_terminator(self, terminator: str):
        # Temporarily replace the handler's line terminator (e.g. '\r'
        # for in-place progress updates), restoring it on exit.
        old_terminator = self.handler.terminator
        self.handler.terminator = terminator
        yield
        self.handler.terminator = old_terminator

    @contextlib.contextmanager
    def processing(self, msg: str, interval: float=0.1):
        '''
        Continuously print a progress bar with rotating special effects.

        Args:
            msg(str): Message to be printed.
            interval(float): Rotation interval. Default to 0.1.
        '''
        end = False

        def _printer():
            # Spinner loop running in a background thread; `end` is read
            # from the enclosing scope and flipped after the `with` body
            # finishes, which stops the loop. NOTE(review): the thread is
            # never joined — confirm this is intentional.
            index = 0
            flags = ['\\', '|', '/', '-']
            while not end:
                flag = flags[index % len(flags)]
                with self.use_terminator('\r'):
                    self.info('{}: {}'.format(msg, flag))
                time.sleep(interval)
                index += 1

        t = threading.Thread(target=_printer)
        t.start()
        yield
        end = True
|
||||||
|
|
||||||
|
|
||||||
|
logger = Logger()
|
@ -0,0 +1,67 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import math
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
class Timer(object):
    '''Calculate running speed and estimated time of arrival (ETA).

    Args:
        total_step(int): Total number of steps expected; used by `eta`
            to extrapolate the remaining time.
    '''

    def __init__(self, total_step: int):
        self.total_step = total_step
        self.last_start_step = 0  # step counter at the previous `timing` query
        self.current_step = 0
        self._is_running = True

    def start(self):
        '''Record the reference timestamps for speed/ETA computation.'''
        self.last_time = time.time()
        self.start_time = time.time()

    def stop(self):
        '''Freeze the timer; `eta` reports 00:00:00 afterwards.'''
        self._is_running = False
        self.end_time = time.time()

    def count(self) -> int:
        '''Advance one step (capped at total_step) and return the step.'''
        if not self.current_step >= self.total_step:
            self.current_step += 1
        return self.current_step

    @property
    def timing(self) -> float:
        '''Steps per second since the previous `timing` query.'''
        run_steps = self.current_step - self.last_start_step
        self.last_start_step = self.current_step
        time_used = time.time() - self.last_time
        self.last_time = time.time()
        return run_steps / time_used

    @property
    def is_running(self) -> bool:
        return self._is_running

    @property
    def eta(self) -> str:
        '''Estimated time remaining, formatted as hh:mm:ss.

        Returns '00:00:00' when the timer is stopped or before any step
        has been counted (previously this raised ZeroDivisionError on
        current_step == 0, or AttributeError before `start`).
        '''
        if not self.is_running or self.current_step == 0:
            return '00:00:00'
        # NOTE(review): this extrapolates *total* runtime
        # (elapsed * total/current), not elapsed-subtracted remaining time;
        # kept as-is to preserve the existing reporting behavior.
        scale = self.total_step / self.current_step
        remaining_time = (time.time() - self.start_time) * scale
        return seconds_to_hms(remaining_time)
|
||||||
|
|
||||||
|
|
||||||
|
def seconds_to_hms(seconds: int) -> str:
    '''Convert a (non-negative) duration in seconds to an hh:mm:ss string.'''
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return '{:0>2}:{:0>2}:{:0>2}'.format(hours, minutes, secs)
|
@ -0,0 +1,4 @@
|
|||||||
|
numpy >= 1.15.0
|
||||||
|
resampy >= 0.2.2
|
||||||
|
scipy >= 1.0.0
|
||||||
|
soundfile >= 0.9.0
|
@ -0,0 +1,43 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import setuptools
|
||||||
|
|
||||||
|
# set the version here
version = '0.1.0a'

# Use the project README as the long description shown on PyPI.
with open("README.md", "r") as fh:
    long_description = fh.read()
setuptools.setup(
    name="paddleaudio",
    version=version,
    author="",
    author_email="",
    description="PaddleAudio, in development",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="",
    # Ship only the library packages; skip build artifacts, tests, examples.
    packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
    classifiers=[
        "Programming Language :: Python :: 3",
        # NOTE(review): source file headers carry the Apache-2.0 license but
        # this classifier declares MIT — confirm which license applies.
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    install_requires=[
        'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
        'soundfile >= 0.9.0'
    ],
    extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
                    }  # for dev only, install: pip install -e .[dev]
    )
|
@ -0,0 +1,41 @@
|
|||||||
|
# PaddleAudio Testing Guide
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Testing
|
||||||
|
First clone a version of the project by
|
||||||
|
```
|
||||||
|
git clone https://github.com/PaddlePaddle/models.git
|
||||||
|
|
||||||
|
```
|
||||||
|
Then install the project in your virtual environment.
|
||||||
|
```
|
||||||
|
cd models/PaddleAudio
|
||||||
|
python setup.py bdist_wheel
|
||||||
|
pip install -e .[dev]
|
||||||
|
```
|
||||||
|
The requirements for testing will be installed along with PaddleAudio.
|
||||||
|
|
||||||
|
Now run
|
||||||
|
```
|
||||||
|
pytest test
|
||||||
|
```
|
||||||
|
|
||||||
|
If it goes well, you will see outputs like these:
|
||||||
|
```
|
||||||
|
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
|
||||||
|
rootdir: ./models/PaddleAudio
|
||||||
|
plugins: hydra-core-1.0.6
|
||||||
|
collected 16 items
|
||||||
|
|
||||||
|
test/unit_test/test_backend.py ........... [ 68%]
|
||||||
|
test/unit_test/test_features.py ..... [100%]
|
||||||
|
|
||||||
|
==================================================== warnings summary ====================================================
|
||||||
|
.
|
||||||
|
.
|
||||||
|
.
|
||||||
|
-- Docs: https://docs.pytest.org/en/stable/warnings.html
|
||||||
|
============================================ 16 passed, 11 warnings in 6.76s =============================================
|
||||||
|
```
|
@ -0,0 +1,114 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import paddleaudio
|
||||||
|
|
||||||
|
TEST_FILE = './test/data/test_audio.wav'
|
||||||
|
|
||||||
|
|
||||||
|
def relative_err(a, b, real=True):
    """Compute the relative squared error between two arrays.

    Real inputs are compared directly; complex inputs are compared on
    their real and imaginary parts separately and the errors summed.
    Uses the module-level EPS to keep the denominator non-zero.
    """
    if real:
        num = np.sum((a - b)**2)
        den = EPS + np.sum(a**2) + np.sum(b**2)
        return num / den
    err_real = np.sum((a.real - b.real)**2) / (
        EPS + np.sum(a.real**2) + np.sum(b.real**2))
    err_imag = np.sum((a.imag - b.imag)**2) / (
        EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
    return err_real + err_imag
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    """Load the shared fixture waveform at 16 kHz with librosa."""
    x, r = librosa.load(TEST_FILE, sr=16000)
    # Log basic statistics so reference values are visible in test output.
    print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
    return x, r
|
||||||
|
|
||||||
|
|
||||||
|
# start testing
# Module-level fixture: every test below reuses this waveform and rate.
x, r = load_audio()
EPS = 1e-8  # tolerance for relative-error comparisons
|
||||||
|
|
||||||
|
|
||||||
|
def test_load():
    """paddleaudio.load honors the requested rate, dtype, and duration."""
    samples, rate = paddleaudio.load(TEST_FILE, sr=16000)
    assert rate == 16000
    assert samples.dtype == 'float32'

    # Slicing: a 2-second window starting at 1s, decoded as int16.
    samples, rate = paddleaudio.load(
        TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
    assert len(samples) / rate == 2.0
    assert rate == 16000
    assert samples.dtype == 'int16'
|
||||||
|
|
||||||
|
|
||||||
|
def test_depth_convert():
    """depth_convert preserves length and clamps to the target dtype range."""
    for dtype, lo, hi in [('int16', -32768, 32767), ('int8', -128, 127)]:
        y = paddleaudio.depth_convert(x, dtype)
        assert len(y) == len(x)
        assert y.dtype == dtype
        assert np.max(y) <= hi
        assert np.min(y) >= lo
        # A non-degenerate signal should survive the conversion.
        assert np.std(y) > EPS
|
||||||
|
|
||||||
|
|
||||||
|
# test case for resample
# Each entry is (target_sample_rate, mode); the source fixture is 16 kHz,
# so these cover down-, up-, and identity resampling. The mode strings
# presumably name resampy filters — confirm against paddleaudio.resample.
rs_test_data = [
    (32000, 'kaiser_fast'),
    (16000, 'kaiser_fast'),
    (8000, 'kaiser_fast'),
    (32000, 'kaiser_best'),
    (16000, 'kaiser_best'),
    (8000, 'kaiser_best'),
    (22050, 'kaiser_best'),
    (44100, 'kaiser_best'),
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
    """Resampled length must scale with the rate ratio."""
    resampled = paddleaudio.resample(x, 16000, sr, mode=mode)
    expected_len = len(x) * (sr / 16000)
    err = relative_err(len(resampled), expected_len)
    print('err:', err)
    assert err < EPS
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize():
    """normalize scales the peak (linear) or the std (gaussian) as requested."""
    out = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
    assert np.max(out) < 0.5 + EPS

    out = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
    assert np.max(out) <= 2.0 + EPS

    out = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
    print('np.std(y):', np.std(out))
    assert np.abs(np.std(out) - 1.0) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly, outside of pytest.
if __name__ == '__main__':
    test_load()
    test_depth_convert()
    test_resample(22050, 'kaiser_fast')
    test_normalize()
|
@ -0,0 +1,144 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import paddleaudio as pa
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    """Load the fixture waveform with librosa's default sample rate."""
    waveform, sample_rate = librosa.load('./test/data/test_audio.wav')
    return waveform, sample_rate
|
||||||
|
|
||||||
|
|
||||||
|
## start testing
# Module-level fixture shared by every feature test below.
x, r = load_audio()
EPS = 1e-8  # tolerance for relative-error comparisons
|
||||||
|
|
||||||
|
|
||||||
|
def relative_err(a, b, real=True):
    """Relative squared error between two arrays.

    Complex inputs are compared per real/imaginary part and the two
    errors summed; EPS keeps the denominators non-zero.
    """
    if not real:
        total = 0.0
        for u, v in ((a.real, b.real), (a.imag, b.imag)):
            total += np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))
        return total
    return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
    """PaddleAudio melspectrogram matches librosa's reference output."""
    ours = pa.melspectrogram(
        x, window_size=512, sr=16000, hop_length=320, n_mels=64, fmin=50,
        to_db=False)
    reference = librosa.feature.melspectrogram(
        x, sr=16000, n_fft=512, win_length=512, hop_length=320, n_mels=64,
        fmin=50)
    assert relative_err(ours, reference) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
    """melspectrogram(to_db=True) matches librosa mel + pa.power_to_db."""
    ours = pa.melspectrogram(
        x, window_size=512, sr=16000, hop_length=320, n_mels=64, fmin=50,
        to_db=True, ref=1.0, amin=1e-10, top_db=None)
    reference = librosa.feature.melspectrogram(
        x, sr=16000, n_fft=512, win_length=512, hop_length=320, n_mels=64,
        fmin=50)
    reference = pa.power_to_db(reference, ref=1.0, amin=1e-10, top_db=None)
    assert relative_err(ours, reference) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
    """Complex STFT output agrees with librosa in shape and values."""
    stft_kwargs = dict(n_fft=1024, hop_length=320, win_length=512)
    ours = pa.stft(x, **stft_kwargs)
    reference = librosa.stft(x, **stft_kwargs)
    assert ours.shape == reference.shape
    assert relative_err(ours, reference, real=False) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
    """split_frames matches librosa.util.frame on the fixture signal."""
    reference = librosa.util.frame(x, frame_length=512, hop_length=320)
    ours = pa.split_frames(x, frame_length=512, hop_length=320)
    assert relative_err(reference, ours) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
    """PaddleAudio MFCC matches librosa MFCC built from a mel spectrogram."""
    mel_kwargs = {
        'window_size': 512,
        'hop_length': 320,
        'n_mels': 64,
        'fmin': 50,
        'to_db': False
    }
    ours = pa.mfcc(
        x,
        spect=None,
        n_mfcc=20,
        dct_type=2,
        norm='ortho',
        lifter=0,
        **mel_kwargs)
    mel_ref = librosa.feature.melspectrogram(
        x, sr=16000, n_fft=512, win_length=512, hop_length=320, n_mels=64,
        fmin=50)
    reference = librosa.feature.mfcc(
        x, sr=16000, S=mel_ref, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
    assert relative_err(ours, reference) < EPS
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this test module directly, outside of pytest.
if __name__ == '__main__':
    test_melspectrogram()
    test_melspectrogram_db()
    test_stft()
    test_split_frames()
    test_mfcc()
|
Loading…
Reference in new issue