commit
dcc2390323
@ -1,7 +0,0 @@
|
|||||||
.ipynb_checkpoints/**
|
|
||||||
*.ipynb
|
|
||||||
nohup.out
|
|
||||||
__pycache__/
|
|
||||||
*.wav
|
|
||||||
*.m4a
|
|
||||||
obsolete/**
|
|
@ -1,45 +0,0 @@
|
|||||||
repos:
|
|
||||||
- repo: local
|
|
||||||
hooks:
|
|
||||||
- id: yapf
|
|
||||||
name: yapf
|
|
||||||
entry: yapf
|
|
||||||
language: system
|
|
||||||
args: [-i, --style .style.yapf]
|
|
||||||
files: \.py$
|
|
||||||
|
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
||||||
rev: a11d9314b22d8f8c7556443875b731ef05965464
|
|
||||||
hooks:
|
|
||||||
- id: check-merge-conflict
|
|
||||||
- id: check-symlinks
|
|
||||||
- id: end-of-file-fixer
|
|
||||||
- id: trailing-whitespace
|
|
||||||
- id: detect-private-key
|
|
||||||
- id: check-symlinks
|
|
||||||
- id: check-added-large-files
|
|
||||||
|
|
||||||
- repo: https://github.com/pycqa/isort
|
|
||||||
rev: 5.8.0
|
|
||||||
hooks:
|
|
||||||
- id: isort
|
|
||||||
name: isort (python)
|
|
||||||
- id: isort
|
|
||||||
name: isort (cython)
|
|
||||||
types: [cython]
|
|
||||||
- id: isort
|
|
||||||
name: isort (pyi)
|
|
||||||
types: [pyi]
|
|
||||||
|
|
||||||
- repo: local
|
|
||||||
hooks:
|
|
||||||
- id: flake8
|
|
||||||
name: flake8
|
|
||||||
entry: flake8
|
|
||||||
language: system
|
|
||||||
args:
|
|
||||||
- --count
|
|
||||||
- --select=E9,F63,F7,F82
|
|
||||||
- --show-source
|
|
||||||
- --statistics
|
|
||||||
files: \.py$
|
|
@ -1,3 +0,0 @@
|
|||||||
[style]
|
|
||||||
based_on_style = pep8
|
|
||||||
column_limit = 80
|
|
@ -1,201 +0,0 @@
|
|||||||
Apache License
|
|
||||||
Version 2.0, January 2004
|
|
||||||
http://www.apache.org/licenses/
|
|
||||||
|
|
||||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
||||||
|
|
||||||
1. Definitions.
|
|
||||||
|
|
||||||
"License" shall mean the terms and conditions for use, reproduction,
|
|
||||||
and distribution as defined by Sections 1 through 9 of this document.
|
|
||||||
|
|
||||||
"Licensor" shall mean the copyright owner or entity authorized by
|
|
||||||
the copyright owner that is granting the License.
|
|
||||||
|
|
||||||
"Legal Entity" shall mean the union of the acting entity and all
|
|
||||||
other entities that control, are controlled by, or are under common
|
|
||||||
control with that entity. For the purposes of this definition,
|
|
||||||
"control" means (i) the power, direct or indirect, to cause the
|
|
||||||
direction or management of such entity, whether by contract or
|
|
||||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
|
||||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
|
||||||
|
|
||||||
"You" (or "Your") shall mean an individual or Legal Entity
|
|
||||||
exercising permissions granted by this License.
|
|
||||||
|
|
||||||
"Source" form shall mean the preferred form for making modifications,
|
|
||||||
including but not limited to software source code, documentation
|
|
||||||
source, and configuration files.
|
|
||||||
|
|
||||||
"Object" form shall mean any form resulting from mechanical
|
|
||||||
transformation or translation of a Source form, including but
|
|
||||||
not limited to compiled object code, generated documentation,
|
|
||||||
and conversions to other media types.
|
|
||||||
|
|
||||||
"Work" shall mean the work of authorship, whether in Source or
|
|
||||||
Object form, made available under the License, as indicated by a
|
|
||||||
copyright notice that is included in or attached to the work
|
|
||||||
(an example is provided in the Appendix below).
|
|
||||||
|
|
||||||
"Derivative Works" shall mean any work, whether in Source or Object
|
|
||||||
form, that is based on (or derived from) the Work and for which the
|
|
||||||
editorial revisions, annotations, elaborations, or other modifications
|
|
||||||
represent, as a whole, an original work of authorship. For the purposes
|
|
||||||
of this License, Derivative Works shall not include works that remain
|
|
||||||
separable from, or merely link (or bind by name) to the interfaces of,
|
|
||||||
the Work and Derivative Works thereof.
|
|
||||||
|
|
||||||
"Contribution" shall mean any work of authorship, including
|
|
||||||
the original version of the Work and any modifications or additions
|
|
||||||
to that Work or Derivative Works thereof, that is intentionally
|
|
||||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
|
||||||
or by an individual or Legal Entity authorized to submit on behalf of
|
|
||||||
the copyright owner. For the purposes of this definition, "submitted"
|
|
||||||
means any form of electronic, verbal, or written communication sent
|
|
||||||
to the Licensor or its representatives, including but not limited to
|
|
||||||
communication on electronic mailing lists, source code control systems,
|
|
||||||
and issue tracking systems that are managed by, or on behalf of, the
|
|
||||||
Licensor for the purpose of discussing and improving the Work, but
|
|
||||||
excluding communication that is conspicuously marked or otherwise
|
|
||||||
designated in writing by the copyright owner as "Not a Contribution."
|
|
||||||
|
|
||||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
|
||||||
on behalf of whom a Contribution has been received by Licensor and
|
|
||||||
subsequently incorporated within the Work.
|
|
||||||
|
|
||||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
|
||||||
this License, each Contributor hereby grants to You a perpetual,
|
|
||||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
||||||
copyright license to reproduce, prepare Derivative Works of,
|
|
||||||
publicly display, publicly perform, sublicense, and distribute the
|
|
||||||
Work and such Derivative Works in Source or Object form.
|
|
||||||
|
|
||||||
3. Grant of Patent License. Subject to the terms and conditions of
|
|
||||||
this License, each Contributor hereby grants to You a perpetual,
|
|
||||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
|
||||||
(except as stated in this section) patent license to make, have made,
|
|
||||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
|
||||||
where such license applies only to those patent claims licensable
|
|
||||||
by such Contributor that are necessarily infringed by their
|
|
||||||
Contribution(s) alone or by combination of their Contribution(s)
|
|
||||||
with the Work to which such Contribution(s) was submitted. If You
|
|
||||||
institute patent litigation against any entity (including a
|
|
||||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
|
||||||
or a Contribution incorporated within the Work constitutes direct
|
|
||||||
or contributory patent infringement, then any patent licenses
|
|
||||||
granted to You under this License for that Work shall terminate
|
|
||||||
as of the date such litigation is filed.
|
|
||||||
|
|
||||||
4. Redistribution. You may reproduce and distribute copies of the
|
|
||||||
Work or Derivative Works thereof in any medium, with or without
|
|
||||||
modifications, and in Source or Object form, provided that You
|
|
||||||
meet the following conditions:
|
|
||||||
|
|
||||||
(a) You must give any other recipients of the Work or
|
|
||||||
Derivative Works a copy of this License; and
|
|
||||||
|
|
||||||
(b) You must cause any modified files to carry prominent notices
|
|
||||||
stating that You changed the files; and
|
|
||||||
|
|
||||||
(c) You must retain, in the Source form of any Derivative Works
|
|
||||||
that You distribute, all copyright, patent, trademark, and
|
|
||||||
attribution notices from the Source form of the Work,
|
|
||||||
excluding those notices that do not pertain to any part of
|
|
||||||
the Derivative Works; and
|
|
||||||
|
|
||||||
(d) If the Work includes a "NOTICE" text file as part of its
|
|
||||||
distribution, then any Derivative Works that You distribute must
|
|
||||||
include a readable copy of the attribution notices contained
|
|
||||||
within such NOTICE file, excluding those notices that do not
|
|
||||||
pertain to any part of the Derivative Works, in at least one
|
|
||||||
of the following places: within a NOTICE text file distributed
|
|
||||||
as part of the Derivative Works; within the Source form or
|
|
||||||
documentation, if provided along with the Derivative Works; or,
|
|
||||||
within a display generated by the Derivative Works, if and
|
|
||||||
wherever such third-party notices normally appear. The contents
|
|
||||||
of the NOTICE file are for informational purposes only and
|
|
||||||
do not modify the License. You may add Your own attribution
|
|
||||||
notices within Derivative Works that You distribute, alongside
|
|
||||||
or as an addendum to the NOTICE text from the Work, provided
|
|
||||||
that such additional attribution notices cannot be construed
|
|
||||||
as modifying the License.
|
|
||||||
|
|
||||||
You may add Your own copyright statement to Your modifications and
|
|
||||||
may provide additional or different license terms and conditions
|
|
||||||
for use, reproduction, or distribution of Your modifications, or
|
|
||||||
for any such Derivative Works as a whole, provided Your use,
|
|
||||||
reproduction, and distribution of the Work otherwise complies with
|
|
||||||
the conditions stated in this License.
|
|
||||||
|
|
||||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
|
||||||
any Contribution intentionally submitted for inclusion in the Work
|
|
||||||
by You to the Licensor shall be under the terms and conditions of
|
|
||||||
this License, without any additional terms or conditions.
|
|
||||||
Notwithstanding the above, nothing herein shall supersede or modify
|
|
||||||
the terms of any separate license agreement you may have executed
|
|
||||||
with Licensor regarding such Contributions.
|
|
||||||
|
|
||||||
6. Trademarks. This License does not grant permission to use the trade
|
|
||||||
names, trademarks, service marks, or product names of the Licensor,
|
|
||||||
except as required for reasonable and customary use in describing the
|
|
||||||
origin of the Work and reproducing the content of the NOTICE file.
|
|
||||||
|
|
||||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
|
||||||
agreed to in writing, Licensor provides the Work (and each
|
|
||||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
||||||
implied, including, without limitation, any warranties or conditions
|
|
||||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
|
||||||
appropriateness of using or redistributing the Work and assume any
|
|
||||||
risks associated with Your exercise of permissions under this License.
|
|
||||||
|
|
||||||
8. Limitation of Liability. In no event and under no legal theory,
|
|
||||||
whether in tort (including negligence), contract, or otherwise,
|
|
||||||
unless required by applicable law (such as deliberate and grossly
|
|
||||||
negligent acts) or agreed to in writing, shall any Contributor be
|
|
||||||
liable to You for damages, including any direct, indirect, special,
|
|
||||||
incidental, or consequential damages of any character arising as a
|
|
||||||
result of this License or out of the use or inability to use the
|
|
||||||
Work (including but not limited to damages for loss of goodwill,
|
|
||||||
work stoppage, computer failure or malfunction, or any and all
|
|
||||||
other commercial damages or losses), even if such Contributor
|
|
||||||
has been advised of the possibility of such damages.
|
|
||||||
|
|
||||||
9. Accepting Warranty or Additional Liability. While redistributing
|
|
||||||
the Work or Derivative Works thereof, You may choose to offer,
|
|
||||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
|
||||||
or other liability obligations and/or rights consistent with this
|
|
||||||
License. However, in accepting such obligations, You may act only
|
|
||||||
on Your own behalf and on Your sole responsibility, not on behalf
|
|
||||||
of any other Contributor, and only if You agree to indemnify,
|
|
||||||
defend, and hold each Contributor harmless for any liability
|
|
||||||
incurred by, or claims asserted against, such Contributor by reason
|
|
||||||
of your accepting any such warranty or additional liability.
|
|
||||||
|
|
||||||
END OF TERMS AND CONDITIONS
|
|
||||||
|
|
||||||
APPENDIX: How to apply the Apache License to your work.
|
|
||||||
|
|
||||||
To apply the Apache License to your work, attach the following
|
|
||||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
|
||||||
replaced with your own identifying information. (Don't include
|
|
||||||
the brackets!) The text should be enclosed in the appropriate
|
|
||||||
comment syntax for the file format. We also recommend that a
|
|
||||||
file or class name and description of purpose be included on the
|
|
||||||
same "printed page" as the copyright notice for easier
|
|
||||||
identification within third-party archives.
|
|
||||||
|
|
||||||
Copyright [yyyy] [name of copyright owner]
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this file except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
@ -1,37 +0,0 @@
|
|||||||
# PaddleAudio: The audio library for PaddlePaddle
|
|
||||||
|
|
||||||
## Introduction
|
|
||||||
PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomalous sound detection. More models and features are on the roadmap.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## Features
|
|
||||||
- Spectrogram and related features are compatible with librosa.
|
|
||||||
- State-of-the-art models in sound tagging on Audioset, sound classification on esc50, and more to come.
|
|
||||||
- Ready-to-use audio embeddings with a single line of code, including sound embeddings, with more on the roadmap.
|
|
||||||
- Data loading support for common open-source audio datasets in multiple languages, including English, Mandarin, and more.
|
|
||||||
|
|
||||||
|
|
||||||
## Install
|
|
||||||
```
|
|
||||||
git clone https://github.com/PaddlePaddle/models
|
|
||||||
cd models/PaddleAudio
|
|
||||||
pip install .
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
## Quick start
|
|
||||||
### Audio loading and feature extraction
|
|
||||||
```
|
|
||||||
import paddleaudio as pa
|
|
||||||
s,r = pa.load(f)
|
|
||||||
mel_spect = pa.melspectrogram(s,sr=r)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Examples
|
|
||||||
We provide a set of examples to help you get started in using PaddleAudio quickly.
|
|
||||||
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
|
|
||||||
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
|
|
||||||
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
|
|
||||||
|
|
||||||
Please refer to [example directory](./examples) for more details.
|
|
@ -1,527 +0,0 @@
|
|||||||
Speech
|
|
||||||
Male speech, man speaking
|
|
||||||
Female speech, woman speaking
|
|
||||||
Child speech, kid speaking
|
|
||||||
Conversation
|
|
||||||
Narration, monologue
|
|
||||||
Babbling
|
|
||||||
Speech synthesizer
|
|
||||||
Shout
|
|
||||||
Bellow
|
|
||||||
Whoop
|
|
||||||
Yell
|
|
||||||
Battle cry
|
|
||||||
Children shouting
|
|
||||||
Screaming
|
|
||||||
Whispering
|
|
||||||
Laughter
|
|
||||||
Baby laughter
|
|
||||||
Giggle
|
|
||||||
Snicker
|
|
||||||
Belly laugh
|
|
||||||
Chuckle, chortle
|
|
||||||
Crying, sobbing
|
|
||||||
Baby cry, infant cry
|
|
||||||
Whimper
|
|
||||||
Wail, moan
|
|
||||||
Sigh
|
|
||||||
Singing
|
|
||||||
Choir
|
|
||||||
Yodeling
|
|
||||||
Chant
|
|
||||||
Mantra
|
|
||||||
Male singing
|
|
||||||
Female singing
|
|
||||||
Child singing
|
|
||||||
Synthetic singing
|
|
||||||
Rapping
|
|
||||||
Humming
|
|
||||||
Groan
|
|
||||||
Grunt
|
|
||||||
Whistling
|
|
||||||
Breathing
|
|
||||||
Wheeze
|
|
||||||
Snoring
|
|
||||||
Gasp
|
|
||||||
Pant
|
|
||||||
Snort
|
|
||||||
Cough
|
|
||||||
Throat clearing
|
|
||||||
Sneeze
|
|
||||||
Sniff
|
|
||||||
Run
|
|
||||||
Shuffle
|
|
||||||
Walk, footsteps
|
|
||||||
Chewing, mastication
|
|
||||||
Biting
|
|
||||||
Gargling
|
|
||||||
Stomach rumble
|
|
||||||
Burping, eructation
|
|
||||||
Hiccup
|
|
||||||
Fart
|
|
||||||
Hands
|
|
||||||
Finger snapping
|
|
||||||
Clapping
|
|
||||||
Heart sounds, heartbeat
|
|
||||||
Heart murmur
|
|
||||||
Cheering
|
|
||||||
Applause
|
|
||||||
Chatter
|
|
||||||
Crowd
|
|
||||||
Hubbub, speech noise, speech babble
|
|
||||||
Children playing
|
|
||||||
Animal
|
|
||||||
Domestic animals, pets
|
|
||||||
Dog
|
|
||||||
Bark
|
|
||||||
Yip
|
|
||||||
Howl
|
|
||||||
Bow-wow
|
|
||||||
Growling
|
|
||||||
Whimper (dog)
|
|
||||||
Cat
|
|
||||||
Purr
|
|
||||||
Meow
|
|
||||||
Hiss
|
|
||||||
Caterwaul
|
|
||||||
Livestock, farm animals, working animals
|
|
||||||
Horse
|
|
||||||
Clip-clop
|
|
||||||
Neigh, whinny
|
|
||||||
Cattle, bovinae
|
|
||||||
Moo
|
|
||||||
Cowbell
|
|
||||||
Pig
|
|
||||||
Oink
|
|
||||||
Goat
|
|
||||||
Bleat
|
|
||||||
Sheep
|
|
||||||
Fowl
|
|
||||||
Chicken, rooster
|
|
||||||
Cluck
|
|
||||||
Crowing, cock-a-doodle-doo
|
|
||||||
Turkey
|
|
||||||
Gobble
|
|
||||||
Duck
|
|
||||||
Quack
|
|
||||||
Goose
|
|
||||||
Honk
|
|
||||||
Wild animals
|
|
||||||
Roaring cats (lions, tigers)
|
|
||||||
Roar
|
|
||||||
Bird
|
|
||||||
Bird vocalization, bird call, bird song
|
|
||||||
Chirp, tweet
|
|
||||||
Squawk
|
|
||||||
Pigeon, dove
|
|
||||||
Coo
|
|
||||||
Crow
|
|
||||||
Caw
|
|
||||||
Owl
|
|
||||||
Hoot
|
|
||||||
Bird flight, flapping wings
|
|
||||||
Canidae, dogs, wolves
|
|
||||||
Rodents, rats, mice
|
|
||||||
Mouse
|
|
||||||
Patter
|
|
||||||
Insect
|
|
||||||
Cricket
|
|
||||||
Mosquito
|
|
||||||
Fly, housefly
|
|
||||||
Buzz
|
|
||||||
Bee, wasp, etc.
|
|
||||||
Frog
|
|
||||||
Croak
|
|
||||||
Snake
|
|
||||||
Rattle
|
|
||||||
Whale vocalization
|
|
||||||
Music
|
|
||||||
Musical instrument
|
|
||||||
Plucked string instrument
|
|
||||||
Guitar
|
|
||||||
Electric guitar
|
|
||||||
Bass guitar
|
|
||||||
Acoustic guitar
|
|
||||||
Steel guitar, slide guitar
|
|
||||||
Tapping (guitar technique)
|
|
||||||
Strum
|
|
||||||
Banjo
|
|
||||||
Sitar
|
|
||||||
Mandolin
|
|
||||||
Zither
|
|
||||||
Ukulele
|
|
||||||
Keyboard (musical)
|
|
||||||
Piano
|
|
||||||
Electric piano
|
|
||||||
Organ
|
|
||||||
Electronic organ
|
|
||||||
Hammond organ
|
|
||||||
Synthesizer
|
|
||||||
Sampler
|
|
||||||
Harpsichord
|
|
||||||
Percussion
|
|
||||||
Drum kit
|
|
||||||
Drum machine
|
|
||||||
Drum
|
|
||||||
Snare drum
|
|
||||||
Rimshot
|
|
||||||
Drum roll
|
|
||||||
Bass drum
|
|
||||||
Timpani
|
|
||||||
Tabla
|
|
||||||
Cymbal
|
|
||||||
Hi-hat
|
|
||||||
Wood block
|
|
||||||
Tambourine
|
|
||||||
Rattle (instrument)
|
|
||||||
Maraca
|
|
||||||
Gong
|
|
||||||
Tubular bells
|
|
||||||
Mallet percussion
|
|
||||||
Marimba, xylophone
|
|
||||||
Glockenspiel
|
|
||||||
Vibraphone
|
|
||||||
Steelpan
|
|
||||||
Orchestra
|
|
||||||
Brass instrument
|
|
||||||
French horn
|
|
||||||
Trumpet
|
|
||||||
Trombone
|
|
||||||
Bowed string instrument
|
|
||||||
String section
|
|
||||||
Violin, fiddle
|
|
||||||
Pizzicato
|
|
||||||
Cello
|
|
||||||
Double bass
|
|
||||||
Wind instrument, woodwind instrument
|
|
||||||
Flute
|
|
||||||
Saxophone
|
|
||||||
Clarinet
|
|
||||||
Harp
|
|
||||||
Bell
|
|
||||||
Church bell
|
|
||||||
Jingle bell
|
|
||||||
Bicycle bell
|
|
||||||
Tuning fork
|
|
||||||
Chime
|
|
||||||
Wind chime
|
|
||||||
Change ringing (campanology)
|
|
||||||
Harmonica
|
|
||||||
Accordion
|
|
||||||
Bagpipes
|
|
||||||
Didgeridoo
|
|
||||||
Shofar
|
|
||||||
Theremin
|
|
||||||
Singing bowl
|
|
||||||
Scratching (performance technique)
|
|
||||||
Pop music
|
|
||||||
Hip hop music
|
|
||||||
Beatboxing
|
|
||||||
Rock music
|
|
||||||
Heavy metal
|
|
||||||
Punk rock
|
|
||||||
Grunge
|
|
||||||
Progressive rock
|
|
||||||
Rock and roll
|
|
||||||
Psychedelic rock
|
|
||||||
Rhythm and blues
|
|
||||||
Soul music
|
|
||||||
Reggae
|
|
||||||
Country
|
|
||||||
Swing music
|
|
||||||
Bluegrass
|
|
||||||
Funk
|
|
||||||
Folk music
|
|
||||||
Middle Eastern music
|
|
||||||
Jazz
|
|
||||||
Disco
|
|
||||||
Classical music
|
|
||||||
Opera
|
|
||||||
Electronic music
|
|
||||||
House music
|
|
||||||
Techno
|
|
||||||
Dubstep
|
|
||||||
Drum and bass
|
|
||||||
Electronica
|
|
||||||
Electronic dance music
|
|
||||||
Ambient music
|
|
||||||
Trance music
|
|
||||||
Music of Latin America
|
|
||||||
Salsa music
|
|
||||||
Flamenco
|
|
||||||
Blues
|
|
||||||
Music for children
|
|
||||||
New-age music
|
|
||||||
Vocal music
|
|
||||||
A capella
|
|
||||||
Music of Africa
|
|
||||||
Afrobeat
|
|
||||||
Christian music
|
|
||||||
Gospel music
|
|
||||||
Music of Asia
|
|
||||||
Carnatic music
|
|
||||||
Music of Bollywood
|
|
||||||
Ska
|
|
||||||
Traditional music
|
|
||||||
Independent music
|
|
||||||
Song
|
|
||||||
Background music
|
|
||||||
Theme music
|
|
||||||
Jingle (music)
|
|
||||||
Soundtrack music
|
|
||||||
Lullaby
|
|
||||||
Video game music
|
|
||||||
Christmas music
|
|
||||||
Dance music
|
|
||||||
Wedding music
|
|
||||||
Happy music
|
|
||||||
Funny music
|
|
||||||
Sad music
|
|
||||||
Tender music
|
|
||||||
Exciting music
|
|
||||||
Angry music
|
|
||||||
Scary music
|
|
||||||
Wind
|
|
||||||
Rustling leaves
|
|
||||||
Wind noise (microphone)
|
|
||||||
Thunderstorm
|
|
||||||
Thunder
|
|
||||||
Water
|
|
||||||
Rain
|
|
||||||
Raindrop
|
|
||||||
Rain on surface
|
|
||||||
Stream
|
|
||||||
Waterfall
|
|
||||||
Ocean
|
|
||||||
Waves, surf
|
|
||||||
Steam
|
|
||||||
Gurgling
|
|
||||||
Fire
|
|
||||||
Crackle
|
|
||||||
Vehicle
|
|
||||||
Boat, Water vehicle
|
|
||||||
Sailboat, sailing ship
|
|
||||||
Rowboat, canoe, kayak
|
|
||||||
Motorboat, speedboat
|
|
||||||
Ship
|
|
||||||
Motor vehicle (road)
|
|
||||||
Car
|
|
||||||
Vehicle horn, car horn, honking
|
|
||||||
Toot
|
|
||||||
Car alarm
|
|
||||||
Power windows, electric windows
|
|
||||||
Skidding
|
|
||||||
Tire squeal
|
|
||||||
Car passing by
|
|
||||||
Race car, auto racing
|
|
||||||
Truck
|
|
||||||
Air brake
|
|
||||||
Air horn, truck horn
|
|
||||||
Reversing beeps
|
|
||||||
Ice cream truck, ice cream van
|
|
||||||
Bus
|
|
||||||
Emergency vehicle
|
|
||||||
Police car (siren)
|
|
||||||
Ambulance (siren)
|
|
||||||
Fire engine, fire truck (siren)
|
|
||||||
Motorcycle
|
|
||||||
Traffic noise, roadway noise
|
|
||||||
Rail transport
|
|
||||||
Train
|
|
||||||
Train whistle
|
|
||||||
Train horn
|
|
||||||
Railroad car, train wagon
|
|
||||||
Train wheels squealing
|
|
||||||
Subway, metro, underground
|
|
||||||
Aircraft
|
|
||||||
Aircraft engine
|
|
||||||
Jet engine
|
|
||||||
Propeller, airscrew
|
|
||||||
Helicopter
|
|
||||||
Fixed-wing aircraft, airplane
|
|
||||||
Bicycle
|
|
||||||
Skateboard
|
|
||||||
Engine
|
|
||||||
Light engine (high frequency)
|
|
||||||
Dental drill, dentist's drill
|
|
||||||
Lawn mower
|
|
||||||
Chainsaw
|
|
||||||
Medium engine (mid frequency)
|
|
||||||
Heavy engine (low frequency)
|
|
||||||
Engine knocking
|
|
||||||
Engine starting
|
|
||||||
Idling
|
|
||||||
Accelerating, revving, vroom
|
|
||||||
Door
|
|
||||||
Doorbell
|
|
||||||
Ding-dong
|
|
||||||
Sliding door
|
|
||||||
Slam
|
|
||||||
Knock
|
|
||||||
Tap
|
|
||||||
Squeak
|
|
||||||
Cupboard open or close
|
|
||||||
Drawer open or close
|
|
||||||
Dishes, pots, and pans
|
|
||||||
Cutlery, silverware
|
|
||||||
Chopping (food)
|
|
||||||
Frying (food)
|
|
||||||
Microwave oven
|
|
||||||
Blender
|
|
||||||
Water tap, faucet
|
|
||||||
Sink (filling or washing)
|
|
||||||
Bathtub (filling or washing)
|
|
||||||
Hair dryer
|
|
||||||
Toilet flush
|
|
||||||
Toothbrush
|
|
||||||
Electric toothbrush
|
|
||||||
Vacuum cleaner
|
|
||||||
Zipper (clothing)
|
|
||||||
Keys jangling
|
|
||||||
Coin (dropping)
|
|
||||||
Scissors
|
|
||||||
Electric shaver, electric razor
|
|
||||||
Shuffling cards
|
|
||||||
Typing
|
|
||||||
Typewriter
|
|
||||||
Computer keyboard
|
|
||||||
Writing
|
|
||||||
Alarm
|
|
||||||
Telephone
|
|
||||||
Telephone bell ringing
|
|
||||||
Ringtone
|
|
||||||
Telephone dialing, DTMF
|
|
||||||
Dial tone
|
|
||||||
Busy signal
|
|
||||||
Alarm clock
|
|
||||||
Siren
|
|
||||||
Civil defense siren
|
|
||||||
Buzzer
|
|
||||||
Smoke detector, smoke alarm
|
|
||||||
Fire alarm
|
|
||||||
Foghorn
|
|
||||||
Whistle
|
|
||||||
Steam whistle
|
|
||||||
Mechanisms
|
|
||||||
Ratchet, pawl
|
|
||||||
Clock
|
|
||||||
Tick
|
|
||||||
Tick-tock
|
|
||||||
Gears
|
|
||||||
Pulleys
|
|
||||||
Sewing machine
|
|
||||||
Mechanical fan
|
|
||||||
Air conditioning
|
|
||||||
Cash register
|
|
||||||
Printer
|
|
||||||
Camera
|
|
||||||
Single-lens reflex camera
|
|
||||||
Tools
|
|
||||||
Hammer
|
|
||||||
Jackhammer
|
|
||||||
Sawing
|
|
||||||
Filing (rasp)
|
|
||||||
Sanding
|
|
||||||
Power tool
|
|
||||||
Drill
|
|
||||||
Explosion
|
|
||||||
Gunshot, gunfire
|
|
||||||
Machine gun
|
|
||||||
Fusillade
|
|
||||||
Artillery fire
|
|
||||||
Cap gun
|
|
||||||
Fireworks
|
|
||||||
Firecracker
|
|
||||||
Burst, pop
|
|
||||||
Eruption
|
|
||||||
Boom
|
|
||||||
Wood
|
|
||||||
Chop
|
|
||||||
Splinter
|
|
||||||
Crack
|
|
||||||
Glass
|
|
||||||
Chink, clink
|
|
||||||
Shatter
|
|
||||||
Liquid
|
|
||||||
Splash, splatter
|
|
||||||
Slosh
|
|
||||||
Squish
|
|
||||||
Drip
|
|
||||||
Pour
|
|
||||||
Trickle, dribble
|
|
||||||
Gush
|
|
||||||
Fill (with liquid)
|
|
||||||
Spray
|
|
||||||
Pump (liquid)
|
|
||||||
Stir
|
|
||||||
Boiling
|
|
||||||
Sonar
|
|
||||||
Arrow
|
|
||||||
Whoosh, swoosh, swish
|
|
||||||
Thump, thud
|
|
||||||
Thunk
|
|
||||||
Electronic tuner
|
|
||||||
Effects unit
|
|
||||||
Chorus effect
|
|
||||||
Basketball bounce
|
|
||||||
Bang
|
|
||||||
Slap, smack
|
|
||||||
Whack, thwack
|
|
||||||
Smash, crash
|
|
||||||
Breaking
|
|
||||||
Bouncing
|
|
||||||
Whip
|
|
||||||
Flap
|
|
||||||
Scratch
|
|
||||||
Scrape
|
|
||||||
Rub
|
|
||||||
Roll
|
|
||||||
Crushing
|
|
||||||
Crumpling, crinkling
|
|
||||||
Tearing
|
|
||||||
Beep, bleep
|
|
||||||
Ping
|
|
||||||
Ding
|
|
||||||
Clang
|
|
||||||
Squeal
|
|
||||||
Creak
|
|
||||||
Rustle
|
|
||||||
Whir
|
|
||||||
Clatter
|
|
||||||
Sizzle
|
|
||||||
Clicking
|
|
||||||
Clickety-clack
|
|
||||||
Rumble
|
|
||||||
Plop
|
|
||||||
Jingle, tinkle
|
|
||||||
Hum
|
|
||||||
Zing
|
|
||||||
Boing
|
|
||||||
Crunch
|
|
||||||
Silence
|
|
||||||
Sine wave
|
|
||||||
Harmonic
|
|
||||||
Chirp tone
|
|
||||||
Sound effect
|
|
||||||
Pulse
|
|
||||||
Inside, small room
|
|
||||||
Inside, large room or hall
|
|
||||||
Inside, public space
|
|
||||||
Outside, urban or manmade
|
|
||||||
Outside, rural or natural
|
|
||||||
Reverberation
|
|
||||||
Echo
|
|
||||||
Noise
|
|
||||||
Environmental noise
|
|
||||||
Static
|
|
||||||
Mains hum
|
|
||||||
Distortion
|
|
||||||
Sidetone
|
|
||||||
Cacophony
|
|
||||||
White noise
|
|
||||||
Pink noise
|
|
||||||
Throbbing
|
|
||||||
Vibration
|
|
||||||
Television
|
|
||||||
Radio
|
|
||||||
Field recording
|
|
@ -1,111 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import argparse
|
|
||||||
import os
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import paddle
|
|
||||||
from paddleaudio.backends import load as load_audio
|
|
||||||
from paddleaudio.features import melspectrogram
|
|
||||||
from paddleaudio.models.panns import cnn14
|
|
||||||
from paddleaudio.utils import logger
|
|
||||||
|
|
||||||
# yapf: disable
# Command-line options for the tagging script.
# NOTE: arguments are parsed at module import time, so importing this module
# from another program will consume that program's sys.argv.
# The module docstring is passed as `description` (the first positional
# parameter of ArgumentParser is `prog`, not the description).
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def split(waveform: np.ndarray, win_size: int, hop_size: int):
    """
    Split a waveform into fixed-length, possibly overlapping windows.

    A window starts every `hop_size` samples, beginning at sample 0, until
    the end of the waveform is reached. Windows that run past the end are
    zero-padded on the right so that every window has length `win_size`.

    Args:
        waveform: Samples to split; must be a numpy array.
        win_size: Number of samples per window; must be positive.
        hop_size: Number of samples between window starts; must be positive.

    Returns:
        A tuple `(time, data)` where `data` is the list of windows and
        `time` is a list of the same length holding each window's start
        position as a fraction of the total waveform length
        (`start / len(waveform)`).

    Raises:
        ValueError: If `win_size` or `hop_size` is not positive.
    """
    assert isinstance(waveform, np.ndarray)
    # Fail early with a clear message instead of range()'s generic error for
    # a zero hop, or a silently empty result for a negative one.
    if hop_size <= 0:
        raise ValueError(f'hop_size must be positive, got {hop_size}')
    if win_size <= 0:
        raise ValueError(f'win_size must be positive, got {win_size}')

    total = len(waveform)
    time = []
    data = []
    for start in range(0, total, hop_size):
        segment = waveform[start:start + win_size]
        shortfall = win_size - len(segment)
        if shortfall > 0:
            # Tail window: pad with zeros up to the full window length.
            segment = np.pad(segment, (0, shortfall))
        data.append(segment)
        time.append(start / total)
    return time, data
|
|
||||||
|
|
||||||
|
|
||||||
def batchify(data: List[List[float]],
             sample_rate: int,
             batch_size: int,
             **kwargs):
    """
    Extract features from waveforms and yield them in batches.

    Each waveform is converted with `melspectrogram(waveform, sample_rate,
    **kwargs)` and the result is transposed. Features are computed lazily,
    as each batch is filled, rather than all up front.

    Args:
        data: Iterable of waveforms.
        sample_rate: Sample rate forwarded to `melspectrogram`.
        batch_size: Number of feature matrices per yielded batch.
        **kwargs: Extra keyword arguments forwarded to `melspectrogram`.

    Yields:
        Lists of transposed feature matrices. Every list has `batch_size`
        entries except possibly the last, which holds the remainder.
    """
    one_batch = []
    for waveform in data:
        feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
        one_batch.append(feats)
        if len(one_batch) == batch_size:
            yield one_batch
            one_batch = []
    # Emit the final, partially filled batch (if any).
    if one_batch:
        yield one_batch
|
|
||||||
|
|
||||||
|
|
||||||
def predict(model, data: List[List[float]], sample_rate: int,
            batch_size: int=1):
    """
    Use pretrained model to make predictions.

    Args:
        model: callable network; it is switched to eval mode and then called
            once per batch of features.
        data: waveforms to score, one per sample.
        sample_rate: sample rate passed on to feature extraction.
        batch_size: number of samples fed to the model at once.

    Returns:
        The model outputs of all batches concatenated along the first axis as
        a numpy array, or None when `data` produces no batch.
    """
    model.eval()
    # Collect per-batch scores and join them once at the end; concatenating
    # onto a growing array inside the loop would copy all earlier results on
    # every iteration.
    score_chunks = []
    for batch in batchify(data, sample_rate, batch_size):
        # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
        feats = paddle.to_tensor(batch).unsqueeze(1)
        score_chunks.append(model(feats).numpy())

    if not score_chunks:
        return None
    return np.concatenate(score_chunks)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    paddle.set_device(args.device)
    model = cnn14(pretrained=True, extract_embedding=False)
    waveform, sr = load_audio(args.wav, sr=None)

    # `time` holds the start of every window as a fraction of the audio
    # length. It is produced together with `data`, so both always have the
    # same number of entries; recomputing it with a float-step np.arange could
    # yield one entry more or less because of rounding.
    time, data = split(waveform,
                       int(args.sample_duration * sr),
                       int(args.hop_duration * sr))
    results = predict(model, data, sr, batch_size=8)

    # exist_ok avoids failing when the directory appears between a check and
    # the creation.
    os.makedirs(args.output_dir, exist_ok=True)
    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
    np.savez(output_file, time=time, scores=results)
    logger.info(f'Saved tagging results to {output_file}')
|
|
@ -1,83 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import argparse
|
|
||||||
import ast
|
|
||||||
import os
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from paddleaudio.utils import logger
|
|
||||||
|
|
||||||
# yapf: disable
# Command line options of this script. The module docstring (if any) is used
# as the description; passing it positionally would set `prog` instead.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--tagging_file', type=str, required=True, help='File of tagging results (holding "time" and "scores" arrays) to parse.')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
# ast.literal_eval turns the strings "True"/"False" into real booleans.
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
|
|
||||||
|
|
||||||
|
|
||||||
def smooth(results: np.ndarray, win_size: int):
    """
    Apply posterior smoothing to `results` in place.

    Entry `i` is replaced by the average of the entries in the window ending
    at `i` (at most `win_size` entries, fewer near the start). Entries are
    processed from last to first so every average only reads entries that
    have not been overwritten yet.
    """
    for end in reversed(range(len(results))):
        begin = max(0, end + 1 - win_size)
        span = end - begin + 1
        results[end] = np.sum(results[begin:end + 1], axis=0) / span
|
|
||||||
|
|
||||||
|
|
||||||
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
    """
    Format the `k` best-scoring labels of one score vector as text.

    `label_map` maps a score index to its label. The returned string has one
    `label: score` line per selected label, highest score first.
    """
    scores = np.asarray(result)
    best = np.argsort(-scores)[:k]
    lines = [f'{label_map[idx]}: {scores[idx]}\n' for idx in best]
    return ''.join(lines)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Zero-based line number in the label file -> label text.
    with open(args.label_file, 'r') as f:
        label_map = {i: l.strip() for i, l in enumerate(f)}

    # NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which
    # can execute arbitrary code; only load tagging files from trusted sources.
    # The archive is opened as a context manager so its file handle is closed
    # as soon as both arrays have been read.
    with np.load(args.tagging_file, allow_pickle=True) as results:
        times, scores = results['time'], results['scores']

    if args.smooth:
        logger.info('Posterior smoothing...')
        smooth(scores, win_size=args.smooth_size)

    # exist_ok avoids failing when the directory appears between a check and
    # the creation.
    os.makedirs(args.output_dir, exist_ok=True)
    # The output is named after the tagging file (text before its first dot).
    output_file = os.path.join(
        args.output_dir,
        os.path.basename(args.tagging_file).split('.')[0] + '.txt')
    with open(output_file, 'w') as f:
        # One paragraph per window: its time stamp followed by the top labels.
        for time, score in zip(times, scores):
            f.write(f'{time}\n')
            f.write(generate_topk_label(args.top_k, label_map, score) + '\n')

    logger.info(f'Saved tagging labels to {output_file}')
|
|
@ -1,154 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import codecs
|
|
||||||
import collections
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
from paddle.io import Dataset
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from ..backends import load as load_audio
|
|
||||||
from ..utils.download import decompress
|
|
||||||
from ..utils.download import download_and_decompress
|
|
||||||
from ..utils.env import DATA_HOME
|
|
||||||
from ..utils.log import logger
|
|
||||||
from .dataset import feat_funcs
|
|
||||||
|
|
||||||
__all__ = ['AISHELL1']
|
|
||||||
|
|
||||||
|
|
||||||
class AISHELL1(Dataset):
    """
    This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
    It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including
    smart home, autonomous driving, and industrial production. The whole recording was
    put in quiet indoor environment, using 3 different devices at the same time: high
    fidelity microphone (44.1kHz, 16-bit); Android-system mobile phone (16kHz, 16-bit),
    iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled
    to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas
    in China were invited to participate in the recording. The manual transcription
    accuracy rate is above 95%, through professional speech annotation and strict
    quality inspection. The corpus is divided into training, development and testing
    sets.

    Reference:
        AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
        https://arxiv.org/abs/1709.05522
    """

    # Archives to download (attribute name kept as-is for compatibility).
    archieves = [
        {
            'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
            'md5': '2f494334227864a8a8fec932999db9d8',
        },
    ]
    # All paths below are relative to DATA_HOME.
    text_meta = os.path.join('data_aishell', 'transcript',
                             'aishell_transcript_v0.8.txt')
    utt_info = collections.namedtuple('META_INFO',
                                      ('file_path', 'utt_id', 'text'))
    audio_path = os.path.join('data_aishell', 'wav')
    manifest_path = os.path.join('data_aishell', 'manifest')
    subset = ['train', 'dev', 'test']

    def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            subset: which part of the corpus to load; must be one of the names
                in the class attribute `subset`.
            feat_type: key into `feat_funcs` selecting the feature function.
            **kwargs: extra keyword arguments forwarded to the feature function.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        # From here on the instance attribute shadows the class-level list.
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(AISHELL1, self).__init__()

    def _get_text_info(self) -> Dict[str, str]:
        """Map every utterance id of the transcript file to its text with all whitespace removed."""
        ret = {}
        # The transcript holds Chinese text; decode it explicitly as UTF-8
        # (the encoding the manifest is written in) instead of relying on the
        # platform default.
        with open(
                os.path.join(DATA_HOME, self.text_meta), 'r',
                encoding='utf-8') as rf:
            # NOTE(review): the first line is skipped, as before; confirm the
            # transcript really starts with a header and not an utterance.
            for line in rf.readlines()[1:]:
                utt_id, text = map(str.strip, line.split(' ',
                                                         1))  # utt_id, text
                ret.update({utt_id: ''.join(text.split())})
        return ret

    def _get_data(self):
        """
        Download the corpus if it is missing and list the labelled utterances
        of the selected subset as `utt_info` tuples.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
            download_and_decompress(self.archieves, DATA_HOME)
            # Extract *wav from *.tar.gz.
            for root, _, files in os.walk(
                    os.path.join(DATA_HOME, self.audio_path)):
                for file in files:
                    if file.endswith('.tar.gz'):
                        decompress(os.path.join(root, file))
                        os.remove(os.path.join(root, file))

        text_info = self._get_text_info()

        data = []
        for root, _, files in os.walk(
                os.path.join(DATA_HOME, self.audio_path, self.subset)):
            for file in files:
                if file.endswith('.wav'):
                    utt_id = os.path.splitext(file)[0]
                    if utt_id not in text_info:  # Some utterances have no label
                        continue
                    text = text_info[utt_id]
                    file_path = os.path.join(root, file)
                    data.append(self.utt_info(file_path, utt_id, text))

        return data

    def _convert_to_record(self, idx: int):
        """
        Build the record of sample `idx`: its `utt_info` fields plus `feat`
        (extracted feature, or the raw waveform when the selected feature
        function is falsy) and `duration` (number of samples / sample rate).
        """
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(
            sample[0])  # The first element of sample is file path
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sample_rate=sr,
            **self.feat_config) if feat_func else waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """
        Write one JSON line per utterance of the loaded subset to
        `<DATA_HOME>/<manifest_path>/<prefix>.<subset>`.
        """
        # exist_ok avoids failing when the directory appears between a check
        # and the creation.
        os.makedirs(os.path.join(DATA_HOME, self.manifest_path), exist_ok=True)

        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
                                     f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text']
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        """Return the record of sample `idx` as a tuple of its values, in insertion order."""
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        """Number of labelled utterances in the loaded subset."""
        return len(self._data)
|
|
@ -1,298 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import collections
|
|
||||||
import os
|
|
||||||
from typing import List
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
from ..utils.download import download_and_decompress
|
|
||||||
from ..utils.env import DATA_HOME
|
|
||||||
from .dataset import AudioClassificationDataset
|
|
||||||
|
|
||||||
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
|
|
||||||
|
|
||||||
|
|
||||||
class UrbanAcousticScenes(AudioClassificationDataset):
    """
    TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
    12 European cities in 10 different acoustic scenes using 4 different devices.
    Additionally, synthetic data for 11 mobile devices was created based on the original
    recordings. Of the 12 cities, two are present only in the evaluation set.

    Reference:
        A multi-device dataset for urban acoustic scene classification
        https://arxiv.org/abs/1807.09840
    """

    source_url = 'https://zenodo.org/record/3819968/files/'
    base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
    # Archives to download: one meta archive followed by 16 audio archives.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': 'ed38956c4246abb56190c1e9b602b7b8',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '97ab8560056b6816808dedc044dcc023',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'fbf856a3a86fff7520549c899dc94372',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '0dbffe7b6e45564da649378723284062',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
        },
        {
            'url': source_url + base_name + '.audio.9.zip',
            'md5': 'a65596a5372eab10c78e08a0de797c9e',
        },
        {
            'url': source_url + base_name + '.audio.10.zip',
            'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
        },
        {
            'url': source_url + base_name + '.audio.11.zip',
            'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
        },
        {
            'url': source_url + base_name + '.audio.12.zip',
            'md5': 'e5f4400c6b9697295fab4cf507155a2f',
        },
        {
            'url': source_url + base_name + '.audio.13.zip',
            'md5': '8855ab9f9896422746ab4c5d89d8da2f',
        },
        {
            'url': source_url + base_name + '.audio.14.zip',
            'md5': '092ad744452cd3e7de78f988a3d13020',
        },
        {
            'url': source_url + base_name + '.audio.15.zip',
            'md5': '4b5eb85f6592aebf846088d9df76b420',
        },
        {
            'url': source_url + base_name + '.audio.16.zip',
            'md5': '2e0a89723e58a3836be019e6996ae460',
        },
    ]
    # The position of a scene name in this list is its integer class label.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]

    # All paths below are relative to DATA_HOME.
    meta = os.path.join(base_name, 'meta.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename', 'scene_label', 'identifier', 'source_label'))
    subset_meta = {
        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
        'dev': os.path.join(base_name, 'evaluation_setup',
                            'fold1_evaluate.csv'),
        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
                                              ('filename', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Name of the subset to load; must be a key of `subset_meta`.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                Type of feature to extract from each audio file.
            **kwargs:
                Forwarded unchanged to the parent class.
        """
        audio_files, targets = self._get_data(mode)
        super(UrbanAcousticScenes, self).__init__(
            files=audio_files, labels=targets, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        """
        Read a tab-separated meta file into a list of namedtuples.

        With `subset=None` the global meta file is read as `meta_info`
        records; otherwise the meta file of that subset is read as
        `subset_meta_info` records. `skip_header` drops the first line.
        """
        if subset is not None:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file, record_type = self.subset_meta[
                subset], self.subset_meta_info
        else:
            meta_file, record_type = self.meta, self.meta_info

        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            rows = rf.readlines()
            if skip_header:
                rows = rows[1:]
            return [record_type(*row.strip().split('\t')) for row in rows]

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """
        Download the dataset if it is missing, then return the audio file
        paths of subset `mode` and their integer labels (same order).
        """
        have_everything = os.path.isdir(
            os.path.join(DATA_HOME, self.audio_path)) and os.path.isfile(
                os.path.join(DATA_HOME, self.meta))
        if not have_everything:
            download_and_decompress(self.archieves, DATA_HOME)

        files, labels = [], []
        for sample in self._get_meta_info(subset=mode, skip_header=True):
            listed_name, scene = sample[:2]
            # Only the base name is kept; audio is looked up in `audio_path`.
            audio_name = os.path.basename(listed_name)
            target = self.label_list.index(scene)

            files.append(os.path.join(DATA_HOME, self.audio_path, audio_name))
            labels.append(int(target))

        return files, labels
|
|
||||||
|
|
||||||
|
|
||||||
class UrbanAudioVisualScenes(AudioClassificationDataset):
    """
    TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
    and video recordings from 12 European cities in 10 different scenes.
    This dataset consists of 10-seconds audio and video segments from 10
    acoustic scenes. The total amount of audio in the development set is 34 hours.

    Reference:
        A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
        https://arxiv.org/abs/2011.00030
    """

    source_url = 'https://zenodo.org/record/4477542/files/'
    base_name = 'TAU-urban-audio-visual-scenes-2021-development'

    # Archives to download: one meta archive followed by 8 audio archives.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '76e3d7ed5291b118372e06379cb2b490',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '7fd6bb63127f5785874a55aba4e77aa5',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': '61396bede29d7c8c89729a01a6f6b2e2',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '6ddac89717fcf9c92c451868eed77fe1',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'af4820756cdf1a7d4bd6037dc034d384',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '2be39a76aeed704d5929d020a2909efd',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': '972d8afe0874720fc2f28086e7cb22a9',
        },
    ]
    # The position of a scene name in this list is its integer class label.
    label_list = [
        'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
    ]

    # All paths below are relative to DATA_HOME.
    meta_base_path = os.path.join(base_name, base_name + '.meta')
    meta = os.path.join(meta_base_path, 'meta.csv')
    meta_info = collections.namedtuple(
        'META_INFO',
        ('filename_audio', 'filename_video', 'scene_label', 'identifier'))
    subset_meta = {
        'train': os.path.join(meta_base_path, 'evaluation_setup',
                              'fold1_train.csv'),
        'dev': os.path.join(meta_base_path, 'evaluation_setup',
                            'fold1_evaluate.csv'),
        'test': os.path.join(meta_base_path, 'evaluation_setup',
                             'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple(
        'SUBSET_META_INFO',
        ('filename_audio', 'filename_video', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')

    def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                Name of the subset to load; must be a key of `subset_meta`.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                Type of feature to extract from each audio file.
            **kwargs:
                Forwarded unchanged to the parent class.
        """
        audio_files, targets = self._get_data(mode)
        super(UrbanAudioVisualScenes, self).__init__(
            files=audio_files, labels=targets, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, subset: str=None,
                       skip_header: bool=True) -> List[collections.namedtuple]:
        """
        Read a tab-separated meta file into a list of namedtuples.

        With `subset=None` the global meta file is read as `meta_info`
        records; otherwise the meta file of that subset is read as
        `subset_meta_info` records. `skip_header` drops the first line.
        """
        if subset is not None:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file, record_type = self.subset_meta[
                subset], self.subset_meta_info
        else:
            meta_file, record_type = self.meta, self.meta_info

        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            rows = rf.readlines()
            if skip_header:
                rows = rows[1:]
            return [record_type(*row.strip().split('\t')) for row in rows]

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """
        Download the dataset if it is missing, then return the audio file
        paths of subset `mode` and their integer labels (same order).
        """
        have_everything = os.path.isdir(
            os.path.join(DATA_HOME, self.audio_path)) and os.path.isfile(
                os.path.join(DATA_HOME, self.meta))
        if not have_everything:
            # The archives are unpacked into a directory named after the dataset.
            download_and_decompress(self.archieves,
                                    os.path.join(DATA_HOME, self.base_name))

        files, labels = [], []
        for sample in self._get_meta_info(subset=mode, skip_header=True):
            # The second column (video file name) is not used here.
            listed_name, _, scene = sample[:3]
            # Only the base name is kept; audio is looked up in `audio_path`.
            audio_name = os.path.basename(listed_name)
            target = self.label_list.index(scene)

            files.append(os.path.join(DATA_HOME, self.audio_path, audio_name))
            labels.append(int(target))

        return files, labels
|
|
@ -1,199 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import codecs
|
|
||||||
import collections
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
from paddle.io import Dataset
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from ..backends import load as load_audio
|
|
||||||
from ..utils.download import download_and_decompress
|
|
||||||
from ..utils.env import DATA_HOME
|
|
||||||
from ..utils.log import logger
|
|
||||||
from .dataset import feat_funcs
|
|
||||||
|
|
||||||
__all__ = ['LIBRISPEECH']
|
|
||||||
|
|
||||||
|
|
||||||
class LIBRISPEECH(Dataset):
    """
    LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
    prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
    derived from read audiobooks from the LibriVox project, and has been carefully
    segmented and aligned.

    Reference:
        LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
        http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
    """

    source_url = 'http://www.openslr.org/resources/12/'
    # Archives to download (attribute name kept as-is for compatibility).
    archieves = [
        {
            'url': source_url + 'train-clean-100.tar.gz',
            'md5': '2a93770f6d5c6c964bc36631d331a522',
        },
        {
            'url': source_url + 'train-clean-360.tar.gz',
            'md5': 'c0e676e450a7ff2f54aeade5171606fa',
        },
        {
            'url': source_url + 'train-other-500.tar.gz',
            'md5': 'd1a0fd59409feb2c614ce4d30c387708',
        },
        {
            'url': source_url + 'dev-clean.tar.gz',
            'md5': '42e2234ba48799c1f50f24a7926300a1',
        },
        {
            'url': source_url + 'dev-other.tar.gz',
            'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
        },
        {
            'url': source_url + 'test-clean.tar.gz',
            'md5': '32fa31d27d2e1cad72775fee3f4849a9',
        },
        {
            'url': source_url + 'test-other.tar.gz',
            'md5': 'fb5a50374b501bb3bac4815ee91d3135',
        },
    ]
    # All paths below are relative to DATA_HOME.
    speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
    utt_info = collections.namedtuple(
        'META_INFO', ('file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
    audio_path = 'LibriSpeech'
    manifest_path = os.path.join('LibriSpeech', 'manifest')
    # Subset names mirror the archive names above; the 500-hour training set
    # is 'train-other-500' (there is no 'train-clean-500' archive).
    subset = [
        'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
        'dev-other', 'test-clean', 'test-other'
    ]

    def __init__(self,
                 subset: str='train-clean-100',
                 feat_type: str='raw',
                 **kwargs):
        """
        Args:
            subset: which part of the corpus to load; must be one of the names
                in the class attribute `subset`.
            feat_type: key into `feat_funcs` selecting the feature function.
            **kwargs: extra keyword arguments forwarded to the feature function.
        """
        assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
            self.subset, subset)
        # From here on the instance attribute shadows the class-level list.
        self.subset = subset
        self.feat_type = feat_type
        self.feat_config = kwargs
        self._data = self._get_data()
        super(LIBRISPEECH, self).__init__()

    def _get_speaker_info(self) -> Dict[str, str]:
        """Map every speaker id listed in the speaker meta file to the gender field."""
        ret = {}
        with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
            for line in rf.readlines():
                if ';' in line:  # Skip dataset abstract
                    continue
                # Only the first two '|'-separated columns are used.
                spk_id, gender = map(str.strip,
                                     line.split('|')[:2])  # spk_id, gender
                ret.update({spk_id: gender})
        return ret

    def _get_text_info(self, trans_file) -> Dict[str, str]:
        """Map every utterance id in one transcript file to its text."""
        ret = {}
        with open(trans_file, 'r') as rf:
            for line in rf.readlines():
                utt_id, text = map(str.strip, line.split(' ',
                                                         1))  # utt_id, text
                ret.update({utt_id: text})
        return ret

    def _get_data(self):
        """
        Download the corpus if it is missing and list the utterances of the
        selected subset that have both a transcript and speaker info, as
        `utt_info` tuples.
        """
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
            download_and_decompress(self.archieves, DATA_HOME,
                                    len(self.archieves))

        # Speaker info
        speaker_info = self._get_speaker_info()

        subset_dir = os.path.join(DATA_HOME, self.audio_path, self.subset)

        # Text info
        text_info = {}
        for root, _, files in os.walk(subset_dir):
            for file in files:
                if file.endswith('.trans.txt'):
                    text_info.update(
                        self._get_text_info(os.path.join(root, file)))

        data = []
        for root, _, files in os.walk(subset_dir):
            for file in files:
                if file.endswith('.flac'):
                    utt_id = os.path.splitext(file)[0]
                    # The speaker id is the part of the utterance id before
                    # the first '-'.
                    spk_id = utt_id.split('-')[0]
                    if utt_id not in text_info \
                        or spk_id not in speaker_info:  # Skip samples with incomplete data
                        continue
                    file_path = os.path.join(root, file)
                    text = text_info[utt_id]
                    spk_gender = speaker_info[spk_id]
                    data.append(
                        self.utt_info(file_path, utt_id, text, spk_id,
                                      spk_gender))

        return data

    def _convert_to_record(self, idx: int):
        """
        Build the record of sample `idx`: its `utt_info` fields plus `feat`
        (extracted feature, or the raw waveform when the selected feature
        function is falsy) and `duration` (number of samples / sample rate).
        """
        sample = self._data[idx]

        record = {}
        # To show all fields in a namedtuple: `type(sample)._fields`
        for field in type(sample)._fields:
            record[field] = getattr(sample, field)

        waveform, sr = load_audio(
            sample[0])  # The first element of sample is file path
        feat_func = feat_funcs[self.feat_type]
        feat = feat_func(
            waveform, sample_rate=sr,
            **self.feat_config) if feat_func else waveform
        record.update({'feat': feat, 'duration': len(waveform) / sr})
        return record

    def create_manifest(self, prefix='manifest'):
        """
        Write one JSON line per utterance of the loaded subset to
        `<DATA_HOME>/<manifest_path>/<prefix>.<subset>`.
        """
        # exist_ok avoids failing when the directory appears between a check
        # and the creation.
        os.makedirs(os.path.join(DATA_HOME, self.manifest_path), exist_ok=True)

        manifest_file = os.path.join(DATA_HOME, self.manifest_path,
                                     f'{prefix}.{self.subset}')
        with codecs.open(manifest_file, 'w', 'utf-8') as f:
            for idx in tqdm(range(len(self))):
                record = self._convert_to_record(idx)
                record_line = json.dumps(
                    {
                        'utt': record['utt_id'],
                        'feat': record['file_path'],
                        'feat_shape': (record['duration'], ),
                        'text': record['text'],
                        'spk': record['spk_id'],
                        'gender': record['spk_gender'],
                    },
                    ensure_ascii=False)
                f.write(record_line + '\n')
        logger.info(f'Manifest file {manifest_file} created.')

    def __getitem__(self, idx):
        """Return the record of sample `idx` as a tuple of its values, in insertion order."""
        record = self._convert_to_record(idx)
        return tuple(record.values())

    def __len__(self):
        """Number of usable utterances in the loaded subset."""
        return len(self._data)
|
|
@ -1,136 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import collections
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
from typing import List
|
|
||||||
from typing import Tuple
|
|
||||||
|
|
||||||
from ..utils.download import download_and_decompress
|
|
||||||
from ..utils.env import DATA_HOME
|
|
||||||
from .dataset import AudioClassificationDataset
|
|
||||||
|
|
||||||
__all__ = ['RAVDESS']
|
|
||||||
|
|
||||||
|
|
||||||
class RAVDESS(AudioClassificationDataset):
    """
    The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
    lexically-matched statements in a neutral North American accent. Speech emotions
    include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
    Each expression is produced at two levels of emotional intensity (normal, strong),
    with an additional neutral expression.

    Reference:
        The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
        A dynamic, multimodal set of facial and vocal expressions in North American English
        https://doi.org/10.1371/journal.pone.0196391
    """

    # Archives passed to download_and_decompress when data is missing.
    # The attribute keeps its original spelling for backward compatibility.
    archieves = [
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
            'md5':
            '5411230427d67a21e18aa4d466e6d1b9',
        },
        {
            'url':
            'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
            'md5':
            'bc696df654c87fed845eb13823edef8a',
        },
    ]
    # Targets are computed as int(emotion) - 1; presumably they index this list.
    label_list = [
        'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
        'surprised'
    ]
    # One field per '-'-separated part of a file's base name. The field
    # spelling 'repitition' is kept for backward compatibility.
    meta_info = collections.namedtuple(
        'META_INFO', ('modality', 'vocal_channel', 'emotion',
                      'emotion_intensity', 'statement', 'repitition', 'actor'))
    speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
    song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')

    def __init__(self,
                 mode='train',
                 seed=0,
                 n_folds=5,
                 split=1,
                 feat_type='raw',
                 **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that user wants to extract of an audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(RAVDESS, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_meta_info(self, files) -> List[collections.namedtuple]:
        """Parse each file's base name (extension removed) into a META_INFO tuple."""
        ret = []
        for file in files:
            basename_without_extend = os.path.splitext(
                os.path.basename(file))[0]
            ret.append(self.meta_info(*basename_without_extend.split('-')))
        return ret

    def _get_data(self, mode, seed, n_folds,
                  split) -> Tuple[List[str], List[int]]:
        """Collect wav files and labels for the requested mode and dev fold.

        Returns:
            A ``(files, labels)`` pair. With ``mode == 'train'`` it holds the
            samples whose fold differs from ``split``; with any other mode it
            holds the samples of fold ``split``.
        """
        # Download when EITHER folder is missing; requiring both to be absent
        # would silently leave a half-present dataset incomplete.
        if not (os.path.isdir(self.speech_path) and
                os.path.isdir(self.song_path)):
            download_and_decompress(self.archieves, DATA_HOME)

        wav_files = []
        for data_dir in (self.speech_path, self.song_path):
            for root, _, files in os.walk(data_dir):
                wav_files.extend(
                    os.path.join(root, file) for file in files
                    if file.endswith('.wav'))

        random.seed(seed)  # shuffle samples to split data
        random.shuffle(
            wav_files
        )  # make sure using the same seed to create train and dev dataset
        meta_info = self._get_meta_info(wav_files)

        files = []
        labels = []
        # At least 1 so fewer samples than folds cannot divide by zero.
        n_samples_per_fold = max(1, len(meta_info) // n_folds)
        for idx, sample in enumerate(meta_info):
            target = int(sample.emotion) - 1
            # Clamp so remainder samples join the last fold instead of a
            # phantom fold n_folds + 1 that no dev set would ever select.
            fold = min(idx // n_samples_per_fold + 1, n_folds)

            # train: every fold except `split`; otherwise: only fold `split`.
            if (mode == 'train') != (fold == split):
                files.append(wav_files[idx])
                labels.append(target)

        return files, labels
|
|
@ -1,41 +0,0 @@
|
|||||||
# PaddleAudio Testing Guide
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Testing
|
|
||||||
First clone a version of the project by
|
|
||||||
```
|
|
||||||
git clone https://github.com/PaddlePaddle/models.git
|
|
||||||
|
|
||||||
```
|
|
||||||
Then install the project in your virtual environment.
|
|
||||||
```
|
|
||||||
cd models/PaddleAudio
|
|
||||||
python setup.py bdist_wheel
|
|
||||||
pip install -e .[dev]
|
|
||||||
```
|
|
||||||
The requirements for testing will be installed along with PaddleAudio.
|
|
||||||
|
|
||||||
Now run
|
|
||||||
```
|
|
||||||
pytest test
|
|
||||||
```
|
|
||||||
|
|
||||||
If it goes well, you will see outputs like these:
|
|
||||||
```
|
|
||||||
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
|
|
||||||
rootdir: ./models/PaddleAudio
|
|
||||||
plugins: hydra-core-1.0.6
|
|
||||||
collected 16 items
|
|
||||||
|
|
||||||
test/unit_test/test_backend.py ........... [ 68%]
|
|
||||||
test/unit_test/test_features.py ..... [100%]
|
|
||||||
|
|
||||||
==================================================== warnings summary ====================================================
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
-- Docs: https://docs.pytest.org/en/stable/warnings.html
|
|
||||||
============================================ 16 passed, 11 warnings in 6.76s =============================================
|
|
||||||
```
|
|
@ -1,113 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import librosa
|
|
||||||
import numpy as np
|
|
||||||
import paddleaudio
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
TEST_FILE = './test/data/test_audio.wav'
|
|
||||||
|
|
||||||
|
|
||||||
def relative_err(a, b, real=True):
    """Compute the relative squared error of two matrices or vectors.

    With ``real=False`` the error is the sum of the errors of the real and
    imaginary parts, each computed independently.
    """

    def _ratio(u, v):
        # Squared difference normalised by total energy; EPS guards 0/0.
        return np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))

    if not real:
        return _ratio(a.real, b.real) + _ratio(a.imag, b.imag)
    return _ratio(a, b)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    """Load TEST_FILE with librosa at 16 kHz and print its mean and std."""
    signal, rate = librosa.load(TEST_FILE, sr=16000)
    print(f'librosa: mean: {np.mean(signal)}, std:{np.std(signal)}')
    return signal, rate
|
|
||||||
|
|
||||||
|
|
||||||
# Module-level fixtures shared by the tests below.
EPS = 1e-8  # numerical tolerance used in assertions and relative_err
x, r = load_audio()  # reference signal and its sample rate
|
|
||||||
|
|
||||||
|
|
||||||
def test_load():
    """paddleaudio.load must honour sr, dtype, offset and duration."""
    signal, rate = paddleaudio.load(TEST_FILE, sr=16000)
    assert rate == 16000
    assert signal.dtype == 'float32'

    # A 2-second excerpt starting at 1 s, returned as 16-bit integers.
    excerpt, excerpt_rate = paddleaudio.load(
        TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
    assert len(excerpt) / excerpt_rate == 2.0
    assert excerpt_rate == 16000
    assert excerpt.dtype == 'int16'
|
|
||||||
|
|
||||||
|
|
||||||
def test_depth_convert():
    """depth_convert must keep length, set dtype and stay inside the int range."""
    # (target dtype, smallest allowed value, largest allowed value)
    cases = (('int16', -32768, 32767), ('int8', -128, 127))
    for dtype, lowest, highest in cases:
        converted = paddleaudio.depth_convert(x, dtype)
        assert len(converted) == len(x)
        assert converted.dtype == dtype
        assert np.max(converted) <= highest
        assert np.min(converted) >= lowest
        # The converted signal must not collapse to a constant.
        assert np.std(converted) > EPS
|
|
||||||
|
|
||||||
|
|
||||||
# Parameters for test_resample: (target sample rate, resampling mode).
rs_test_data = (
    [(rate, 'kaiser_fast') for rate in (32000, 16000, 8000)] +
    [(rate, 'kaiser_best') for rate in (32000, 16000, 8000, 22050, 44100)])
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
    """The resampled length must scale with the ratio of the sample rates."""
    resampled = paddleaudio.resample(x, 16000, sr, mode=mode)
    expected_len = len(x) * (sr / 16000)
    err = relative_err(len(resampled), expected_len)
    print('err:', err)
    assert err < EPS
|
|
||||||
|
|
||||||
|
|
||||||
def test_normalize():
    """Check the peak bound of linear normalisation and unit std of gaussian."""
    half_scaled = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
    assert np.max(half_scaled) < 0.5 + EPS

    double_scaled = paddleaudio.normalize(
        x, norm_type='linear', mul_factor=2.0)
    assert np.max(double_scaled) <= 2.0 + EPS

    standardized = paddleaudio.normalize(
        x, norm_type='gaussian', mul_factor=1.0)
    print('np.std(y):', np.std(standardized))
    assert np.abs(np.std(standardized) - 1.0) < EPS
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Run the tests directly, without pytest, in their original order.
    for case in (test_load, test_depth_convert):
        case()
    test_resample(22050, 'kaiser_fast')
    test_normalize()
|
|
@ -1,143 +0,0 @@
|
|||||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
import librosa
|
|
||||||
import numpy as np
|
|
||||||
import paddleaudio as pa
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
    """Load the shared test clip with librosa's default settings."""
    audio_path = './test/data/test_audio.wav'
    signal, rate = librosa.load(audio_path)
    #x,r = librosa.load('../data/test_audio.wav',sr=16000)
    return signal, rate
|
|
||||||
|
|
||||||
|
|
||||||
## Module-level fixtures shared by the tests below.
EPS = 1e-8  # numerical tolerance used in assertions and relative_err
x, r = load_audio()  # reference signal and its sample rate
|
|
||||||
|
|
||||||
|
|
||||||
def relative_err(a, b, real=True):
    """Compute the relative squared error of two matrices or vectors.

    For complex input (``real=False``) the real and imaginary parts are
    scored separately and the two errors are added.
    """
    if real:
        parts = [(a, b)]
    else:
        parts = [(a.real, b.real), (a.imag, b.imag)]

    # Each term: squared difference over total energy; EPS guards 0/0.
    terms = [
        np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))
        for u, v in parts
    ]
    err = terms[0]
    for extra in terms[1:]:
        err += extra
    return err
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
    """pa.melspectrogram (no dB scaling) must match librosa for the same settings."""
    a = pa.melspectrogram(
        x,
        window_size=512,
        sr=16000,
        hop_length=320,
        n_mels=64,
        fmin=50,
        to_db=False, )
    # Pass the signal as `y=`: newer librosa releases make it keyword-only,
    # and the keyword form also works on older releases.
    b = librosa.feature.melspectrogram(
        y=x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    assert relative_err(a, b) < EPS
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
    """pa.melspectrogram with to_db=True must match librosa mel + pa.power_to_db."""
    a = pa.melspectrogram(
        x,
        window_size=512,
        sr=16000,
        hop_length=320,
        n_mels=64,
        fmin=50,
        to_db=True,
        ref=1.0,
        amin=1e-10,
        top_db=None)
    # Pass the signal as `y=`: newer librosa releases make it keyword-only,
    # and the keyword form also works on older releases.
    b = librosa.feature.melspectrogram(
        y=x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    # Convert the reference with the same dB parameters used above.
    b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
    assert relative_err(a, b) < EPS
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
    """pa.stft must match librosa.stft in shape and in complex values."""
    stft_args = dict(n_fft=1024, hop_length=320, win_length=512)
    ours = pa.stft(x, **stft_args)
    reference = librosa.stft(x, **stft_args)
    assert ours.shape == reference.shape
    # real=False scores the real and imaginary parts separately.
    assert relative_err(ours, reference, real=False) < EPS
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
    """pa.split_frames must match librosa.util.frame for the same framing."""
    framing = dict(frame_length=512, hop_length=320)
    reference = librosa.util.frame(x, **framing)
    ours = pa.split_frames(x, **framing)
    assert relative_err(reference, ours) < EPS
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
    """pa.mfcc must match librosa.feature.mfcc computed from the same mel spectrogram."""
    # Mel-spectrogram settings forwarded to pa.mfcc.
    kwargs = {
        'window_size': 512,
        'hop_length': 320,
        'n_mels': 64,
        'fmin': 50,
        'to_db': False
    }
    a = pa.mfcc(
        x,
        #sample_rate=16000,
        spect=None,
        n_mfcc=20,
        dct_type=2,
        norm='ortho',
        lifter=0,
        **kwargs)
    # Pass the signal as `y=` in both librosa calls: newer releases make it
    # keyword-only, and the keyword form also works on older releases.
    S = librosa.feature.melspectrogram(
        y=x,
        sr=16000,
        n_fft=512,
        win_length=512,
        hop_length=320,
        n_mels=64,
        fmin=50)
    b = librosa.feature.mfcc(
        y=x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
    assert relative_err(a, b) < EPS
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Run the tests directly, without pytest, in their original order.
    for case in (test_melspectrogram, test_melspectrogram_db, test_stft,
                 test_split_frames, test_mfcc):
        case()
|
|
@ -1,10 +1,10 @@
|
|||||||
#! /usr/bin/env bash
|
#! /usr/bin/env bash
|
||||||
|
|
||||||
TARGET_DIR=${MAIN_ROOT}/examples/dataset/voxforge
|
TARGET_DIR=${MAIN_ROOT}/dataset/voxforge
|
||||||
mkdir -p ${TARGET_DIR}
|
mkdir -p ${TARGET_DIR}
|
||||||
|
|
||||||
# download data, generate manifests
|
# download data, generate manifests
|
||||||
python ${MAIN_ROOT}/examples/dataset/voxforge/voxforge.py \
|
python ${MAIN_ROOT}/dataset/voxforge/voxforge.py \
|
||||||
--manifest_prefix="${TARGET_DIR}/manifest" \
|
--manifest_prefix="${TARGET_DIR}/manifest" \
|
||||||
--target_dir="${TARGET_DIR}" \
|
--target_dir="${TARGET_DIR}" \
|
||||||
--is_merge_dialect=True \
|
--is_merge_dialect=True \
|
@ -0,0 +1,109 @@
|
|||||||
|
###########################################################
|
||||||
|
# FEATURE EXTRACTION SETTING #
|
||||||
|
###########################################################
|
||||||
|
|
||||||
|
fs: 24000 # sr
|
||||||
|
n_fft: 2048 # FFT size.
|
||||||
|
n_shift: 300 # Hop size.
|
||||||
|
win_length: 1200 # Window length.
|
||||||
|
# If set to null, it will be the same as n_fft.
|
||||||
|
window: "hann" # Window function.
|
||||||
|
|
||||||
|
# Only used for feats_type != raw
|
||||||
|
|
||||||
|
fmin: 80 # Minimum frequency of Mel basis.
|
||||||
|
fmax: 7600 # Maximum frequency of Mel basis.
|
||||||
|
n_mels: 80 # The number of mel basis.
|
||||||
|
|
||||||
|
# Only used for the model using pitch features (e.g. FastSpeech2)
|
||||||
|
f0min: 80 # Minimum f0 for pitch extraction.
|
||||||
|
f0max: 400 # Maximum f0 for pitch extraction.
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 64
|
||||||
|
num_workers: 4
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# MODEL SETTING #
|
||||||
|
###########################################################
|
||||||
|
model:
|
||||||
|
adim: 384 # attention dimension
|
||||||
|
aheads: 2 # number of attention heads
|
||||||
|
elayers: 4 # number of encoder layers
|
||||||
|
eunits: 1536 # number of encoder ff units
|
||||||
|
dlayers: 4 # number of decoder layers
|
||||||
|
dunits: 1536 # number of decoder ff units
|
||||||
|
positionwise_layer_type: conv1d # type of position-wise layer
|
||||||
|
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
|
||||||
|
duration_predictor_layers: 2 # number of layers of duration predictor
|
||||||
|
duration_predictor_chans: 256 # number of channels of duration predictor
|
||||||
|
duration_predictor_kernel_size: 3 # filter size of duration predictor
|
||||||
|
postnet_layers: 5 # number of layers of postnet
|
||||||
|
postnet_filts: 5 # filter size of conv layers in postnet
|
||||||
|
postnet_chans: 256 # number of channels of conv layers in postnet
|
||||||
|
encoder_normalize_before: True # whether to perform layer normalization before the input
|
||||||
|
decoder_normalize_before: True # whether to perform layer normalization before the input
|
||||||
|
reduction_factor: 1 # reduction factor
|
||||||
|
encoder_type: conformer # encoder type
|
||||||
|
decoder_type: conformer # decoder type
|
||||||
|
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
|
||||||
|
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
|
||||||
|
conformer_activation_type: swish # conformer activation type
|
||||||
|
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
|
||||||
|
use_cnn_in_conformer: true # whether to use CNN in conformer
|
||||||
|
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
|
||||||
|
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
|
||||||
|
init_type: xavier_uniform # initialization type
|
||||||
|
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
|
||||||
|
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
|
||||||
|
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
|
||||||
|
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
|
||||||
|
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
|
||||||
|
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
|
||||||
|
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
|
||||||
|
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
|
||||||
|
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
|
||||||
|
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
|
||||||
|
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
|
||||||
|
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
|
||||||
|
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
|
||||||
|
energy_predictor_layers: 2 # number of conv layers in energy predictor
|
||||||
|
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
|
||||||
|
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
|
||||||
|
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
|
||||||
|
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
|
||||||
|
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
|
||||||
|
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# UPDATER SETTING #
|
||||||
|
###########################################################
|
||||||
|
updater:
|
||||||
|
use_masking: True # whether to apply masking for padded part in loss calculation
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OPTIMIZER SETTING #
|
||||||
|
###########################################################
|
||||||
|
optimizer:
|
||||||
|
optim: adam # optimizer type
|
||||||
|
learning_rate: 0.001 # learning rate
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# TRAINING SETTING #
|
||||||
|
###########################################################
|
||||||
|
max_epoch: 1000
|
||||||
|
num_snapshots: 5
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
seed: 10086
|
@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
# Run export_model.py on the checkpoint found in a checkpoint directory.
#
# Usage: export.sh CKPT_DIR OUTPUT_DIR
#   CKPT_DIR    directory containing model.pdparams
#   OUTPUT_DIR  value passed to --output_dir
# Environment: BIN_DIR must point at the directory holding export_model.py.

ckpt_dir=${1:?usage: export.sh CKPT_DIR OUTPUT_DIR}
output_dir=${2:?usage: export.sh CKPT_DIR OUTPUT_DIR}

# Every expansion is quoted so paths with spaces or glob characters survive.
python3 "${BIN_DIR:?BIN_DIR is not set}/export_model.py" \
    --checkpoint "${ckpt_dir}/model.pdparams" \
    --output_dir "${output_dir}"
|
@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/bash
# Run predict.py on one audio file using the checkpoint in CKPT_DIR.
#
# Usage: infer.sh AUDIO_FILE CKPT_DIR FEAT_BACKEND
#   AUDIO_FILE    value passed to --wav
#   CKPT_DIR      directory containing model.pdparams
#   FEAT_BACKEND  value passed to --feat_backend
# Environment: BIN_DIR must point at the directory holding predict.py.

audio_file=${1:?usage: infer.sh AUDIO_FILE CKPT_DIR FEAT_BACKEND}
ckpt_dir=${2:?usage: infer.sh AUDIO_FILE CKPT_DIR FEAT_BACKEND}
feat_backend=${3:?usage: infer.sh AUDIO_FILE CKPT_DIR FEAT_BACKEND}

# Every expansion is quoted so paths with spaces or glob characters survive.
python3 "${BIN_DIR:?BIN_DIR is not set}/predict.py" \
    --wav "${audio_file}" \
    --feat_backend "${feat_backend}" \
    --top_k 10 \
    --checkpoint "${ckpt_dir}/model.pdparams"
|
@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/bash
# Run deploy/predict.py on one audio file with an exported model directory.
#
# Usage: static_model_infer.sh DEVICE MODEL_DIR AUDIO_FILE
#   DEVICE      value passed to --device
#   MODEL_DIR   value passed to --model_dir
#   AUDIO_FILE  value passed to --wav
# Environment: BIN_DIR must point at the directory holding deploy/predict.py.

device=${1:?usage: static_model_infer.sh DEVICE MODEL_DIR AUDIO_FILE}
model_dir=${2:?usage: static_model_infer.sh DEVICE MODEL_DIR AUDIO_FILE}
audio_file=${3:?usage: static_model_infer.sh DEVICE MODEL_DIR AUDIO_FILE}

# Every expansion is quoted so paths with spaces or glob characters survive.
python3 "${BIN_DIR:?BIN_DIR is not set}/deploy/predict.py" \
    --device "${device}" \
    --model_dir "${model_dir}" \
    --wav "${audio_file}"
|
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
# Launch train.py, through paddle.distributed.launch when NGPU > 0 and
# directly otherwise.
#
# Usage: train.sh NGPU FEAT_BACKEND
#   NGPU          GPU count; > 0 selects the distributed launcher
#   FEAT_BACKEND  value passed to --feat_backend
# Environment: BIN_DIR must point at the directory holding train.py;
#   CUDA_VISIBLE_DEVICES is passed to --gpus when NGPU > 0.

ngpu=${1:?usage: train.sh NGPU FEAT_BACKEND}
feat_backend=${2:?usage: train.sh NGPU FEAT_BACKEND}

num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10

train_script="${BIN_DIR:?BIN_DIR is not set}/train.py"

# One argument list for both launch modes, so they cannot drift apart.
train_args=(
    --epochs "${num_epochs}"
    --feat_backend "${feat_backend}"
    --batch_size "${batch_size}"
    --checkpoint_dir "${ckpt_dir}"
    --save_freq "${save_freq}"
)

if [ "${ngpu}" -gt 0 ]; then
    python3 -m paddle.distributed.launch --gpus "${CUDA_VISIBLE_DEVICES}" \
        "${train_script}" "${train_args[@]}"
else
    python3 "${train_script}" "${train_args[@]}"
fi
|
@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Environment setup for this example.
# Exports MAIN_ROOT (three directories above the current working directory),
# extends PATH and PYTHONPATH with it, and exports BIN_DIR for MODEL.

# Assign first, export second: `export VAR=$(cmd)` would hide a realpath
# failure behind export's own zero exit status.
MAIN_ROOT=$(realpath "${PWD}/../../../")
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"

MODEL=panns
export BIN_DIR="${MAIN_ROOT}/paddlespeech/cls/exps/${MODEL}"
|
@ -0,0 +1,33 @@
|
|||||||
|
#!/bin/bash
# Example driver: runs the first stage whose number is >= STAGE, then exits.
#   1: train   2: infer   3: export   4: static-model infer
#
# Usage: run.sh STAGE
set -e
source path.sh

# GPU count = number of comma-separated entries in CUDA_VISIBLE_DEVICES
# (awk prints 0 for an empty line, i.e. when the variable is unset or empty).
ngpu=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

# Fail loudly on a missing stage; previously every test below errored out
# and the script exited 0 without running anything.
stage=${1:?usage: run.sh STAGE}
stop_stage=100
feat_backend=numpy
audio_file=~/cat.wav
ckpt_dir=./checkpoint/epoch_50
output_dir=./export
infer_device=cpu

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    ./local/train.sh "${ngpu}" "${feat_backend}" || exit -1
    exit 0
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    ./local/infer.sh "${audio_file}" "${ckpt_dir}" "${feat_backend}" || exit -1
    exit 0
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    ./local/export.sh "${ckpt_dir}" "${output_dir}" || exit -1
    exit 0
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
    ./local/static_model_infer.sh "${infer_device}" "${output_dir}" "${audio_file}" || exit -1
    exit 0
fi
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue