Merge remote-tracking branch 'upstream/develop' into develop

pull/969/head
TianYuan 3 years ago
commit 4d6f1646d4

.gitignore vendored

@ -11,6 +11,10 @@
*.npz
*.done
*.whl
*.egg-info
build
docs/build/
tools/venv
tools/kenlm
@ -20,5 +24,7 @@ tools/montreal-forced-aligner/
tools/Montreal-Forced-Aligner/
tools/sctk
tools/sctk-20159b5/
tools/kaldi
tools/OpenBLAS/
*output/

@ -39,12 +39,30 @@ pull_request_rules:
actions:
label:
remove: ["conflicts"]
- name: "auto add label=enhancement"
- name: "auto add label=S2T"
conditions:
- files~=^deepspeech/
- files~=^paddlespeech/s2t/
actions:
label:
add: ["enhancement"]
add: ["S2T"]
- name: "auto add label=T2S"
conditions:
- files~=^paddlespeech/t2s/
actions:
label:
add: ["T2S"]
- name: "auto add label=Audio"
conditions:
- files~=^paddleaudio/
actions:
label:
add: ["Audio"]
- name: "auto add label=TextProcess"
conditions:
- files~=^paddlespeech/text/
actions:
label:
add: ["TextProcess"]
- name: "auto add label=Example"
conditions:
- files~=^examples/

@ -7,7 +7,7 @@ version: 2
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/src/conf.py
configuration: docs/source/conf.py
# Build documentation with MkDocs
#mkdocs:
@ -20,11 +20,6 @@ formats: []
python:
version: 3.7
install:
- method: pip
path: .
extra_requirements:
- doc
- requirements: docs/requirements.txt

@ -1,37 +0,0 @@
#!/bin/bash
setup_env(){
cd tools && make && cd -
}
install(){
if [ -f "setup.sh" ]; then
bash setup.sh
#export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
fi
if [ $? != 0 ]; then
exit 1
fi
}
print_env(){
cat /etc/lsb-release
gcc -v
g++ -v
}
abort(){
echo "Run install failed" 1>&2
echo "Please check your code" 1>&2
exit 1
}
trap 'abort' 0
set -e
print_env
setup_env
source tools/venv/bin/activate
install
trap : 0

@ -1,23 +0,0 @@
#!/bin/bash
function abort(){
echo "Your commit not fit PaddlePaddle code style" 1>&2
echo "Please use pre-commit scripts to auto-format your code" 1>&2
exit 1
}
trap 'abort' 0
set -e
source tools/venv/bin/activate
python3 --version
if ! pre-commit run -a ; then
ls -lh
git diff --exit-code
exit 1
fi
trap : 0

@ -1,54 +0,0 @@
#!/bin/bash
abort(){
echo "Run unittest failed" 1>&2
echo "Please check your code" 1>&2
exit 1
}
unittest(){
cd $1 > /dev/null
if [ -f "setup.sh" ]; then
bash setup.sh
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
fi
if [ $? != 0 ]; then
exit 1
fi
find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
xargs -0 -I{} -n1 bash -c \
'python3 -m unittest discover -v -s {}'
cd - > /dev/null
}
coverage(){
cd $1 > /dev/null
if [ -f "setup.sh" ]; then
bash setup.sh
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
fi
if [ $? != 0 ]; then
exit 1
fi
find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
xargs -0 -I{} -n1 bash -c \
'python3 -m coverage run --branch {}'
python3 -m coverage report -m
python3 -m coverage html
cd - > /dev/null
}
trap 'abort' 0
set -e
source tools/venv/bin/activate
#pip3 install pytest
#unittest .
coverage .
trap : 0

@ -10,7 +10,7 @@ English | [简体中文](README_ch.md)
<h3>
<a href="#quick-start"> Quick Start </a>
| <a href="#tutorials"> Tutorials </a>
| <a href="#model-list"> Models List </a>
| <a href="#models-list"> Models List </a>
</div>
------------------------------------------------------------------------------------
@ -57,7 +57,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
</table>
</div>
##### Text-To-Speech
<div align = "center">
<table style="width:100%">
@ -119,7 +119,7 @@ If you want to set up PaddleSpeech in other environment, please see the [install
Developers can have a try of our model with only a few lines of code.
A tiny DeepSpeech2 *Speech-To-Text* model training on toy set of LibriSpeech:
A tiny DeepSpeech2 **Speech-To-Text** model training on a toy set of LibriSpeech:
```shell
cd examples/tiny/s0/
@ -131,10 +131,7 @@ bash local/data.sh
bash local/test.sh conf/deepspeech2.yaml ckptfile offline
```
For *Text-To-Speech*, try FastSpeech2 on LJSpeech:
- Download LJSpeech-1.1 from the [ljspeech official website](https://keithito.com/LJ-Speech-Dataset/), our prepared durations for fastspeech2 [ljspeech_alignment](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz).
- The pretrained models are seperated into two parts: [fastspeech2_nosil_ljspeech_ckpt](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip) and [pwg_ljspeech_ckpt](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip). Please download then unzip to `./model/fastspeech2` and `./model/pwg` respectively.
- Assume your path to the dataset is `~/datasets/LJSpeech-1.1` and `./ljspeech_alignment` accordingly, preprocess your data and then use our pretrained model to synthesize:
For **Text-To-Speech**, try pretrained FastSpeech2 + Parallel WaveGAN on CSMSC:
```shell
cd examples/csmsc/tts3
# download the pretrained models and unzip them
@ -161,7 +158,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
```
If you want to try more functions like training and tuning, please see [Speech-To-Text getting started](./docs/source/asr/getting_started.md) and [Text-To-Speech Basic Use](./docs/source/tts/basic_usage.md).
If you want to try more functions like training and tuning, please see [Speech-To-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-To-Speech Quick Start](./docs/source/tts/quick_start.md).
# Models List
@ -240,7 +237,7 @@ PaddleSpeech Text-To-Speech mainly contains three modules: *Text Frontend*, *Aco
<td> Text Frontend</td>
<td colspan="2"> &emsp; </td>
<td>
<a href = "./examples/other/text_frontend">chinese-fronted</a>
<a href = "./examples/other/tn">tn</a> / <a href = "./examples/other/g2p">g2p</a>
</td>
</tr>
<tr>
@ -283,7 +280,7 @@ PaddleSpeech Text-To-Speech mainly contains three modules: *Text Frontend*, *Aco
<td >Parallel WaveGAN</td>
<td >LJSpeech / VCTK / CSMSC</td>
<td>
<a href = "./examples/ljspeech/voc1">PWGAN-ljspeech</a> / <a href = "./examples/vctk/voc1">PWGAN-vctk</a> / <a href = "./examples/csmsc/voc1">PWGAN-csmsc</a>
</td>
</tr>
<tr>
@ -305,11 +302,10 @@ PaddleSpeech Text-To-Speech mainly contains three modules: *Text Frontend*, *Aco
</tbody>
</table>
# Tutorials
Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an overview of the hot academic topics in speech. To focus on the tasks in PaddleSpeech, you will find the following guidelines helpful for grasping the core ideas.
- [Overview](./docs/source/introduction.md)
- Quick Start
- [Dependencies](./docs/source/dependencies.md) and [Installation](./docs/source/install.md)
- [Quick Start of Speech-To-Text](./docs/source/asr/quick_start.md)
@ -327,8 +323,12 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech) gives you an ove
- [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) and [PaddleSpeech VS. Espnet](https://paddlespeech.readthedocs.io/en/latest/tts/demo_2.html)
- [Released Models](./docs/source/released_model.md)
# License and Acknowledgement
The TTS module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with DeepSpeech. If you are interested in academic research about this function, please see [TTS research overview](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://paddleparakeet.readthedocs.io/en/latest/released_models.html) is a good guideline for the pipeline components.
## FAQ and Contributing
You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/DeepSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/DeepSpeech/issues)! Also, we highly appreciate it if you would like to contribute to this project!
# License and Acknowledgement
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.

audio/.gitignore vendored

@ -0,0 +1,7 @@
.ipynb_checkpoints/**
*.ipynb
nohup.out
__pycache__/
*.wav
*.m4a
obsolete/**

@ -0,0 +1,45 @@
repos:
- repo: local
hooks:
- id: yapf
name: yapf
entry: yapf
language: system
args: [-i, --style .style.yapf]
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.8.0
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]
- id: isort
name: isort (pyi)
types: [pyi]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
args:
- --count
- --select=E9,F63,F7,F82
- --show-source
- --statistics
files: \.py$

@ -1,4 +1,3 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

@ -0,0 +1,37 @@
# PaddleAudio: The audio library for PaddlePaddle
## Introduction
PaddleAudio is the audio toolkit to speed up your audio research and development loop in PaddlePaddle. It currently provides a collection of audio datasets, feature-extraction functions, audio transforms, and state-of-the-art pre-trained models for sound tagging/classification and anomaly sound detection. More models and features are on the roadmap.
## Features
- Spectrogram and related features are compatible with librosa.
- State-of-the-art models for sound tagging on Audioset, sound classification on ESC-50, and more to come.
- Ready-to-use audio embeddings with a single line of code; more embedding types are on the roadmap.
- Data loading support for common open-source audio datasets in multiple languages, including English and Mandarin.
## Install
```
git clone https://github.com/PaddlePaddle/models
cd models/PaddleAudio
pip install .
```
## Quick start
### Audio loading and feature extraction
```python
import paddleaudio as pa

s, r = pa.load(f)                      # f: path to a local audio file
mel_spect = pa.melspectrogram(s, sr=r)
```
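The returned objects are plain NumPy arrays, so standard NumPy post-processing applies directly. Below is a minimal follow-up sketch (it assumes `mel_spect` holds power values, as produced by the snippet above, and uses NumPy only):
```python
import numpy as np

print(s.dtype, r)            # waveform dtype and sample rate
print(mel_spect.shape)       # feature layout (mel bins x frames, or transposed, depending on settings)
log_mel = 10.0 * np.log10(np.maximum(mel_spect, 1e-10))  # convert the power spectrogram to dB
```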
### Examples
We provide a set of examples to help you get started with PaddleAudio quickly.
- [PANNs: acoustic scene and events analysis using pre-trained models](./examples/panns)
- [Environmental Sound classification on ESC-50 dataset](./examples/sound_classification)
- [Training an audio-tagging network on Audioset](./examples/audioset_training)
Please refer to [example directory](./examples) for more details.

@ -0,0 +1,128 @@
# Audio Tagging
Sound classification is a single-label classification task, but a given audio clip can carry multiple labels. For example, a recording made in an ordinary indoor office may contain people speaking, keyboard typing, mouse clicking, and various other background sounds. For general-purpose sound recognition and sound detection, predicting multiple labels for a single audio clip is therefore highly practical.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips taken from YouTube videos. It now contains about 2.1 million annotated videos, 5,800 hours of audio, and 527 labeled sound classes.
`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Because their pretraining task is multi-label sound recognition, they can be used for real-time audio tagging.
This example uses a `PANNs` pretrained model to tag the input audio in real time against the Audioset label set, and outputs, as text, the top-k classes and their scores at each time step.
## Model Introduction
PaddleAudio provides the PANNs pretrained models CNN14, CNN10 and CNN6 for users to choose from:
- CNN14: 12 convolutional layers and 2 fully connected layers; 79.6M parameters; embedding size 2048.
- CNN10: 8 convolutional layers and 2 fully connected layers; 4.9M parameters; embedding size 512.
- CNN6: 4 convolutional layers and 2 fully connected layers; 4.5M parameters; embedding size 512.
## Quick Start
### Model Inference
```shell
export CUDA_VISIBLE_DEVICES=0
python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
```
Configurable arguments:
- `device`: Device used for inference, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the `gpus` argument selects the card id.
- `wav`: Audio file to run inference on.
- `sample_duration`: Length (in seconds) of each audio window the model predicts on; defaults to 2 s (see the sketch below).
- `hop_duration`: Time interval (in seconds) between two consecutive prediction windows; defaults to 0.3 s (see the sketch below).
- `output_dir`: Directory where the prediction results are stored; defaults to `./output_dir`.
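To make the two durations concrete, the sketch below shows how they translate into sample counts and window start positions; it mirrors the `split()` helper in `audio_tag.py` shown later in this diff. The sample rate and clip length are assumptions for illustration only.
```python
sr = 32000                  # assumed sample rate of the input clip
win_size = int(2.0 * sr)    # --sample_duration: 2 s window, in samples
hop_size = int(0.3 * sr)    # --hop_duration: 0.3 s hop between windows, in samples

num_samples = 10 * sr       # e.g. a 10-second clip
# Window start positions as a fraction of the clip length; this is exactly the
# "time" value written to the .npz and .txt results.
starts = [i / num_samples for i in range(0, num_samples, hop_size)]
```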
The example code uses the `CNN14` pretrained model. To switch to another pretrained model, use one of the following:
```python
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
model = cnn14(pretrained=True, extract_embedding=False)
# CNN10
model = cnn10(pretrained=True, extract_embedding=False)
# CNN6
model = cnn6(pretrained=True, extract_embedding=False)
```
Sample output:
```
[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
```
The resulting scores are saved in an `.npz` file under `output_dir`.
### Generating the tagging label text
```shell
python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
```
Configurable arguments:
- `tagging_file`: The tagging result file (`.npz`) produced by the previous step.
- `top_k`: Keep the k highest-scoring labels of each prediction; defaults to 10.
- `smooth`: Whether to apply posterior smoothing to the predictions; defaults to `True`.
- `smooth_size`: Number of samples (window size) used for the smoothing; defaults to 5.
- `label_file`: Text file with the Audioset class names corresponding to the predictions.
- `output_dir`: Directory where the label text is stored; defaults to `./output_dir`.
Sample output:
```
[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
```
The resulting text is saved in a `.txt` file under `output_dir`.
## Tagging label text
The final text output looks as follows.
The top-k results for each time window of the sample are separated by blank lines. Within each block, the first line carries the time information: the number is the ratio of the window start time `t` to the total audio length `T` (converted back to seconds in the sketch after the sample output); the following k lines are the corresponding labels and scores.
```
0.0
Cat: 0.9144676923751831
Animal: 0.8855036497116089
Domestic animals, pets: 0.804577112197876
Meow: 0.7422927021980286
Music: 0.19959309697151184
Inside, small room: 0.12550437450408936
Caterwaul: 0.021584441885352135
Purr: 0.020247288048267365
Speech: 0.018197158351540565
Vehicle: 0.007446660194545984
0.059197544398158296
Cat: 0.9250872135162354
Animal: 0.8957151174545288
Domestic animals, pets: 0.8228275775909424
Meow: 0.7650775909423828
Music: 0.20210561156272888
Inside, small room: 0.12290887534618378
Caterwaul: 0.029371455311775208
Purr: 0.018731823191046715
Speech: 0.017130598425865173
Vehicle: 0.007748497650027275
0.11839508879631659
Cat: 0.9336574673652649
Animal: 0.9111202359199524
Domestic animals, pets: 0.8349071145057678
Meow: 0.7761964797973633
Music: 0.20467285811901093
Inside, small room: 0.10709915310144424
Caterwaul: 0.05370649695396423
Purr: 0.018830426037311554
Speech: 0.017361722886562347
Vehicle: 0.006929398979991674
...
...
```
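Since the leading number of each block is the ratio `t`/`T`, converting it back to seconds only requires the clip duration. A minimal sketch (the file name and the ratio value are taken from the example above):
```python
import soundfile as sf

info = sf.info('cat.wav')                              # any means of getting the clip duration works
total_seconds = info.frames / info.samplerate
start_seconds = 0.11839508879631659 * total_seconds    # start time of the third block above
```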
The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label predictions made on the audio in real time.
![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)

@ -0,0 +1,527 @@
Speech
Male speech, man speaking
Female speech, woman speaking
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Battle cry
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
Baby cry, infant cry
Whimper
Wail, moan
Sigh
Singing
Choir
Yodeling
Chant
Mantra
Male singing
Female singing
Child singing
Synthetic singing
Rapping
Humming
Groan
Grunt
Whistling
Breathing
Wheeze
Snoring
Gasp
Pant
Snort
Cough
Throat clearing
Sneeze
Sniff
Run
Shuffle
Walk, footsteps
Chewing, mastication
Biting
Gargling
Stomach rumble
Burping, eructation
Hiccup
Fart
Hands
Finger snapping
Clapping
Heart sounds, heartbeat
Heart murmur
Cheering
Applause
Chatter
Crowd
Hubbub, speech noise, speech babble
Children playing
Animal
Domestic animals, pets
Dog
Bark
Yip
Howl
Bow-wow
Growling
Whimper (dog)
Cat
Purr
Meow
Hiss
Caterwaul
Livestock, farm animals, working animals
Horse
Clip-clop
Neigh, whinny
Cattle, bovinae
Moo
Cowbell
Pig
Oink
Goat
Bleat
Sheep
Fowl
Chicken, rooster
Cluck
Crowing, cock-a-doodle-doo
Turkey
Gobble
Duck
Quack
Goose
Honk
Wild animals
Roaring cats (lions, tigers)
Roar
Bird
Bird vocalization, bird call, bird song
Chirp, tweet
Squawk
Pigeon, dove
Coo
Crow
Caw
Owl
Hoot
Bird flight, flapping wings
Canidae, dogs, wolves
Rodents, rats, mice
Mouse
Patter
Insect
Cricket
Mosquito
Fly, housefly
Buzz
Bee, wasp, etc.
Frog
Croak
Snake
Rattle
Whale vocalization
Music
Musical instrument
Plucked string instrument
Guitar
Electric guitar
Bass guitar
Acoustic guitar
Steel guitar, slide guitar
Tapping (guitar technique)
Strum
Banjo
Sitar
Mandolin
Zither
Ukulele
Keyboard (musical)
Piano
Electric piano
Organ
Electronic organ
Hammond organ
Synthesizer
Sampler
Harpsichord
Percussion
Drum kit
Drum machine
Drum
Snare drum
Rimshot
Drum roll
Bass drum
Timpani
Tabla
Cymbal
Hi-hat
Wood block
Tambourine
Rattle (instrument)
Maraca
Gong
Tubular bells
Mallet percussion
Marimba, xylophone
Glockenspiel
Vibraphone
Steelpan
Orchestra
Brass instrument
French horn
Trumpet
Trombone
Bowed string instrument
String section
Violin, fiddle
Pizzicato
Cello
Double bass
Wind instrument, woodwind instrument
Flute
Saxophone
Clarinet
Harp
Bell
Church bell
Jingle bell
Bicycle bell
Tuning fork
Chime
Wind chime
Change ringing (campanology)
Harmonica
Accordion
Bagpipes
Didgeridoo
Shofar
Theremin
Singing bowl
Scratching (performance technique)
Pop music
Hip hop music
Beatboxing
Rock music
Heavy metal
Punk rock
Grunge
Progressive rock
Rock and roll
Psychedelic rock
Rhythm and blues
Soul music
Reggae
Country
Swing music
Bluegrass
Funk
Folk music
Middle Eastern music
Jazz
Disco
Classical music
Opera
Electronic music
House music
Techno
Dubstep
Drum and bass
Electronica
Electronic dance music
Ambient music
Trance music
Music of Latin America
Salsa music
Flamenco
Blues
Music for children
New-age music
Vocal music
A capella
Music of Africa
Afrobeat
Christian music
Gospel music
Music of Asia
Carnatic music
Music of Bollywood
Ska
Traditional music
Independent music
Song
Background music
Theme music
Jingle (music)
Soundtrack music
Lullaby
Video game music
Christmas music
Dance music
Wedding music
Happy music
Funny music
Sad music
Tender music
Exciting music
Angry music
Scary music
Wind
Rustling leaves
Wind noise (microphone)
Thunderstorm
Thunder
Water
Rain
Raindrop
Rain on surface
Stream
Waterfall
Ocean
Waves, surf
Steam
Gurgling
Fire
Crackle
Vehicle
Boat, Water vehicle
Sailboat, sailing ship
Rowboat, canoe, kayak
Motorboat, speedboat
Ship
Motor vehicle (road)
Car
Vehicle horn, car horn, honking
Toot
Car alarm
Power windows, electric windows
Skidding
Tire squeal
Car passing by
Race car, auto racing
Truck
Air brake
Air horn, truck horn
Reversing beeps
Ice cream truck, ice cream van
Bus
Emergency vehicle
Police car (siren)
Ambulance (siren)
Fire engine, fire truck (siren)
Motorcycle
Traffic noise, roadway noise
Rail transport
Train
Train whistle
Train horn
Railroad car, train wagon
Train wheels squealing
Subway, metro, underground
Aircraft
Aircraft engine
Jet engine
Propeller, airscrew
Helicopter
Fixed-wing aircraft, airplane
Bicycle
Skateboard
Engine
Light engine (high frequency)
Dental drill, dentist's drill
Lawn mower
Chainsaw
Medium engine (mid frequency)
Heavy engine (low frequency)
Engine knocking
Engine starting
Idling
Accelerating, revving, vroom
Door
Doorbell
Ding-dong
Sliding door
Slam
Knock
Tap
Squeak
Cupboard open or close
Drawer open or close
Dishes, pots, and pans
Cutlery, silverware
Chopping (food)
Frying (food)
Microwave oven
Blender
Water tap, faucet
Sink (filling or washing)
Bathtub (filling or washing)
Hair dryer
Toilet flush
Toothbrush
Electric toothbrush
Vacuum cleaner
Zipper (clothing)
Keys jangling
Coin (dropping)
Scissors
Electric shaver, electric razor
Shuffling cards
Typing
Typewriter
Computer keyboard
Writing
Alarm
Telephone
Telephone bell ringing
Ringtone
Telephone dialing, DTMF
Dial tone
Busy signal
Alarm clock
Siren
Civil defense siren
Buzzer
Smoke detector, smoke alarm
Fire alarm
Foghorn
Whistle
Steam whistle
Mechanisms
Ratchet, pawl
Clock
Tick
Tick-tock
Gears
Pulleys
Sewing machine
Mechanical fan
Air conditioning
Cash register
Printer
Camera
Single-lens reflex camera
Tools
Hammer
Jackhammer
Sawing
Filing (rasp)
Sanding
Power tool
Drill
Explosion
Gunshot, gunfire
Machine gun
Fusillade
Artillery fire
Cap gun
Fireworks
Firecracker
Burst, pop
Eruption
Boom
Wood
Chop
Splinter
Crack
Glass
Chink, clink
Shatter
Liquid
Splash, splatter
Slosh
Squish
Drip
Pour
Trickle, dribble
Gush
Fill (with liquid)
Spray
Pump (liquid)
Stir
Boiling
Sonar
Arrow
Whoosh, swoosh, swish
Thump, thud
Thunk
Electronic tuner
Effects unit
Chorus effect
Basketball bounce
Bang
Slap, smack
Whack, thwack
Smash, crash
Breaking
Bouncing
Whip
Flap
Scratch
Scrape
Rub
Roll
Crushing
Crumpling, crinkling
Tearing
Beep, bleep
Ping
Ding
Clang
Squeal
Creak
Rustle
Whir
Clatter
Sizzle
Clicking
Clickety-clack
Rumble
Plop
Jingle, tinkle
Hum
Zing
Boing
Crunch
Silence
Sine wave
Harmonic
Chirp tone
Sound effect
Pulse
Inside, small room
Inside, large room or hall
Inside, public space
Outside, urban or manmade
Outside, rural or natural
Reverberation
Echo
Noise
Environmental noise
Static
Mains hum
Distortion
Sidetone
Cacophony
White noise
Pink noise
Throbbing
Vibration
Television
Radio
Field recording

@ -0,0 +1,111 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
args = parser.parse_args()
# yapf: enable
def split(waveform: np.ndarray, win_size: int, hop_size: int):
"""
Split into N waveforms.
N is decided by win_size and hop_size.
"""
assert isinstance(waveform, np.ndarray)
time = []
data = []
for i in range(0, len(waveform), hop_size):
segment = waveform[i:i + win_size]
if len(segment) < win_size:
segment = np.pad(segment, (0, win_size - len(segment)))
data.append(segment)
time.append(i / len(waveform))
return time, data
def batchify(data: List[List[float]],
sample_rate: int,
batch_size: int,
**kwargs):
"""
Extract features from waveforms and create batches.
"""
examples = []
for waveform in data:
feats = melspectrogram(waveform, sample_rate, **kwargs).transpose()
examples.append(feats)
# Separates data into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield one_batch
one_batch = []
if one_batch:
yield one_batch
def predict(model, data: List[List[float]], sample_rate: int,
batch_size: int=1):
"""
Use pretrained model to make predictions.
"""
batches = batchify(data, sample_rate, batch_size)
results = None
model.eval()
for batch in batches:
# (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
feats = paddle.to_tensor(batch).unsqueeze(1)
audioset_scores = model(feats)
if results is None:
results = audioset_scores.numpy()
else:
results = np.concatenate((results, audioset_scores.numpy()))
return results
if __name__ == '__main__':
paddle.set_device(args.device)
model = cnn14(pretrained=True, extract_embedding=False)
waveform, sr = load_audio(args.wav, sr=None)
time, data = split(waveform,
int(args.sample_duration * sr),
int(args.hop_duration * sr))
results = predict(model, data, sr, batch_size=8)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform))
output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
np.savez(output_file, time=time, scores=results)
logger.info(f'Saved tagging results to {output_file}')

@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import Dict
import numpy as np
from paddleaudio.utils import logger
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--tagging_file', type=str, required=True, help='Tagging result file (*.npz) produced by audio_tag.py.')
parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
args = parser.parse_args()
# yapf: enable
def smooth(results: np.ndarray, win_size: int):
"""
Execute posterior smoothing in-place.
"""
for i in range(len(results) - 1, -1, -1):
if i < win_size - 1:
left = 0
else:
left = i + 1 - win_size
results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
"""
Return top k result.
"""
result = np.asarray(result)
topk_idx = (-result).argsort()[:k]
ret = ''
for idx in topk_idx:
label, score = label_map[idx], result[idx]
ret += f'{label}: {score}\n'
return ret
if __name__ == "__main__":
label_map = {}
with open(args.label_file, 'r') as f:
for i, l in enumerate(f.readlines()):
label_map[i] = l.strip()
results = np.load(args.tagging_file, allow_pickle=True)
times, scores = results['time'], results['scores']
if args.smooth:
logger.info('Posterior smoothing...')
smooth(scores, win_size=args.smooth_size)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(
args.output_dir,
os.path.basename(args.tagging_file).split('.')[0] + '.txt')
with open(output_file, 'w') as f:
for time, score in zip(times, scores):
f.write(f'{time}\n')
f.write(generate_topk_label(args.top_k, label_map, score) + '\n')
logger.info(f'Saved tagging labels to {output_file}')

@ -0,0 +1,116 @@
# Sound Classification
Sound classification and detection is an active research direction in audio algorithms.
For sound classification, a common traditional machine-learning approach is to hand-craft a variety of time-domain and frequency-domain audio features, apply feature selection, combination and transformation, and then classify with an SVM or a decision tree. End-to-end deep learning instead uses deep networks such as RNNs and CNNs to perform representation learning and classification directly on the waveform or on time-frequency features.
At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/). The dataset covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips taken from YouTube videos. It now contains about 2.1 million annotated videos, 5,800 hours of audio, and 527 labeled sound classes.
`PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. After pretraining, the models can be used to extract audio embeddings. This example fine-tunes a `PANNs` pretrained model for sound classification (a minimal sketch of the idea follows the model list below).
## Model Introduction
PaddleAudio provides the PANNs pretrained models CNN14, CNN10 and CNN6 for users to choose from:
- CNN14: 12 convolutional layers and 2 fully connected layers; 79.6M parameters; embedding size 2048.
- CNN10: 8 convolutional layers and 2 fully connected layers; 4.9M parameters; embedding size 512.
- CNN6: 4 convolutional layers and 2 fully connected layers; 4.5M parameters; embedding size 512.
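Conceptually, fine-tuning just attaches a small classification head to the PANNs embedding; this is what the `SoundClassifier` wrapper shown later in this diff does. A minimal sketch (the class count is ESC-50's 50 classes):
```python
import paddle.nn as nn
from paddleaudio.models.panns import cnn14

backbone = cnn14(pretrained=True, extract_embedding=True)  # CNN14 emits a 2048-dim embedding
head = nn.Linear(backbone.emb_size, 50)                     # ESC-50 has 50 classes
```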
## Quick Start
### Model Training
Using the environmental sound classification dataset `ESC50` as an example, the commands below fine-tune the model on its training set; single-GPU and multi-GPU training on a single machine are both supported. For how to launch multi-GPU training with `paddle.distributed.launch`, see [single-machine multi-GPU training](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html).
Single-GPU training:
```shell
$ python train.py --epochs 50 --batch_size 16 --checkpoint_dir ./checkpoint --save_freq 10
```
Multi-GPU training:
```shell
$ unset CUDA_VISIBLE_DEVICES
$ python -m paddle.distributed.launch --gpus "0,1" train.py --epochs 50 --batch_size 16 --num_worker 4 --checkpoint_dir ./checkpoint --save_freq 10
```
Configurable arguments:
- `device`: Device used for training, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the `gpus` argument selects the card ids.
- `epochs`: Number of training epochs; defaults to 50.
- `learning_rate`: Learning rate for fine-tuning; defaults to 5e-5.
- `batch_size`: Batch size; adjust it to your GPU memory and lower it if you run out of memory; defaults to 16.
- `num_workers`: Number of subprocesses the DataLoader uses to fetch data; defaults to 0, i.e. data loading runs in the main process.
- `checkpoint_dir`: Directory where model and optimizer checkpoints are saved; defaults to `./checkpoint`.
- `save_freq`: How often (in epochs) checkpoints are saved during training; defaults to 10.
- `log_freq`: How often (in steps) training information is printed; defaults to 10.
The example code uses the `CNN14` pretrained model. To switch to another pretrained model, use one of the following:
```python
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14, cnn10, cnn6
# CNN14
backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
# CNN10
backbone = cnn10(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
# CNN6
backbone = cnn6(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
```
### Model Inference
```shell
python -u predict.py --wav ./dog.wav --top_k 3 --checkpoint ./checkpoint/epoch_50/model.pdparams
```
Configurable arguments:
- `device`: Device used for inference, `cpu` or `gpu`; defaults to `gpu`. When using GPU, the `gpus` argument selects the card id.
- `wav`: Audio file to run inference on.
- `top_k`: Show the scores of the top-k predicted labels; defaults to 1.
- `checkpoint`: Model parameter checkpoint file.
The prediction output looks as follows:
```
[/audio/dog.wav]
Dog: 0.9999538660049438
Clock tick: 1.3341237718123011e-05
Cat: 6.579841738130199e-06
```
### Model Deployment
#### 1. Dynamic-to-static export
After training, the saved dynamic-graph parameters can be exported as a static-graph model plus parameters, which can then be deployed in static-graph mode.
```shell
python -u export_model.py --checkpoint ./checkpoint/epoch_50/model.pdparams --output_dir ./export
```
Configurable arguments:
- `checkpoint`: Model parameter checkpoint file.
- `output_dir`: Directory where the exported static-graph model and parameter files are saved.
The exported static-graph model and parameter files are:
```sh
$ tree export
export
├── inference.pdiparams
├── inference.pdiparams.info
└── inference.pdmodel
```
#### 2. Deployment and inference
The `deploy/python/predict.py` script uses APIs from the `paddle.inference` module and provides an example of Python-side deployment:
```sh
python deploy/python/predict.py --model_dir ./export --device gpu
```

@ -0,0 +1,146 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import numpy as np
from paddle import inference
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
args = parser.parse_args()
# yapf: enable
def extract_features(files: str, **kwargs):
waveforms = []
srs = []
max_length = float('-inf')
for file in files:
waveform, sr = load_audio(file, sr=None)
max_length = max(max_length, len(waveform))
waveforms.append(waveform)
srs.append(sr)
feats = []
for i in range(len(waveforms)):
# padding
if len(waveforms[i]) < max_length:
pad_width = max_length - len(waveforms[i])
waveforms[i] = np.pad(waveforms[i], pad_width=(0, pad_width))
feat = melspectrogram(waveforms[i], sr, **kwargs).transpose()
feats.append(feat)
return np.stack(feats, axis=0)
class Predictor(object):
def __init__(self,
model_dir,
device="gpu",
batch_size=1,
use_tensorrt=False,
precision="fp32",
cpu_threads=10,
enable_mkldnn=False):
self.batch_size = batch_size
model_file = os.path.join(model_dir, "inference.pdmodel")
params_file = os.path.join(model_dir, "inference.pdiparams")
assert os.path.isfile(model_file) and os.path.isfile(
params_file), 'Please check model and parameter files.'
config = inference.Config(model_file, params_file)
if device == "gpu":
# set GPU configs accordingly
# such as initializing the GPU memory and enabling TensorRT
config.enable_use_gpu(100, 0)
precision_map = {
"fp16": inference.PrecisionType.Half,
"fp32": inference.PrecisionType.Float32,
}
precision_mode = precision_map[precision]
if use_tensorrt:
config.enable_tensorrt_engine(
max_batch_size=batch_size,
min_subgraph_size=30,
precision_mode=precision_mode)
elif device == "cpu":
# set CPU configs accordingly,
# such as enable_mkldnn, set_cpu_math_library_num_threads
config.disable_gpu()
if enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
config.set_cpu_math_library_num_threads(cpu_threads)
elif device == "xpu":
# set XPU configs accordingly
config.enable_xpu(100)
config.switch_use_feed_fetch_ops(False)
self.predictor = inference.create_predictor(config)
self.input_handles = [
self.predictor.get_input_handle(name)
for name in self.predictor.get_input_names()
]
self.output_handle = self.predictor.get_output_handle(
self.predictor.get_output_names()[0])
def predict(self, wavs):
feats = extract_features(wavs)
self.input_handles[0].copy_from_cpu(feats)
self.predictor.run()
logits = self.output_handle.copy_to_cpu()
probs = softmax(logits, axis=1)
indices = np.argmax(probs, axis=1)
return indices
if __name__ == "__main__":
# Define predictor to do prediction.
predictor = Predictor(args.model_dir, args.device, args.batch_size,
args.use_tensorrt, args.precision, args.cpu_threads,
args.enable_mkldnn)
wavs = [
'~/audio_demo_resource/cat.wav',
'~/audio_demo_resource/dog.wav',
]
for i in range(len(wavs)):
wavs[i] = os.path.abspath(os.path.expanduser(wavs[i]))
assert os.path.isfile(
wavs[i]), f'Please check input wave file: {wavs[i]}'
results = predictor.predict(wavs)
for idx, wav in enumerate(wavs):
print(f'Wav: {wav} \t Label: {ESC50.label_list[results[idx]]}')

@ -0,0 +1,44 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.")
args = parser.parse_args()
# yapf: enable
if __name__ == '__main__':
model = SoundClassifier(
backbone=cnn14(pretrained=False, extract_embedding=True),
num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
model = paddle.jit.to_static(
model,
input_spec=[
paddle.static.InputSpec(
shape=[None, None, 64], dtype=paddle.float32)
])
# Save in static graph model.
paddle.jit.save(model, os.path.join(args.output_dir, "inference"))

@ -0,0 +1,36 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
class SoundClassifier(nn.Layer):
"""
Model for sound classification which uses panns pretrained models to extract
embeddings from audio files.
"""
def __init__(self, backbone, num_class, dropout=0.1):
super(SoundClassifier, self).__init__()
self.backbone = backbone
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(self.backbone.emb_size, num_class)
def forward(self, x):
# x: (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins)
x = x.unsqueeze(1)
x = self.backbone(x)
x = self.dropout(x)
logits = self.fc(x)
return logits
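# Example (assumed usage; mirrors train.py and predict.py elsewhere in this diff):
#   backbone = cnn14(pretrained=True, extract_embedding=True)
#   model = SoundClassifier(backbone, num_class=50)   # e.g. ESC-50
#   logits = model(feats)                             # feats: (batch_size, num_frames, num_melbins)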

@ -0,0 +1,60 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import numpy as np
import paddle
import paddle.nn.functional as F
from model import SoundClassifier
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from paddleaudio.models.panns import cnn14
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable
def extract_features(file: str, **kwargs):
waveform, sr = load_audio(file, sr=None)
feat = melspectrogram(waveform, sr, **kwargs).transpose()
return feat
if __name__ == '__main__':
paddle.set_device(args.device)
model = SoundClassifier(
backbone=cnn14(pretrained=False, extract_embedding=True),
num_class=len(ESC50.label_list))
model.set_state_dict(paddle.load(args.checkpoint))
model.eval()
feat = np.expand_dims(extract_features(args.wav), 0)
feat = paddle.to_tensor(feat)
logits = model(feat)
probs = F.softmax(logits, axis=1).numpy()
sorted_indices = (-probs[0]).argsort()
msg = f'[{args.wav}]\n'
for idx in sorted_indices[:args.top_k]:
msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
print(msg)

@ -0,0 +1,148 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import paddle
from model import SoundClassifier
from paddleaudio.datasets import ESC50
from paddleaudio.models.panns import cnn14
from paddleaudio.utils import logger
from paddleaudio.utils import Timer
# yapf: disable
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.")
parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.")
args = parser.parse_args()
# yapf: enable
if __name__ == "__main__":
paddle.set_device(args.device)
nranks = paddle.distributed.get_world_size()
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
local_rank = paddle.distributed.get_rank()
backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
model = paddle.DataParallel(model)
optimizer = paddle.optimizer.Adam(
learning_rate=args.learning_rate, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()
train_ds = ESC50(mode='train', feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
train_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
train_ds,
batch_sampler=train_sampler,
num_workers=args.num_workers,
return_list=True,
use_buffer_reader=True, )
steps_per_epoch = len(train_sampler)
timer = Timer(steps_per_epoch * args.epochs)
timer.start()
for epoch in range(1, args.epochs + 1):
model.train()
avg_loss = 0
num_corrects = 0
num_samples = 0
for batch_idx, batch in enumerate(train_loader):
feats, labels = batch
logits = model(feats)
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
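# Step the learning-rate scheduler manually if one is configured; with the
# default constant learning_rate used here, this branch is skipped.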
if isinstance(optimizer._learning_rate,
paddle.optimizer.lr.LRScheduler):
optimizer._learning_rate.step()
optimizer.clear_grad()
# Calculate loss
avg_loss += loss.numpy()[0]
# Calculate metrics
preds = paddle.argmax(logits, axis=1)
num_corrects += (preds == labels).numpy().sum()
num_samples += feats.shape[0]
timer.count()
if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0:
lr = optimizer.get_lr()
avg_loss /= args.log_freq
avg_acc = num_corrects / num_samples
print_msg = 'Epoch={}/{}, Step={}/{}'.format(
epoch, args.epochs, batch_idx + 1, steps_per_epoch)
print_msg += ' loss={:.4f}'.format(avg_loss)
print_msg += ' acc={:.4f}'.format(avg_acc)
print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
lr, timer.timing, timer.eta)
logger.train(print_msg)
avg_loss = 0
num_corrects = 0
num_samples = 0
if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
dev_sampler = paddle.io.BatchSampler(
dev_ds,
batch_size=args.batch_size,
shuffle=False,
drop_last=False)
dev_loader = paddle.io.DataLoader(
dev_ds,
batch_sampler=dev_sampler,
num_workers=args.num_workers,
return_list=True, )
model.eval()
num_corrects = 0
num_samples = 0
with logger.processing('Evaluation on validation dataset'):
for batch_idx, batch in enumerate(dev_loader):
feats, labels = batch
logits = model(feats)
preds = paddle.argmax(logits, axis=1)
num_corrects += (preds == labels).numpy().sum()
num_samples += feats.shape[0]
print_msg = '[Evaluation result]'
print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
logger.eval(print_msg)
# Save model
save_dir = os.path.join(args.checkpoint_dir,
'epoch_{}'.format(epoch))
logger.info('Saving model checkpoint to {}'.format(save_dir))
paddle.save(model.state_dict(),
os.path.join(save_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(),
os.path.join(save_dir, 'model.pdopt'))

@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .backends import *
from .features import *

@ -0,0 +1,14 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import *

@ -0,0 +1,303 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from typing import Optional
from typing import Tuple
from typing import Union
import numpy as np
import resampy
import soundfile as sf
from numpy import ndarray as array
from scipy.io import wavfile
from ..utils import ParameterError
__all__ = [
'resample',
'to_mono',
'depth_convert',
'normalize',
'save_wav',
'load',
]
NORMALMIZE_TYPES = ['linear', 'gaussian']
MERGE_TYPES = ['ch0', 'ch1', 'random', 'average']
RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast']
EPS = 1e-8
def resample(y: array, src_sr: int, target_sr: int,
mode: str='kaiser_fast') -> array:
""" Audio resampling
This function is the same as using resampy.resample().
Notes:
The default mode is kaiser_fast. For better audio quality, use mode='kaiser_best' (slower).
"""
if mode == 'kaiser_best':
warnings.warn(
f'Using resampy in kaiser_best to {src_sr}=>{target_sr}. This function is pretty slow, \
we recommend the mode kaiser_fast in large-scale audio training')
if not isinstance(y, np.ndarray):
raise ParameterError(
f'Only support numpy array, but received y of type {type(y)}')
if mode not in RESAMPLE_MODES:
raise ParameterError(f'resample mode must in {RESAMPLE_MODES}')
return resampy.resample(y, src_sr, target_sr, filter=mode)
def to_mono(y: array, merge_type: str='average') -> array:
""" convert sterior audio to mono
"""
if merge_type not in MERGE_TYPES:
raise ParameterError(
f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}'
)
if y.ndim > 2:
raise ParameterError(
f'Unsupported audio array, y.ndim > 2, the shape is {y.shape}')
if y.ndim == 1: # nothing to merge
return y
if merge_type == 'ch0':
return y[0]
if merge_type == 'ch1':
return y[1]
if merge_type == 'random':
return y[np.random.randint(0, 2)]
# need to do averaging according to dtype
if y.dtype == 'float32':
y_out = (y[0] + y[1]) * 0.5
elif y.dtype == 'int16':
y_out = y.astype('int32')
y_out = (y_out[0] + y_out[1]) // 2
y_out = np.clip(y_out, np.iinfo(y.dtype).min,
np.iinfo(y.dtype).max).astype(y.dtype)
elif y.dtype == 'int8':
y_out = y.astype('int16')
y_out = (y_out[0] + y_out[1]) // 2
y_out = np.clip(y_out, np.iinfo(y.dtype).min,
np.iinfo(y.dtype).max).astype(y.dtype)
else:
raise ParameterError(f'Unsupported dtype: {y.dtype}')
return y_out
def _safe_cast(y: array, dtype: Union[type, str]) -> array:
""" data type casting in a safe way, i.e., prevent overflow or underflow
This function is used internally.
"""
return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
def depth_convert(y: array, dtype: Union[type, str],
dithering: bool=True) -> array:
"""Convert audio array to target dtype safely
This function convert audio waveform to a target dtype, with addition steps of
preventing overflow/underflow and preserving audio range.
"""
SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64']
if y.dtype not in SUPPORT_DTYPE:
raise ParameterError(
'Unsupported audio dtype, '
f'y.dtype is {y.dtype}, supported dtypes are {SUPPORT_DTYPE}')
if dtype not in SUPPORT_DTYPE:
raise ParameterError(
'Unsupported audio dtype, '
f'target dtype is {dtype}, supported dtypes are {SUPPORT_DTYPE}')
if dtype == y.dtype:
return y
if dtype == 'float64' and y.dtype == 'float32':
return _safe_cast(y, dtype)
if dtype == 'float32' and y.dtype == 'float64':
return _safe_cast(y, dtype)
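# Integer targets: scale floats by the target integer max; for int<->int,
# rescale through a wider intermediate dtype to avoid overflow.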
if dtype == 'int16' or dtype == 'int8':
if y.dtype in ['float64', 'float32']:
factor = np.iinfo(dtype).max
y = np.clip(y * factor, np.iinfo(dtype).min,
np.iinfo(dtype).max).astype(dtype)
y = y.astype(dtype)
else:
if dtype == 'int16' and y.dtype == 'int8':
factor = np.iinfo('int16').max / np.iinfo('int8').max - EPS
y = y.astype('float32') * factor
y = y.astype('int16')
else: # dtype == 'int8' and y.dtype=='int16':
y = y.astype('int32') * np.iinfo('int8').max / \
np.iinfo('int16').max
y = y.astype('int8')
if dtype in ['float32', 'float64']:
org_dtype = y.dtype
y = y.astype(dtype) / np.iinfo(org_dtype).max
return y
def sound_file_load(file: str,
offset: Optional[float]=None,
dtype: str='int16',
duration: Optional[int]=None) -> Tuple[array, int]:
"""Load audio using soundfile library
This function load audio file using libsndfile.
Reference:
http://www.mega-nerd.com/libsndfile/#Features
"""
with sf.SoundFile(file) as sf_desc:
sr_native = sf_desc.samplerate
if offset:
sf_desc.seek(int(offset * sr_native))
if duration is not None:
frame_duration = int(duration * sr_native)
else:
frame_duration = -1
y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
return y, sf_desc.samplerate
def audio_file_load():
"""Load audio using audiofile library
This function load audio file using audiofile.
Reference:
https://audiofile.68k.org/
"""
raise NotImplementedError()
def sox_file_load():
"""Load audio using sox library
This function load audio file using sox.
Reference:
http://sox.sourceforge.net/
"""
raise NotImplementedError()
def normalize(y: array, norm_type: str='linear',
mul_factor: float=1.0) -> array:
""" normalize an input audio with additional multiplier.
"""
if norm_type == 'linear':
amax = np.max(np.abs(y))
factor = 1.0 / (amax + EPS)
y = y * factor * mul_factor
elif norm_type == 'gaussian':
amean = np.mean(y)
astd = np.std(y)
astd = max(astd, EPS)
y = mul_factor * (y - amean) / astd
else:
raise NotImplementedError(f'norm_type should be in {NORMALMIZE_TYPES}')
return y
def save_wav(y: array, sr: int, file: str) -> None:
"""Save audio file to disk.
This function saves audio to disk using scipy.io.wavfile, with additional step
to convert input waveform to int16 unless it already is int16
Notes:
It only supports the raw wav format.
"""
if not file.endswith('.wav'):
raise ParameterError(
f'only .wav file supported, but dst file name is: {file}')
if sr <= 0:
raise ParameterError(
f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
f'input data type is {y.dtype}, will convert data to int16 format before saving'
)
y_out = depth_convert(y, 'int16')
else:
y_out = y
wavfile.write(file, sr, y_out)
def load(
file: str,
sr: Optional[int]=None,
mono: bool=True,
merge_type: str='average', # ch0,ch1,random,average
normal: bool=True,
norm_type: str='linear',
norm_mul_factor: float=1.0,
offset: float=0.0,
duration: Optional[int]=None,
dtype: str='float32',
resample_mode: str='kaiser_fast') -> Tuple[array, int]:
"""Load audio file from disk.
This function loads an audio file from disk using the soundfile backend. It can
optionally merge multi-channel audio to mono, resample to `sr`, normalize the
waveform, and cast it to `dtype` (float32 by default).
"""
y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
if not ((y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0)):
raise ParameterError(f'audio file {file} looks empty')
if mono:
y = to_mono(y, merge_type)
if sr is not None and sr != r:
y = resample(y, r, sr, mode=resample_mode)
r = sr
if normal:
y = normalize(y, norm_type, norm_mul_factor)
elif dtype in ['int8', 'int16']:
# still need to do normalization before depth conversion
y = normalize(y, 'linear', 1.0)
y = depth_convert(y, dtype)
return y, r
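# Example (assumed local file):
#   y, sr = load('cat.wav', sr=16000, mono=True)  # mono float32 waveform at 16 kHz, peak-normalized by default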

@ -0,0 +1,34 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aishell import AISHELL1
from .dcase import UrbanAcousticScenes
from .dcase import UrbanAudioVisualScenes
from .esc50 import ESC50
from .gtzan import GTZAN
from .librispeech import LIBRISPEECH
from .ravdess import RAVDESS
from .tess import TESS
from .urban_sound import UrbanSound8K
__all__ = [
'AISHELL1',
'LIBRISPEECH',
'ESC50',
'UrbanSound8K',
'GTZAN',
'UrbanAcousticScenes',
'UrbanAudioVisualScenes',
'RAVDESS',
'TESS',
]

@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import decompress
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['AISHELL1']
class AISHELL1(Dataset):
"""
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long.
It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including
smart home, autonomous driving, and industrial production. The recordings were
made in a quiet indoor environment using 3 different devices at the same time: a high-
fidelity microphone (44.1kHz, 16-bit), an Android mobile phone (16kHz, 16-bit), and
an iOS mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled to 16kHz
to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas
in China were invited to participate in the recording. The manual transcription
accuracy rate is above 95%, through professional speech annotation and strict
quality inspection. The corpus is divided into training, development and testing
sets.
Reference:
AISHELL-1: An Open-Source Mandarin Speech Corpus and A Speech Recognition Baseline
https://arxiv.org/abs/1709.05522
"""
archieves = [
{
'url': 'http://www.openslr.org/resources/33/data_aishell.tgz',
'md5': '2f494334227864a8a8fec932999db9d8',
},
]
text_meta = os.path.join('data_aishell', 'transcript',
'aishell_transcript_v0.8.txt')
utt_info = collections.namedtuple('META_INFO',
('file_path', 'utt_id', 'text'))
audio_path = os.path.join('data_aishell', 'wav')
manifest_path = os.path.join('data_aishell', 'manifest')
subset = ['train', 'dev', 'test']
def __init__(self, subset: str='train', feat_type: str='raw', **kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(AISHELL1, self).__init__()
def _get_text_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.text_meta), 'r') as rf:
for line in rf.readlines()[1:]:
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: ''.join(text.split())})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.text_meta)):
download_and_decompress(self.archieves, DATA_HOME)
# Extract *wav from *.tar.gz.
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.tar.gz'):
decompress(os.path.join(root, file))
os.remove(os.path.join(root, file))
text_info = self._get_text_info()
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.wav'):
utt_id = os.path.splitext(file)[0]
if utt_id not in text_info:  # skip utt_ids that have no label
continue
text = text_info[utt_id]
file_path = os.path.join(root, file)
data.append(self.utt_info(file_path, utt_id, text))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text']
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
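
A minimal sketch of using the class above to build a manifest, assuming it is importable as `paddleaudio.datasets.AISHELL1` (as the datasets `__init__` earlier suggests). Note that instantiation triggers a large corpus download into DATA_HOME on first use.

# Sketch: iterate the dev subset and write its manifest file.
from paddleaudio.datasets import AISHELL1  # assumed import path

dev_set = AISHELL1(subset='dev', feat_type='raw')
file_path, utt_id, text, feat, duration = dev_set[0]  # field order follows _convert_to_record
dev_set.create_manifest(prefix='manifest')            # writes manifest.dev under DATA_HOME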

@ -0,0 +1,82 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
import paddle
from ..backends import load as load_audio
from ..features import melspectrogram
from ..features import mfcc
feat_funcs = {
'raw': None,
'melspectrogram': melspectrogram,
'mfcc': mfcc,
}
class AudioClassificationDataset(paddle.io.Dataset):
"""
Base class of audio classification dataset.
"""
def __init__(self,
files: List[str],
labels: List[int],
feat_type: str='raw',
**kwargs):
"""
Args:
files (:obj:`List[str]`): A list of absolute paths of audio files.
labels (:obj:`List[int]`): Labels of audio files.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
super(AudioClassificationDataset, self).__init__()
if feat_type not in feat_funcs.keys():
raise RuntimeError(
f"Unknown feat_type: {feat_type}, it must be one in {list(feat_funcs.keys())}"
)
self.files = files
self.labels = labels
self.feat_type = feat_type
self.feat_config = kwargs # Pass keyword arguments to customize feature config
def _get_data(self, input_file: str):
raise NotImplementedError
def _convert_to_record(self, idx):
file, label = self.files[idx], self.labels[idx]
waveform, sample_rate = load_audio(file)
feat_func = feat_funcs[self.feat_type]
record = {}
record['feat'] = feat_func(
waveform, sample_rate,
**self.feat_config) if feat_func else waveform
record['label'] = label
return record
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return np.array(record['feat']).transpose(), np.array(
record['label'], dtype=np.int64)
def __len__(self):
return len(self.files)
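
The base class above only needs a list of files and integer labels plus an overridden `_get_data`. Below is a hypothetical subclass sketch (the FolderDataset name and the ./my_wavs layout are made up, and the import path is assumed) showing the intended extension point; it is not part of paddleaudio.

import glob
import os

from paddleaudio.datasets.dataset import AudioClassificationDataset  # assumed module path

class FolderDataset(AudioClassificationDataset):
    """Toy dataset: label each wav file by the name of its parent folder."""

    def __init__(self, root: str, label_names: list, feat_type: str='melspectrogram', **kwargs):
        files, labels = self._get_data(root, label_names)
        super(FolderDataset, self).__init__(
            files=files, labels=labels, feat_type=feat_type, **kwargs)

    def _get_data(self, root, label_names):
        files, labels = [], []
        for path in glob.glob(os.path.join(root, '*', '*.wav')):
            files.append(path)
            labels.append(label_names.index(os.path.basename(os.path.dirname(path))))
        return files, labels

# __getitem__ then yields (feat.T, label), so clips of equal length can be
# batched directly with paddle.io.DataLoader.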

@ -0,0 +1,298 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanAcousticScenes', 'UrbanAudioVisualScenes']
class UrbanAcousticScenes(AudioClassificationDataset):
"""
TAU Urban Acoustic Scenes 2020 Mobile Development dataset contains recordings from
12 European cities in 10 different acoustic scenes using 4 different devices.
Additionally, synthetic data for 11 mobile devices was created based on the original
recordings. Of the 12 cities, two are present only in the evaluation set.
Reference:
A multi-device dataset for urban acoustic scene classification
https://arxiv.org/abs/1807.09840
"""
source_url = 'https://zenodo.org/record/3819968/files/'
base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': 'ed38956c4246abb56190c1e9b602b7b8',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '97ab8560056b6816808dedc044dcc023',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'fbf856a3a86fff7520549c899dc94372',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '0dbffe7b6e45564da649378723284062',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
},
{
'url': source_url + base_name + '.audio.9.zip',
'md5': 'a65596a5372eab10c78e08a0de797c9e',
},
{
'url': source_url + base_name + '.audio.10.zip',
'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
},
{
'url': source_url + base_name + '.audio.11.zip',
'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
},
{
'url': source_url + base_name + '.audio.12.zip',
'md5': 'e5f4400c6b9697295fab4cf507155a2f',
},
{
'url': source_url + base_name + '.audio.13.zip',
'md5': '8855ab9f9896422746ab4c5d89d8da2f',
},
{
'url': source_url + base_name + '.audio.14.zip',
'md5': '092ad744452cd3e7de78f988a3d13020',
},
{
'url': source_url + base_name + '.audio.15.zip',
'md5': '4b5eb85f6592aebf846088d9df76b420',
},
{
'url': source_url + base_name + '.audio.16.zip',
'md5': '2e0a89723e58a3836be019e6996ae460',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta = os.path.join(base_name, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename', 'scene_label', 'identifier', 'source_label'))
subset_meta = {
'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO',
('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAcousticScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, label = sample[:2]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
class UrbanAudioVisualScenes(AudioClassificationDataset):
"""
TAU Urban Audio Visual Scenes 2021 Development dataset contains synchronized audio
and video recordings from 12 European cities in 10 different scenes.
This dataset consists of 10-second audio and video segments from 10
acoustic scenes. The total amount of audio in the development set is 34 hours.
Reference:
A Curated Dataset of Urban Scenes for Audio-Visual Scene Analysis
https://arxiv.org/abs/2011.00030
"""
source_url = 'https://zenodo.org/record/4477542/files/'
base_name = 'TAU-urban-audio-visual-scenes-2021-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '76e3d7ed5291b118372e06379cb2b490',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': '186f6273f8f69ed9dbdc18ad65ac234f',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '7fd6bb63127f5785874a55aba4e77aa5',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': '61396bede29d7c8c89729a01a6f6b2e2',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '6ddac89717fcf9c92c451868eed77fe1',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'af4820756cdf1a7d4bd6037dc034d384',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'ebd11ec24411f2a17a64723bd4aa7fff',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '2be39a76aeed704d5929d020a2909efd',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': '972d8afe0874720fc2f28086e7cb22a9',
},
]
label_list = [
'airport', 'shopping_mall', 'metro_station', 'street_pedestrian',
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park'
]
meta_base_path = os.path.join(base_name, base_name + '.meta')
meta = os.path.join(meta_base_path, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', (
'filename_audio', 'filename_video', 'scene_label', 'identifier'))
subset_meta = {
'train':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_train.csv'),
'dev':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_evaluate.csv'),
'test':
os.path.join(meta_base_path, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', (
'filename_audio', 'filename_video', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
def __init__(self, mode: str='train', feat_type: str='raw', **kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(UrbanAudioVisualScenes, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, subset: str=None,
skip_header: bool=True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves,
os.path.join(DATA_HOME, self.base_name))
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, _, label = sample[:3]
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
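
A sketch of feeding the scenes data to a paddle DataLoader, assuming the class is importable as `paddleaudio.datasets.UrbanAcousticScenes`. Since every clip in the development set has the same duration, the default collate function can stack the per-clip features.

# Sketch: batch the dev split with mel-spectrogram features.
import paddle
from paddleaudio.datasets import UrbanAcousticScenes  # assumed import path

dev_set = UrbanAcousticScenes(mode='dev', feat_type='melspectrogram')
loader = paddle.io.DataLoader(dev_set, batch_size=4, shuffle=False)
for feats, labels in loader:
    print(feats.shape, labels.shape)  # [4, n_frames, n_mels], [4]
    break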

@ -0,0 +1,152 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
class ESC50(AudioClassificationDataset):
"""
The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings
suitable for benchmarking methods of environmental sound classification. The dataset
consists of 5-second-long recordings organized into 50 semantic classes (with
40 examples per class).
Reference:
ESC: Dataset for Environmental Sound Classification
http://dx.doi.org/10.1145/2733373.2806390
"""
archieves = [
{
'url':
'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
'md5': '7771e4b9d86d0945acce719c7a59305a',
},
]
label_list = [
# Animals
'Dog',
'Rooster',
'Pig',
'Cow',
'Frog',
'Cat',
'Hen',
'Insects (flying)',
'Sheep',
'Crow',
# Natural soundscapes & water sounds
'Rain',
'Sea waves',
'Crackling fire',
'Crickets',
'Chirping birds',
'Water drops',
'Wind',
'Pouring water',
'Toilet flush',
'Thunderstorm',
# Human, non-speech sounds
'Crying baby',
'Sneezing',
'Clapping',
'Breathing',
'Coughing',
'Footsteps',
'Laughing',
'Brushing teeth',
'Snoring',
'Drinking, sipping',
# Interior/domestic sounds
'Door knock',
'Mouse click',
'Keyboard typing',
'Door, wood creaks',
'Can opening',
'Washing machine',
'Vacuum cleaner',
'Clock alarm',
'Clock tick',
'Glass breaking',
# Exterior/urban noises
'Helicopter',
'Chainsaw',
'Siren',
'Car horn',
'Engine',
'Train',
'Church bells',
'Airplane',
'Fireworks',
'Hand saw',
]
meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
meta_info = collections.namedtuple(
'META_INFO',
('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
audio_path = os.path.join('ESC-50-master', 'audio')
def __init__(self,
mode: str='train',
split: int=1,
feat_type: str='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(ESC50, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, fold, target, _, _, _, _ = sample
if mode == 'train' and int(fold) != split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
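
ESC-50 ships with five predefined folds, and the `split` argument above selects which fold is held out. A sketch of the usual 5-fold protocol (import path assumed, as before):

# Sketch: hold out fold 1 for evaluation, train on folds 2-5.
from paddleaudio.datasets import ESC50  # assumed import path

train_ds = ESC50(mode='train', split=1, feat_type='melspectrogram')
dev_ds = ESC50(mode='dev', split=1, feat_type='melspectrogram')
print(len(train_ds), len(dev_ds))  # 1600 and 400 clips, from 2000 recordings in 5 folds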

@ -0,0 +1,115 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
class GTZAN(AudioClassificationDataset):
"""
The GTZAN dataset consists of 1000 audio tracks each 30 seconds long. It contains 10 genres,
each represented by 100 tracks. The dataset is the most-used public dataset for evaluation
in machine listening research for music genre recognition (MGR).
Reference:
Musical genre classification of audio signals
https://ieeexplore.ieee.org/document/1021072/
"""
archieves = [
{
'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
},
]
label_list = [
'blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal',
'pop', 'reggae', 'rock'
]
meta = os.path.join('genres', 'input.mf')
meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
audio_path = 'genres'
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Random seed used to shuffle the samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(GTZAN, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines():
ret.append(self.meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
random.seed(seed) # shuffle samples to split data
random.shuffle(
meta_info
)  # use the same seed so the train and dev splits stay consistent
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
file_path, label = sample
filename = os.path.basename(file_path)
target = self.label_list.index(label)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(
os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(
os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
return files, labels
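
GTZAN has no official folds, so `_get_data` above derives them from a seeded shuffle followed by `fold = idx // n_samples_per_fold + 1`. A self-contained illustration of that fold arithmetic on dummy indices:

# Illustration of the fold assignment used above (1000 tracks, 5 folds, split=1).
n_samples, n_folds, split = 1000, 5, 1
n_per_fold = n_samples // n_folds                            # 200 tracks per fold
folds = [idx // n_per_fold + 1 for idx in range(n_samples)]
train_idx = [i for i, f in enumerate(folds) if f != split]
dev_idx = [i for i, f in enumerate(folds) if f == split]
print(len(train_idx), len(dev_idx))                          # 800 / 200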

@ -0,0 +1,199 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import collections
import json
import os
from typing import Dict
from paddle.io import Dataset
from tqdm import tqdm
from ..backends import load as load_audio
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import feat_funcs
__all__ = ['LIBRISPEECH']
class LIBRISPEECH(Dataset):
"""
LibriSpeech is a corpus of approximately 1000 hours of 16kHz read English speech,
prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is
derived from read audiobooks from the LibriVox project, and has been carefully
segmented and aligned.
Reference:
LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS
http://www.danielpovey.com/files/2015_icassp_librispeech.pdf
"""
source_url = 'http://www.openslr.org/resources/12/'
archieves = [
{
'url': source_url + 'train-clean-100.tar.gz',
'md5': '2a93770f6d5c6c964bc36631d331a522',
},
{
'url': source_url + 'train-clean-360.tar.gz',
'md5': 'c0e676e450a7ff2f54aeade5171606fa',
},
{
'url': source_url + 'train-other-500.tar.gz',
'md5': 'd1a0fd59409feb2c614ce4d30c387708',
},
{
'url': source_url + 'dev-clean.tar.gz',
'md5': '42e2234ba48799c1f50f24a7926300a1',
},
{
'url': source_url + 'dev-other.tar.gz',
'md5': 'c8d0bcc9cca99d4f8b62fcc847357931',
},
{
'url': source_url + 'test-clean.tar.gz',
'md5': '32fa31d27d2e1cad72775fee3f4849a9',
},
{
'url': source_url + 'test-other.tar.gz',
'md5': 'fb5a50374b501bb3bac4815ee91d3135',
},
]
speaker_meta = os.path.join('LibriSpeech', 'SPEAKERS.TXT')
utt_info = collections.namedtuple('META_INFO', (
'file_path', 'utt_id', 'text', 'spk_id', 'spk_gender'))
audio_path = 'LibriSpeech'
manifest_path = os.path.join('LibriSpeech', 'manifest')
subset = [
'train-clean-100', 'train-clean-360', 'train-other-500', 'dev-clean',
'dev-other', 'test-clean', 'test-other'
]
def __init__(self,
subset: str='train-clean-100',
feat_type: str='raw',
**kwargs):
assert subset in self.subset, 'Dataset subset must be one in {}, but got {}'.format(
self.subset, subset)
self.subset = subset
self.feat_type = feat_type
self.feat_config = kwargs
self._data = self._get_data()
super(LIBRISPEECH, self).__init__()
def _get_speaker_info(self) -> Dict[str, str]:
ret = {}
with open(os.path.join(DATA_HOME, self.speaker_meta), 'r') as rf:
for line in rf.readlines():
if ';' in line: # Skip dataset abstract
continue
spk_id, gender = map(str.strip,
line.split('|')[:2]) # spk_id, gender
ret.update({spk_id: gender})
return ret
def _get_text_info(self, trans_file) -> Dict[str, str]:
ret = {}
with open(trans_file, 'r') as rf:
for line in rf.readlines():
utt_id, text = map(str.strip, line.split(' ',
1)) # utt_id, text
ret.update({utt_id: text})
return ret
def _get_data(self):
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.speaker_meta)):
download_and_decompress(self.archieves, DATA_HOME,
len(self.archieves))
# Speaker info
speaker_info = self._get_speaker_info()
# Text info
text_info = {}
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.trans.txt'):
text_info.update(
self._get_text_info(os.path.join(root, file)))
data = []
for root, _, files in os.walk(
os.path.join(DATA_HOME, self.audio_path, self.subset)):
for file in files:
if file.endswith('.flac'):
utt_id = os.path.splitext(file)[0]
spk_id = utt_id.split('-')[0]
if utt_id not in text_info \
or spk_id not in speaker_info : # Skip samples with incomplete data
continue
file_path = os.path.join(root, file)
text = text_info[utt_id]
spk_gender = speaker_info[spk_id]
data.append(
self.utt_info(file_path, utt_id, text, spk_id,
spk_gender))
return data
def _convert_to_record(self, idx: int):
sample = self._data[idx]
record = {}
# To show all fields in a namedtuple: `type(sample)._fields`
for field in type(sample)._fields:
record[field] = getattr(sample, field)
waveform, sr = load_audio(
sample[0]) # The first element of sample is file path
feat_func = feat_funcs[self.feat_type]
feat = feat_func(
waveform, sample_rate=sr,
**self.feat_config) if feat_func else waveform
record.update({'feat': feat, 'duration': len(waveform) / sr})
return record
def create_manifest(self, prefix='manifest'):
if not os.path.isdir(os.path.join(DATA_HOME, self.manifest_path)):
os.makedirs(os.path.join(DATA_HOME, self.manifest_path))
manifest_file = os.path.join(DATA_HOME, self.manifest_path,
f'{prefix}.{self.subset}')
with codecs.open(manifest_file, 'w', 'utf-8') as f:
for idx in tqdm(range(len(self))):
record = self._convert_to_record(idx)
record_line = json.dumps(
{
'utt': record['utt_id'],
'feat': record['file_path'],
'feat_shape': (record['duration'], ),
'text': record['text'],
'spk': record['spk_id'],
'gender': record['spk_gender'],
},
ensure_ascii=False)
f.write(record_line + '\n')
logger.info(f'Manifest file {manifest_file} created.')
def __getitem__(self, idx):
record = self._convert_to_record(idx)
return tuple(record.values())
def __len__(self):
return len(self._data)
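
Each line written by `create_manifest` above is a standalone JSON object. A sketch of reading one back, assuming DATA_HOME is importable from `paddleaudio.utils.env` (as the imports in this file suggest) and that the dev-clean manifest has already been created:

# Sketch: read the first record of a JSON-lines manifest.
import json
import os

from paddleaudio.utils.env import DATA_HOME  # assumed import path

manifest = os.path.join(DATA_HOME, 'LibriSpeech', 'manifest', 'manifest.dev-clean')
with open(manifest, 'r', encoding='utf-8') as f:
    record = json.loads(f.readline())
print(record['utt'], record['feat_shape'], record['spk'], record['gender'])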

@ -0,0 +1,136 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['RAVDESS']
class RAVDESS(AudioClassificationDataset):
"""
The RAVDESS contains 24 professional actors (12 female, 12 male), vocalizing two
lexically-matched statements in a neutral North American accent. Speech emotions
include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
Each expression is produced at two levels of emotional intensity (normal, strong),
with an additional neutral expression.
Reference:
The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS):
A dynamic, multimodal set of facial and vocal expressions in North American English
https://doi.org/10.1371/journal.pone.0196391
"""
archieves = [
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Song_Actors_01-24.zip',
'md5':
'5411230427d67a21e18aa4d466e6d1b9',
},
{
'url':
'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip',
'md5':
'bc696df654c87fed845eb13823edef8a',
},
]
label_list = [
'neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust',
'surprised'
]
meta_info = collections.namedtuple(
'META_INFO', ('modality', 'vocal_channel', 'emotion',
'emotion_intensity', 'statement', 'repitition', 'actor'))
speech_path = os.path.join(DATA_HOME, 'Audio_Speech_Actors_01-24')
song_path = os.path.join(DATA_HOME, 'Audio_Song_Actors_01-24')
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Random seed used to shuffle the samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(RAVDESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('-')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(self.speech_path) and not os.path.isdir(
self.song_path):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(self.speech_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
for root, _, files in os.walk(self.song_path):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
)  # use the same seed so the train and dev splits stay consistent
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion, _, _, _, _ = sample
target = int(emotion) - 1
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels
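
RAVDESS encodes its metadata in the file name as seven dash-separated numeric fields, which is what `_get_meta_info` above unpacks into META_INFO. A small sketch of that parsing on an illustrative file name (the name is made up but format-conforming):

# Sketch: parse one RAVDESS-style file name into the META_INFO fields above.
import collections
import os

meta_info = collections.namedtuple(
    'META_INFO', ('modality', 'vocal_channel', 'emotion',
                  'emotion_intensity', 'statement', 'repitition', 'actor'))

fname = '03-01-06-01-02-01-12.wav'                        # illustrative file name
sample = meta_info(*os.path.basename(fname)[:-4].split('-'))
print(sample.emotion, int(sample.emotion) - 1)            # label index used above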

@ -0,0 +1,126 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['TESS']
class TESS(AudioClassificationDataset):
"""
TESS is a set of recordings in which 200 target words were spoken in the carrier phrase
"Say the word _____" by two actresses (aged 26 and 64 years), with
recordings made of the set portraying each of seven emotions (anger,
disgust, fear, happiness, pleasant surprise, sadness, and neutral).
There are 2800 stimuli in total.
Reference:
Toronto emotional speech set (TESS)
https://doi.org/10.5683/SP2/E8H2MF
"""
archieves = [
{
'url':
'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
'md5':
'1465311b24d1de704c4c63e4ccc470c7',
},
]
label_list = [
'angry',
'disgust',
'fear',
'happy',
'neutral',
'ps', # pleasant surprise
'sad',
]
meta_info = collections.namedtuple('META_INFO',
('speaker', 'word', 'emotion'))
audio_path = 'TESS_Toronto_emotional_speech_set'
def __init__(self,
mode='train',
seed=0,
n_folds=5,
split=1,
feat_type='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
seed (:obj:`int`, `optional`, defaults to 0):
Random seed used to shuffle the samples.
n_folds (:obj:`int`, `optional`, defaults to 5):
Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(TESS, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self, files) -> List[collections.namedtuple]:
ret = []
for file in files:
basename_without_extend = os.path.basename(file)[:-4]
ret.append(self.meta_info(*basename_without_extend.split('_')))
return ret
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
download_and_decompress(self.archieves, DATA_HOME)
wav_files = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
for file in files:
if file.endswith('.wav'):
wav_files.append(os.path.join(root, file))
random.seed(seed) # shuffle samples to split data
random.shuffle(
wav_files
)  # use the same seed so the train and dev splits stay consistent
meta_info = self._get_meta_info(wav_files)
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
_, _, emotion = sample
target = self.label_list.index(emotion)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(wav_files[idx])
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(wav_files[idx])
labels.append(target)
return files, labels

@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List
from typing import Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
class UrbanSound8K(AudioClassificationDataset):
"""
UrbanSound8K dataset contains 8732 labeled sound excerpts (<=4s) of urban
sounds from 10 classes: air_conditioner, car_horn, children_playing, dog_bark,
drilling, engine_idling, gun_shot, jackhammer, siren, and street_music. The
classes are drawn from the urban sound taxonomy.
Reference:
A Dataset and Taxonomy for Urban Sound Research
https://dl.acm.org/doi/10.1145/2647868.2655045
"""
archieves = [
{
'url':
'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
'md5': '9aa69802bbf37fb986f71ec1483a196e',
},
]
label_list = [
"air_conditioner", "car_horn", "children_playing", "dog_bark",
"drilling", "engine_idling", "gun_shot", "jackhammer", "siren",
"street_music"
]
meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
meta_info = collections.namedtuple(
'META_INFO', ('filename', 'fsid', 'start', 'end', 'salience', 'fold',
'class_id', 'label'))
audio_path = os.path.join('UrbanSound8K', 'audio')
def __init__(self,
mode: str='train',
split: int=1,
feat_type: str='raw',
**kwargs):
"""
Args:
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset mode (train or dev).
split (:obj:`int`, `optional`, defaults to 1):
It specifies the fold used as the dev dataset.
feat_type (:obj:`str`, `optional`, defaults to `raw`):
It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(UrbanSound8K, self).__init__(
files=files, labels=labels, feat_type=feat_type, **kwargs)
def _get_meta_info(self):
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, _, _, _, _, fold, target, _ = sample
if mode == 'train' and int(fold) != split:
files.append(
os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(
os.path.join(DATA_HOME, self.audio_path, f'fold{fold}',
filename))
labels.append(int(target))
return files, labels

@ -0,0 +1,15 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .augment import *
from .core import *

@ -0,0 +1,169 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
import numpy as np
from numpy import ndarray as array
from paddleaudio.backends import depth_convert
from paddleaudio.utils import ParameterError
__all__ = [
'depth_augment',
'spect_augment',
'random_crop1d',
'random_crop2d',
'adaptive_spect_augment',
]
def randint(high: int) -> int:
"""Generate one random integer in range [0 high)
This is a helper function for random data augmentaiton
"""
return int(np.random.randint(0, high=high))
def rand() -> float:
"""Generate one floating-point number in range [0 1)
This is a helper function for random data augmentaiton
"""
return float(np.random.rand(1))
def depth_augment(y: array,
choices: List=['int8', 'int16'],
probs: List[float]=[0.5, 0.5]) -> array:
""" Audio depth augmentation
Do audio depth augmentation to simulate the distortion brought by quantization.
"""
assert len(probs) == len(
choices
), 'number of choices {} must be equal to size of probs {}'.format(
len(choices), len(probs))
depth = np.random.choice(choices, p=probs)
src_depth = y.dtype
y1 = depth_convert(y, depth)
y2 = depth_convert(y1, src_depth)
return y2
def adaptive_spect_augment(spect: array, tempo_axis: int=0,
level: float=0.1) -> array:
"""Do adpative spectrogram augmentation
The level of the augmentation is gowern by the paramter level,
ranging from 0 to 1, with 0 represents no augmentation
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
time_mask_width = int(nt * level * 0.5)
freq_mask_width = int(nf * level * 0.5)
num_time_mask = int(10 * level)
num_freq_mask = int(10 * level)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def spect_augment(spect: array,
tempo_axis: int=0,
max_time_mask: int=3,
max_freq_mask: int=3,
max_time_mask_width: int=30,
max_freq_mask_width: int=20) -> array:
"""Do spectrogram augmentation in both time and freq axis
Reference:
"""
assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
num_time_mask = randint(max_time_mask)
num_freq_mask = randint(max_freq_mask)
time_mask_width = randint(max_time_mask_width)
freq_mask_width = randint(max_freq_mask_width)
if tempo_axis == 0:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for _ in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for _ in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def random_crop1d(y: array, crop_len: int) -> array:
""" Do random cropping on 1d input signal
The input is a 1d signal, typically a sound waveform
"""
if y.ndim != 1:
'only accept 1d tensor or numpy array'
n = len(y)
idx = randint(n - crop_len)
return y[idx:idx + crop_len]
def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
""" Do random cropping for 2D array, typically a spectrogram.
The cropping is done in temporal direction on the time-freq input signal.
"""
if tempo_axis >= s.ndim:
raise ParameterError('axis out of range')
n = s.shape[tempo_axis]
idx = randint(high=n - crop_len)
sli = [slice(None) for i in range(s.ndim)]
sli[tempo_axis] = slice(idx, idx + crop_len)
out = s[tuple(sli)]
return out
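
A sketch of applying the augmentations above to synthetic data. The import path assumes the package `__init__` re-exports them via `from .augment import *`; the waveform and spectrogram are random placeholders.

# Sketch: exercise the augmentation helpers on dummy inputs.
import numpy as np

from paddleaudio.features import depth_augment, random_crop1d, spect_augment  # assumed path

rng = np.random.RandomState(0)
wav = rng.uniform(-1.0, 1.0, size=16000).astype('float32')  # 1 s of noise at 16 kHz

crop = random_crop1d(wav, crop_len=8000)       # random 0.5 s segment
noisy = depth_augment(wav)                     # simulate int8/int16 quantization and back
spect = rng.rand(64, 100).astype('float32')    # dummy (freq, time) spectrogram
masked = spect_augment(spect, tempo_axis=1)    # random time/freq masks set to zero
print(crop.shape, noisy.dtype, int((masked == 0).sum()))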

@ -0,0 +1,576 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from librosa(https://github.com/librosa/librosa)
import warnings
from typing import List
from typing import Optional
from typing import Union
import numpy as np
import scipy
from numpy import ndarray as array
from numpy.lib.stride_tricks import as_strided
from paddleaudio.utils import ParameterError
from scipy.signal import get_window
__all__ = [
'stft',
'mfcc',
'hz_to_mel',
'mel_to_hz',
'split_frames',
'mel_frequencies',
'power_to_db',
'compute_fbank_matrix',
'melspectrogram',
'spectrogram',
'mu_encode',
'mu_decode',
]
def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
"""Pad an array to a target length along a target axis.
This differs from `np.pad` by centering the data prior to padding,
analogous to `str.center`
"""
kwargs.setdefault("mode", "constant")
n = data.shape[axis]
lpad = int((size - n) // 2)
lengths = [(0, 0)] * data.ndim
lengths[axis] = (lpad, int(size - n - lpad))
if lpad < 0:
raise ParameterError(("Target size ({size:d}) must be "
"at least input size ({n:d})"))
return np.pad(data, lengths, **kwargs)
def split_frames(x: array, frame_length: int, hop_length: int,
axis: int=-1) -> array:
"""Slice a data array into (overlapping) frames.
This function is aligned with librosa.frame
"""
if not isinstance(x, np.ndarray):
raise ParameterError(
f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
if x.shape[axis] < frame_length:
raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
f" for frame_length={frame_length:d}")
if hop_length < 1:
raise ParameterError(f"Invalid hop_length: {hop_length:d}")
if axis == -1 and not x.flags["F_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.asfortranarray(x)
elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
warnings.warn(f"librosa.util.frame called with axis={axis} "
"on a non-contiguous input. This will result in a copy.")
x = np.ascontiguousarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * new_stride]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * new_stride] + list(strides)
else:
raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
return as_strided(x, shape=shape, strides=strides)
def _check_audio(y, mono=True) -> bool:
"""Determine whether a variable contains valid audio data.
The audio y must be a np.ndarray, either 1-channel or 2-channel.
"""
if not isinstance(y, np.ndarray):
raise ParameterError("Audio data must be of type numpy.ndarray")
if y.ndim > 2:
raise ParameterError(
f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
if mono and y.ndim == 2:
raise ParameterError(
f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
if (mono and len(y) == 0) or (not mono and y.shape[1] == 0):
raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
if not np.issubdtype(y.dtype, np.floating):
raise ParameterError("Audio data must be floating-point")
if not np.isfinite(y).all():
raise ParameterError("Audio buffer is not finite everywhere")
return True
def hz_to_mel(frequencies: Union[float, List[float], array],
htk: bool=False) -> array:
"""Convert Hz to Mels
This function is aligned with librosa.
"""
freq = np.asanyarray(frequencies)
if htk:
return 2595.0 * np.log10(1.0 + freq / 700.0)
# Fill in the linear part
f_min = 0.0
f_sp = 200.0 / 3
mels = (freq - f_min) / f_sp
# Fill in the log-scale part
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
if freq.ndim:
# If we have array data, vectorize
log_t = freq >= min_log_hz
mels[log_t] = min_log_mel + \
np.log(freq[log_t] / min_log_hz) / logstep
elif freq >= min_log_hz:
# If we have scalar data, check directly
mels = min_log_mel + np.log(freq / min_log_hz) / logstep
return mels
def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
"""Convert mel bin numbers to frequencies.
This function is aligned with librosa.
"""
mel_array = np.asanyarray(mels)
if htk:
return 700.0 * (10.0**(mel_array / 2595.0) - 1.0)
# Fill in the linear scale
f_min = 0.0
f_sp = 200.0 / 3
freqs = f_min + f_sp * mel_array
# And now the nonlinear scale
min_log_hz = 1000.0 # beginning of log region (Hz)
min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
logstep = np.log(6.4) / 27.0 # step size for log region
if mel_array.ndim:
# If we have vector data, vectorize
log_t = mel_array >= min_log_mel
freqs[log_t] = min_log_hz * \
np.exp(logstep * (mel_array[log_t] - min_log_mel))
elif mel_array >= min_log_mel:
# If we have scalar data, check directly
freqs = min_log_hz * np.exp(logstep * (mel_array - min_log_mel))
return freqs
def mel_frequencies(n_mels: int=128,
fmin: float=0.0,
fmax: float=11025.0,
htk: bool=False) -> array:
"""Compute mel frequencies
This function is aligned with librosa.
"""
# 'Center freqs' of mel bands - uniformly spaced between limits
min_mel = hz_to_mel(fmin, htk=htk)
max_mel = hz_to_mel(fmax, htk=htk)
mels = np.linspace(min_mel, max_mel, n_mels)
return mel_to_hz(mels, htk=htk)
def fft_frequencies(sr: int, n_fft: int) -> array:
"""Compute fourier frequencies.
This function is aligned with librosa.
"""
return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
def compute_fbank_matrix(sr: int,
n_fft: int,
n_mels: int=128,
fmin: float=0.0,
fmax: Optional[float]=None,
htk: bool=False,
norm: str="slaney",
dtype: type=np.float32):
"""Compute fbank matrix.
This function is aligned with librosa.
"""
if norm != "slaney":
raise ParameterError('norm must be set to slaney')
if fmax is None:
fmax = float(sr) / 2
# Initialize the weights
n_mels = int(n_mels)
weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
# Center freqs of each FFT bin
fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
# 'Center freqs' of mel bands - uniformly spaced between limits
mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
fdiff = np.diff(mel_f)
ramps = np.subtract.outer(mel_f, fftfreqs)
for i in range(n_mels):
# lower and upper slopes for all bins
lower = -ramps[i] / fdiff[i]
upper = ramps[i + 2] / fdiff[i + 1]
# .. then intersect them with each other and zero
weights[i] = np.maximum(0, np.minimum(lower, upper))
if norm == "slaney":
# Slaney-style mel is scaled to be approx constant energy per channel
enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
weights *= enorm[:, np.newaxis]
# Only check weights if f_mel[0] is positive
if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
# This means we have an empty channel somewhere
warnings.warn("Empty filters detected in mel frequency basis. "
"Some channels will produce empty responses. "
"Try increasing your sampling rate (and fmax) or "
"reducing n_mels.")
return weights
def stft(x: array,
n_fft: int=2048,
hop_length: Optional[int]=None,
win_length: Optional[int]=None,
window: str="hann",
center: bool=True,
dtype: type=np.complex64,
pad_mode: str="reflect") -> array:
"""Short-time Fourier transform (STFT).
This function is aligned with librosa.
"""
_check_audio(x)
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
fft_window = get_window(window, win_length, fftbins=True)
# Pad the window out to n_fft size
fft_window = pad_center(fft_window, n_fft)
# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))
# Pad the time series so that frames are centered
if center:
if n_fft > x.shape[-1]:
warnings.warn(
f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
)
x = np.pad(x, int(n_fft // 2), mode=pad_mode)
elif n_fft > x.shape[-1]:
raise ParameterError(
f"n_fft={n_fft} is too small for input signal of length={x.shape[-1]}"
)
# Window the time series.
x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
# Pre-allocate the STFT matrix
stft_matrix = np.empty(
(int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
fft = np.fft # use numpy fft as default
# Constrain STFT block sizes to 256 KB
MAX_MEM_BLOCK = 2**8 * 2**10
# how many columns can we fit within MAX_MEM_BLOCK?
n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
n_columns = max(n_columns, 1)
for bl_s in range(0, stft_matrix.shape[1], n_columns):
bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
stft_matrix[:, bl_s:bl_t] = fft.rfft(
fft_window * x_frames[:, bl_s:bl_t], axis=0)
return stft_matrix
def power_to_db(spect: array,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=80.0) -> array:
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units
This computes the scaling ``10 * log10(spect / ref)`` in a numerically
stable way.
This function is aligned with librosa.
"""
spect = np.asarray(spect)
if amin <= 0:
raise ParameterError("amin must be strictly positive")
if np.issubdtype(spect.dtype, np.complexfloating):
warnings.warn(
"power_to_db was called on complex input so phase "
"information will be discarded. To suppress this warning, "
"call power_to_db(np.abs(D)**2) instead.")
magnitude = np.abs(spect)
else:
magnitude = spect
if callable(ref):
# User supplied a function to calculate reference power
ref_value = ref(magnitude)
else:
ref_value = np.abs(ref)
log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
if top_db is not None:
if top_db < 0:
raise ParameterError("top_db must be non-negative")
log_spec = np.maximum(log_spec, log_spec.max() - top_db)
return log_spec
def mfcc(x,
sr: int=16000,
spect: Optional[array]=None,
n_mfcc: int=20,
dct_type: int=2,
norm: str="ortho",
lifter: int=0,
**kwargs) -> array:
"""Mel-frequency cepstral coefficients (MFCCs)
This function is NOT strictly aligned with librosa. The following example shows how to get the
same result with librosa:
# paddleaudio mfcc:
kwargs = {
'window_size':512,
'hop_length':320,
'n_mels':64,
'fmin':50,
'to_db':False}
a = mfcc(x,
spect=None,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0,
**kwargs)
# librosa mfcc:
spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
win_length=512,
hop_length=320,
n_mels=64, fmin=50)
b = librosa.feature.mfcc(x,
sr=16000,
S=spect,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0)
assert np.mean( (a-b)**2) < 1e-8
"""
if spect is None:
spect = melspectrogram(x, sr=sr, **kwargs)
M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
if lifter > 0:
factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
lifter)
return M * factor[:, np.newaxis]
elif lifter == 0:
return M
else:
raise ParameterError(
f"MFCC lifter={lifter} must be a non-negative number")
def melspectrogram(x: array,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
n_mels: int=64,
fmin: int=50,
fmax: Optional[float]=None,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0,
to_db: bool=True,
ref: float=1.0,
amin: float=1e-10,
top_db: Optional[float]=None) -> array:
"""Compute mel-spectrogram.
Parameters:
x: numpy.ndarray
The input wavform is a numpy array [shape=(n,)]
window_size: int, typically 512, 1024, 2048, etc.
The window size for framing, also used as n_fft for stft
Returns:
The mel-spectrogram in power scale or db scale(default)
Notes:
1. sr defaults to 16000, which is commonly used in speech/speaker processing.
2. when fmax is None, it is set to sr//2.
3. this function converts the mel spectrogram to dB scale by default, which differs
from librosa.
"""
_check_audio(x, mono=True)
if len(x) <= 0:
raise ParameterError('The input waveform is empty')
if fmax is None:
fmax = sr // 2
if fmin < 0 or fmin >= fmax:
raise ParameterError('fmin and fmax must satisfy 0<fmin<fmax')
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
spect_power = np.abs(s)**power
fb_matrix = compute_fbank_matrix(
sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
mel_spect = np.matmul(fb_matrix, spect_power)
if to_db:
return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
else:
return mel_spect
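# Illustrative usage (sketch only): a 64-bin log-mel spectrogram for 16 kHz
# speech with the defaults documented above, e.g.
#   feat = melspectrogram(x, sr=16000, window_size=512, hop_length=320,
#                         n_mels=64, fmin=50, to_db=True)  # shape: (64, n_frames)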
def spectrogram(x: array,
sr: int=16000,
window_size: int=512,
hop_length: int=320,
window: str='hann',
center: bool=True,
pad_mode: str='reflect',
power: float=2.0) -> array:
"""Compute spectrogram from an input waveform.
This function wraps stft() (which is aligned with librosa.stft) and adds a step
that computes the magnitude (or power) of the complex spectrogram.
"""
s = stft(
x,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
return np.abs(s)**power
def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
"""Mu-law encoding.
Compute the mu-law encoding of the input waveform.
When quantized is True, the result will be converted to
integer in range [0,mu-1]. Otherwise, the resulting signal
is in range [-1,1]
Reference:
https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
"""
y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
if quantized:
y = np.floor((y + 1) / 2 * mu + 0.5) # convert to [0 , mu-1]
return y
def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
"""Mu-law decoding.
Compute the mu-law decoding given an input code.
It assumes that the input y is in range [0,mu-1] when quantized is True and in
range [-1,1] otherwise.
Reference:
https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
"""
if mu < 1:
raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
mu = mu - 1
if quantized: # undo the quantization
y = y * 2 / mu - 1
x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
return x
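# A minimal mu-law round-trip sketch (illustrative only; the synthetic tone
# below is an assumption for demonstration, not part of the original module).
if __name__ == '__main__':
    t = np.linspace(0, 1, 16000, dtype='float32')
    sig = 0.5 * np.sin(2 * np.pi * 440 * t)            # a 440 Hz tone in [-1, 1]
    code = mu_encode(sig, mu=255, quantized=True)      # integer codes
    recon = mu_decode(code, mu=255, quantized=True)    # back to roughly [-1, 1]
    print('max round-trip deviation:', np.max(np.abs(sig - recon)))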

@ -0,0 +1,309 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle.nn as nn
import paddle.nn.functional as F
from ..utils.download import load_state_dict_from_url
from ..utils.env import MODEL_HOME
__all__ = ['CNN14', 'CNN10', 'CNN6', 'cnn14', 'cnn10', 'cnn6']
pretrained_model_urls = {
'cnn14': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams',
'cnn10': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams',
'cnn6': 'https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams',
}
class ConvBlock(nn.Layer):
def __init__(self, in_channels, out_channels):
super(ConvBlock, self).__init__()
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias_attr=False)
self.conv2 = nn.Conv2D(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channels)
self.bn2 = nn.BatchNorm2D(out_channels)
def forward(self, x, pool_size=(2, 2), pool_type='avg'):
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = F.relu(x)
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x = F.avg_pool2d(
x, kernel_size=pool_size) + F.max_pool2d(
x, kernel_size=pool_size)
else:
raise Exception(
f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".'
)
return x
class ConvBlock5x5(nn.Layer):
def __init__(self, in_channels, out_channels):
super(ConvBlock5x5, self).__init__()
self.conv1 = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=(5, 5),
stride=(1, 1),
padding=(2, 2),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channels)
def forward(self, x, pool_size=(2, 2), pool_type='avg'):
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x = F.avg_pool2d(
x, kernel_size=pool_size) + F.max_pool2d(
x, kernel_size=pool_size)
else:
raise Exception(
f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".'
)
return x
class CNN14(nn.Layer):
"""
The CNN14 (14-layer CNN) mainly consists of 6 convolutional blocks, each of which
consists of 2 convolutional layers with a kernel size of 3 × 3.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 2048
def __init__(self, extract_embedding: bool=True):
super(CNN14, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
self.fc1 = nn.Linear(2048, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
class CNN10(nn.Layer):
"""
The CNN10 (10-layer CNN) mainly consists of 4 convolutional blocks, each of which
consists of 2 convolutional layers with a kernel size of 3 × 3.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 512
def __init__(self, extract_embedding: bool=True):
super(CNN10, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
self.fc1 = nn.Linear(512, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
class CNN6(nn.Layer):
"""
The CNN6 (6-layer CNN) mainly consists of 4 convolutional blocks, each of which
consists of 1 convolutional layer with a kernel size of 5 × 5.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 512
def __init__(self, extract_embedding: bool=True):
super(CNN6, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512)
self.fc1 = nn.Linear(512, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
def cnn14(pretrained: bool=False, extract_embedding: bool=True) -> CNN14:
model = CNN14(extract_embedding=extract_embedding)
if pretrained:
state_dict = load_state_dict_from_url(
url=pretrained_model_urls['cnn14'],
path=os.path.join(MODEL_HOME, 'panns'))
model.set_state_dict(state_dict)
return model
def cnn10(pretrained: bool=False, extract_embedding: bool=True) -> CNN10:
model = CNN10(extract_embedding=extract_embedding)
if pretrained:
state_dict = load_state_dict_from_url(
url=pretrained_model_urls['cnn10'],
path=os.path.join(MODEL_HOME, 'panns'))
model.set_state_dict(state_dict)
return model
def cnn6(pretrained: bool=False, extract_embedding: bool=True) -> CNN6:
model = CNN6(extract_embedding=extract_embedding)
if pretrained:
state_dict = load_state_dict_from_url(
url=pretrained_model_urls['cnn6'],
path=os.path.join(MODEL_HOME, 'panns'))
model.set_state_dict(state_dict)
return model
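# Illustrative usage sketch (not part of the original file): build CNN14 and
# extract a 2048-dim embedding. The input layout [batch, 1, time, 64] is
# inferred from the transposes and BatchNorm2D(64) in CNN14.forward; the
# random tensor below is a placeholder, not real audio features.
if __name__ == '__main__':
    import paddle
    model = cnn14(pretrained=False, extract_embedding=True)
    model.eval()
    feats = paddle.randn([2, 1, 100, 64])  # (batch, channel, frames, mel bins)
    emb = model(feats)
    print(emb.shape)  # expected: [2, 2048]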

@ -0,0 +1,18 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .download import *
from .env import *
from .error import *
from .log import *
from .time import *

@ -0,0 +1,66 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Dict
from typing import List
from paddle.framework import load as load_state_dict
from paddle.utils import download
from pathos.multiprocessing import ProcessPool
from .log import logger
download.logger = logger
def decompress(file: str):
"""
Extracts all files from a compressed file.
"""
assert os.path.isfile(file), "File: {} does not exist.".format(file)
download._decompress(file)
def download_and_decompress(archives: List[Dict[str, str]],
path: str,
n_workers: int=0):
"""
Download archives and decompress them to a specific path.
"""
if not os.path.isdir(path):
os.makedirs(path)
if n_workers <= 0:
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
download.get_path_from_url(archive['url'], path, archive['md5'])
else:
pool = ProcessPool(nodes=n_workers)
pool.imap(download.get_path_from_url, [_['url'] for _ in archives],
[path] * len(archives), [_['md5'] for _ in archives])
pool.close()
pool.join()
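# Illustrative usage (sketch only; the URL and md5 below are placeholders, not
# real artifacts of this project):
#   archives = [{'url': 'https://example.com/data.tar.gz', 'md5': '<md5sum>'}]
#   download_and_decompress(archives, path='./data', n_workers=0)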
def load_state_dict_from_url(url: str, path: str, md5: str=None):
"""
Download and load a state dict from url
"""
if not os.path.isdir(path):
os.makedirs(path)
download.get_path_from_url(url, path, md5)
return load_state_dict(os.path.join(path, os.path.basename(url)))

@ -0,0 +1,53 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module is used to store environmental variables in PaddleAudio.
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Defaults to ~/.paddleaudio. Users can change the
default value through the PPAUDIO_HOME environment variable.
MODEL_HOME --> Store model files.
DATA_HOME --> Store automatically downloaded datasets.
'''
import os
def _get_user_home():
return os.path.expanduser('~')
def _get_ppaudio_home():
if 'PPAUDIO_HOME' in os.environ:
home_path = os.environ['PPAUDIO_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError(
'The environment variable PPAUDIO_HOME {} is not a directory.'.
format(home_path))
else:
return home_path
return os.path.join(_get_user_home(), '.paddleaudio')
def _get_sub_home(directory):
home = os.path.join(_get_ppaudio_home(), directory)
if not os.path.exists(home):
os.makedirs(home)
return home
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')

@ -0,0 +1,20 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['ParameterError']
class ParameterError(Exception):
"""Exception class for Parameter checking"""
pass

@ -0,0 +1,136 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import functools
import logging
import threading
import time
import colorlog
loggers = {}
log_config = {
'DEBUG': {
'level': 10,
'color': 'purple'
},
'INFO': {
'level': 20,
'color': 'green'
},
'TRAIN': {
'level': 21,
'color': 'cyan'
},
'EVAL': {
'level': 22,
'color': 'blue'
},
'WARNING': {
'level': 30,
'color': 'yellow'
},
'ERROR': {
'level': 40,
'color': 'red'
},
'CRITICAL': {
'level': 50,
'color': 'bold_red'
}
}
class Logger(object):
'''
Default logger in PaddleAudio
Args:
name(str) : Logger name, default is 'PaddleAudio'
'''
def __init__(self, name: str=None):
name = 'PaddleAudio' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(self.__call__,
conf['level'])
self.format = colorlog.ColoredFormatter(
'%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float=0.1):
'''
Continuously print a rotating progress indicator (spinner) alongside the message.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
t = threading.Thread(target=_printer)
t.start()
yield
end = True
logger = Logger()
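# Illustrative usage sketch (appended for documentation only):
if __name__ == '__main__':
    logger.info('a plain message')
    logger.train('a message logged at the custom TRAIN level')
    with logger.processing('pretending to work', interval=0.1):
        time.sleep(0.5)  # the spinner rotates until the block exits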

@ -0,0 +1,67 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
class Timer(object):
'''Calculate running speed and estimated time of arrival (ETA)'''
def __init__(self, total_step: int):
self.total_step = total_step
self.last_start_step = 0
self.current_step = 0
self._is_running = True
def start(self):
self.last_time = time.time()
self.start_time = time.time()
def stop(self):
self._is_running = False
self.end_time = time.time()
def count(self) -> int:
if not self.current_step >= self.total_step:
self.current_step += 1
return self.current_step
@property
def timing(self) -> float:
run_steps = self.current_step - self.last_start_step
self.last_start_step = self.current_step
time_used = time.time() - self.last_time
self.last_time = time.time()
return run_steps / time_used
@property
def is_running(self) -> bool:
return self._is_running
@property
def eta(self) -> str:
if not self.is_running:
return '00:00:00'
scale = self.total_step / self.current_step
remaining_time = (time.time() - self.start_time) * scale
return seconds_to_hms(remaining_time)
def seconds_to_hms(seconds: int) -> str:
'''Convert the number of seconds to hh:mm:ss'''
h = math.floor(seconds / 3600)
m = math.floor((seconds - h * 3600) / 60)
s = int(seconds - h * 3600 - m * 60)
hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
return hms_str
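# Illustrative usage sketch (appended for documentation only):
if __name__ == '__main__':
    timer = Timer(total_step=100)
    timer.start()
    for _ in range(100):
        time.sleep(0.01)  # stand-in for one training step
        timer.count()
    print('steps/sec: {:.1f}, eta: {}'.format(timer.timing, timer.eta))
    timer.stop()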

@ -0,0 +1,48 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import setuptools
# set the version here
version = '0.1.0a'
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="paddleaudio",
version=version,
author="",
author_email="",
description="PaddleAudio, in development",
long_description=long_description,
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
install_requires=[
'numpy >= 1.15.0',
'scipy >= 1.0.0',
'resampy >= 0.2.2',
'soundfile >= 0.9.0',
'colorlog',
'pathos',
],
extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
} # for dev only, install: pip install -e .[dev]
)

@ -0,0 +1,41 @@
# PaddleAudio Testing Guide
# Testing
First clone a version of the project by
```
git clone https://github.com/PaddlePaddle/models.git
```
Then install the project in your virtual environment.
```
cd models/PaddleAudio
python setup.py bdist_wheel
pip install -e .[dev]
```
The requirements for testing will be installed along with PaddleAudio.
Now run
```
pytest test
```
If it goes well, you will see outputs like these:
```
platform linux -- Python 3.7.10, pytest-6.2.4, py-1.10.0, pluggy-0.13.1
rootdir: ./models/PaddleAudio
plugins: hydra-core-1.0.6
collected 16 items
test/unit_test/test_backend.py ........... [ 68%]
test/unit_test/test_features.py ..... [100%]
==================================================== warnings summary ====================================================
.
.
.
-- Docs: https://docs.pytest.org/en/stable/warnings.html
============================================ 16 passed, 11 warnings in 6.76s =============================================
```

@ -0,0 +1,113 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio
import pytest
TEST_FILE = './test/data/test_audio.wav'
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / \
(EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / \
(EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load(TEST_FILE, sr=16000)
print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
return x, r
# start testing
x, r = load_audio()
EPS = 1e-8
def test_load():
s, r = paddleaudio.load(TEST_FILE, sr=16000)
assert r == 16000
assert s.dtype == 'float32'
s, r = paddleaudio.load(
TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
assert len(s) / r == 2.0
assert r == 16000
assert s.dtype == 'int16'
def test_depth_convert():
y = paddleaudio.depth_convert(x, 'int16')
assert len(y) == len(x)
assert y.dtype == 'int16'
assert np.max(y) <= 32767
assert np.min(y) >= -32768
assert np.std(y) > EPS
y = paddleaudio.depth_convert(x, 'int8')
assert len(y) == len(x)
assert y.dtype == 'int8'
assert np.max(y) <= 127
assert np.min(y) >= -128
assert np.std(y) > EPS
# test case for resample
rs_test_data = [
(32000, 'kaiser_fast'),
(16000, 'kaiser_fast'),
(8000, 'kaiser_fast'),
(32000, 'kaiser_best'),
(16000, 'kaiser_best'),
(8000, 'kaiser_best'),
(22050, 'kaiser_best'),
(44100, 'kaiser_best'),
]
@pytest.mark.parametrize('sr,mode', rs_test_data)
def test_resample(sr, mode):
y = paddleaudio.resample(x, 16000, sr, mode=mode)
factor = sr / 16000
err = relative_err(len(y), len(x) * factor)
print('err:', err)
assert err < EPS
def test_normalize():
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
assert np.max(y) < 0.5 + EPS
y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
assert np.max(y) <= 2.0 + EPS
y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
print('np.std(y):', np.std(y))
assert np.abs(np.std(y) - 1.0) < EPS
if __name__ == '__main__':
test_load()
test_depth_convert()
test_resample(22050, 'kaiser_fast')
test_normalize()

@ -0,0 +1,143 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddleaudio as pa
import pytest
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def load_audio():
x, r = librosa.load('./test/data/test_audio.wav')
#x,r = librosa.load('../data/test_audio.wav',sr=16000)
return x, r
## start testing
x, r = load_audio()
EPS = 1e-8
def relative_err(a, b, real=True):
"""compute relative error of two matrices or vectors"""
if real:
return np.sum((a - b)**2) / (EPS + np.sum(a**2) + np.sum(b**2))
else:
err = np.sum((a.real - b.real)**2) / (
EPS + np.sum(a.real**2) + np.sum(b.real**2))
err += np.sum((a.imag - b.imag)**2) / (
EPS + np.sum(a.imag**2) + np.sum(b.imag**2))
return err
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=False, )
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_melspectrogram_db():
a = pa.melspectrogram(
x,
window_size=512,
sr=16000,
hop_length=320,
n_mels=64,
fmin=50,
to_db=True,
ref=1.0,
amin=1e-10,
top_db=None)
b = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_stft():
a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
assert a.shape == b.shape
assert relative_err(a, b, real=False) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_split_frames():
a = librosa.util.frame(x, frame_length=512, hop_length=320)
b = pa.split_frames(x, frame_length=512, hop_length=320)
assert relative_err(a, b) < EPS
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mfcc():
kwargs = {
'window_size': 512,
'hop_length': 320,
'n_mels': 64,
'fmin': 50,
'to_db': False
}
a = pa.mfcc(
x,
#sample_rate=16000,
spect=None,
n_mfcc=20,
dct_type=2,
norm='ortho',
lifter=0,
**kwargs)
S = librosa.feature.melspectrogram(
x,
sr=16000,
n_fft=512,
win_length=512,
hop_length=320,
n_mels=64,
fmin=50)
b = librosa.feature.mfcc(
x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
assert relative_err(a, b) < EPS
if __name__ == '__main__':
test_melspectrogram()
test_melspectrogram_db()
test_stft()
test_split_frames()
test_mfcc()

@ -1,49 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["end_detect"]
def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
"""End detection.
described in Eq. (50) of S. Watanabe et al
"Hybrid CTC/Attention Architecture for End-to-End Speech Recognition"
:param ended_hyps: dict
:param i: int
:param M: int
:param D_end: float
:return: bool
"""
if len(ended_hyps) == 0:
return False
count = 0
best_hyp = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[0]
for m in range(M):
# get ended_hyps with their length is i - m
hyp_length = i - m
hyps_same_length = [
x for x in ended_hyps if len(x["yseq"]) == hyp_length
]
if len(hyps_same_length) > 0:
best_hyp_same_length = sorted(
hyps_same_length, key=lambda x: x["score"], reverse=True)[0]
if best_hyp_same_length["score"] - best_hyp["score"] < D_end:
count += 1
if count == M:
return True
else:
return False

@ -1,54 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module provides functions to calculate bleu score in different level.
e.g. wer for word-level, cer for char-level.
"""
import sacrebleu
__all__ = ['bleu', 'char_bleu']
def bleu(hypothesis, reference):
"""Calculate BLEU. BLEU compares reference text and
hypothesis text in word-level using scarebleu.
:param reference: The reference sentences.
:type reference: list[list[str]]
:param hypothesis: The hypothesis sentence.
:type hypothesis: list[str]
:raises ValueError: If the reference length is zero.
"""
return sacrebleu.corpus_bleu(hypothesis, reference)
def char_bleu(hypothesis, reference):
"""Calculate BLEU. BLEU compares reference text and
hypothesis text in char-level using scarebleu.
:param reference: The reference sentences.
:type reference: list[list[str]]
:param hypothesis: The hypothesis sentence.
:type hypothesis: list[str]
:raises ValueError: If the reference number is zero.
"""
hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
for ref in reference]
return sacrebleu.corpus_bleu(hypothesis, reference)

@ -0,0 +1,3 @@
# echo system
ASR + TTS

@ -0,0 +1,17 @@
#!/bin/bash
mkdir -p data
wav_en=data/en.wav
wav_zh=data/zh.wav
test -e ${wav_en} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav -P data
test -e ${wav_zh} || wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav -P data
pip install paddlehub
asr_en_cmd="import paddlehub as hub; model = hub.Module(name='u2_conformer_librispeech'); print(model.speech_recognize('${wav_en}', device='gpu'))"
asr_zh_cmd="import paddlehub as hub; model = hub.Module(name='u2_conformer_aishell'); print(model.speech_recognize('${wav_zh}', device='gpu'))"
python -c "${asr_en_cmd}"
python -c "${asr_zh_cmd}"

Binary file not shown. (new image, 4.9 KiB)

Binary file not shown. (new image, 108 KiB)

@ -1,35 +0,0 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

@ -1,4 +1,4 @@
myst_parser
myst-parser
numpydoc
recommonmark>=0.5.0
sphinx

@ -0,0 +1,5 @@
.wy-nav-content {
max-width: 80%;
}
.table table{ background:#b9b9b9}
.table table td{ background:#FFF; }

@ -1,80 +0,0 @@
# Getting Started
Several shell scripts provided in `./examples/tiny/local` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead. Besides, if out-of-memory problem occurs, just reduce `batch_size` to fit.
Let's take a tiny sampled subset of [LibriSpeech dataset](http://www.openslr.org/12/) for instance.
- Go to directory
```bash
cd examples/tiny
```
Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (would take several days for training), please go to `examples/librispeech` instead.
- Source env
```bash
source path.sh
```
**Must do this before starting do anything.**
Set `MAIN_ROOT` as project dir. Using defualt `deepspeech2` model as default, you can change this in the script.
- Main entrypoint
```bash
bash run.sh
```
This just a demo, please make sure every `step` is work fine when do next `step`.
More detailed information are provided in the following sections. Wish you a happy journey with the *DeepSpeech on PaddlePaddle* ASR engine!
## Training a model
The key steps of training for Mandarin language are same to that of English language and we have also provided an example for Mandarin training with Aishell in ```examples/aishell/local```. As mentioned above, please execute ```sh data.sh```, ```sh train.sh```, ```sh test.sh``` and ```sh infer.sh``` to do data preparation, training, testing and inference correspondingly. We have also prepared a pre-trained model (downloaded by local/download_model.sh) for users to try with ```sh infer_golden.sh``` and ```sh test_golden.sh```. Notice that, different from English LM, the Mandarin LM is character-based and please run ```local/tune.sh``` to find an optimal setting.
## Speech-to-text Inference
An inference module caller `infer.py` is provided to infer, decode and visualize speech-to-text results for several given audio clips. It might help to have an intuitive and qualitative evaluation of the ASR model's performance.
```bash
CUDA_VISIBLE_DEVICES=0 bash local/infer.sh
```
We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) otherwise utilizes a heuristic breadth-first graph search for reaching a near global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with argument `decoding_method`.
## Evaluate a Model
To evaluate a model's performance quantitatively, please run:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/test.sh
```
The error rate (default: word error rate; can be set with `error_rate_type`) will be printed.
For more help on arguments:
## Hyper-parameters Tuning
The hyper-parameters $\alpha$ (language model weight) and $\beta$ (word insertion weight) for the [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) often have a significant impact on the decoder's performance. It would be better to re-tune them on the validation set when the acoustic model is renewed.
`tune.py` performs a 2-D grid search over the hyper-parameter $\alpha$ and $\beta$. You must provide the range of $\alpha$ and $\beta$, as well as the number of their attempts.
```bash
CUDA_VISIBLE_DEVICES=0 bash local/tune.sh
```
The grid search will print the WER (word error rate) or CER (character error rate) at each point in the hyper-parameters space, and draw the error surface optionally. A proper hyper-parameters range should include the global minima of the error surface for WER/CER, as illustrated in the following figure.
<p align="center">
<img src="images/tuning_error_surface.png" width=550>
<br/>An example error surface for tuning on the dev-clean set of LibriSpeech
</p>
Usually, as the figure shows, the variation of language model weight ($\alpha$) significantly affect the performance of CTC beam search decoder. And a better procedure is to first tune on serveral data batches (the number can be specified) to find out the proper range of hyper-parameters, then change to the whole validation set to carray out an accurate tuning.
After tuning, you can reset $\alpha$ and $\beta$ in the inference and evaluation modules to see if they really help improve the ASR performance. For more help

@ -1,6 +1,5 @@
# Deepspeech2
## Streaming
# Models introduction
## Streaming DeepSpeech2
The implemented architecture of the Deepspeech2 online model is based on the [Deepspeech2 model](https://arxiv.org/pdf/1512.02595.pdf) with some changes.
The model is mainly composed of a 2D convolution subsampling layer and stacked single-direction RNN layers.
@ -14,21 +13,22 @@ In addition, the training process and the testing process are also introduced.
The architecture of the model is shown in Fig.1.
<p align="center">
<img src="../images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the deepspeech2 online model
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2onlineModel.png" width=800>
<br/>Fig.1 The architecture of the deepspeech2 online model
</p>
### Data Preparation
#### Vocabulary
For English data, the vocabulary dictionary is composed of the 26 English characters plus " ' ", space, \<blank\>, \<unk\> and \<eos\>. The \<blank\> represents the blank label in CTC, the \<unk\> represents the unknown character and the \<eos\> represents the start and end characters. For Mandarin, the vocabulary dictionary is composed of the Chinese characters collected from the training set, plus the same three additional characters: \<blank\>, \<unk\> and \<eos\>. For both English and Mandarin data, the default indices are \<blank\>=0, \<unk\>=1 and \<eos\>=last index.
```
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# The code to build vocabulary
cd examples/aishell/s0
python3 ../../../utils/build_vocab.py \
--unit_type="char" \
--count_threshold=0 \
--vocab_path="data/vocab.txt" \
--manifest_paths "data/manifest.train.raw" "data/manifest.dev.raw"
# vocabulary for aishell dataset (Mandarin)
vi examples/aishell/s0/data/vocab.txt
@ -40,36 +40,36 @@ vi examples/librispeech/s0/data/vocab.txt
#### CMVN
For CMVN, a subset (or all) of the training set is chosen and used to compute the feature mean and std, which are later applied to normalize the features (see the sketch below).
```
# The code to compute the feature mean and std
# The code to compute the feature mean and std
cd examples/aishell/s0
python3 ../../../utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
--manifest_path="data/manifest.train.raw" \
--spectrum_type="linear" \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=20.0 \
--sample_rate=16000 \
--use_dB_normalization=True \
--num_samples=2000 \
--num_workers=10 \
--output_path="data/mean_std.json"
```
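The statistics written to `mean_std.json` are then used to standardize every feature dimension. A minimal sketch of this step (illustrative only, not the toolkit's own implementation; it assumes the mean/std vectors have already been loaded from the JSON file):
```python
import numpy as np

def apply_cmvn(feat: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
    """feat: [num_frames, feat_dim]; mean/std: [feat_dim] statistics
    produced by compute_mean_std.py above."""
    # Subtract the global mean and divide by the global std per dimension.
    return (feat - mean) / (std + 1e-20)
```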
#### Feature Extraction
For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
Currently, the released deepspeech2 online model uses the linear feature extraction method.
```
The code for feature extraction
vi deepspeech/frontend/featurizer/audio_featurizer.py
```
For feature extraction, three methods are implemented, which are linear (FFT without using filter bank), fbank and mfcc.
Currently, the released deepspeech2 online model uses the linear feature extraction method.
```
The code for feature extraction
vi paddlespeech/s2t/frontend/featurizer/audio_featurizer.py
```
### Encoder
The encoder is composed of two 2D convolution subsampling layers and a number of stacked single-direction RNN layers. The 2D convolution subsampling layers extract feature representations from the raw audio features and reduce their length at the same time. After passing through the convolution subsampling layers, the feature representations are fed into the stacked RNN layers. For the stacked RNN layers, both LSTM and GRU cells are available. Adding one fully connected (fc) layer after the stacked RNN layers is optional; if the number of stacked RNN layers is less than 5, adding one fc layer is recommended.
The code of Encoder is in:
```
vi deepspeech/models/ds2_online/deepspeech2.py
vi paddlespeech/s2t/models/ds2_online/deepspeech2.py
```
### Decoder
@ -78,16 +78,16 @@ To get the character possibilities of each frame, the feature representation of
The code of the decoder is in:
```
# The code of constructing the decoder in model
vi deepspeech/models/ds2_online/deepspeech2.py
vi paddlespeech/s2t/models/ds2_online/deepspeech2.py
# The code of CTC Decoder
vi deepspeech/modules/ctc.py
vi paddlespeech/s2t/modules/ctc.py
```
## Training Process
### Training Process
Using the command below, you can train the deepspeech2 online model.
```
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
@ -126,10 +126,10 @@ fi
By using the command above, the training process can be started. There are 5 stages in "run.sh", and the first 3 stages are used for the training process. Stage 0 is used for data preparation, in which the dataset is downloaded and the manifest files of the datasets, the vocabulary dictionary and the CMVN file are generated in "./data/". Stage 1 is used for training the model; the log files and model checkpoints are saved in "exp/deepspeech2_online/". Stage 2 is used to generate the final model for prediction by averaging the top-k model parameters based on validation loss.
## Testing Process
### Testing Process
Using the command below, you can test the deepspeech2 online model.
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
bash run.sh --stage 3 --stop_stage 5 --model_type online --conf_path conf/deepspeech2_online.yaml
```
The detail commands are:
```
@ -138,7 +138,7 @@ avg_num=1
model_type=online
avg_ckpt=avg_${avg_num}
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=2 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
fi
@ -152,37 +152,35 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
fi
```
```
After the training process, we use stages 3, 4 and 5 for the testing process. Stage 3 tests the model generated in stage 2 and provides the CER on the test set. Stage 4 transforms the model from a dynamic graph to a static graph by using the "paddle.jit" library. Stage 5 tests the model in static-graph mode.
## Non-Streaming
## Non-Streaming DeepSpeech2
The deepspeech2 offline model is similar to the deepspeech2 online model. The main difference is that the offline model uses stacked bi-directional RNN layers, while the online model uses single-direction RNN layers and does not use the fc layer. For the stacked bi-directional RNN layers in the offline model, both vanilla RNN and GRU cells are available.
The architecture of the model is shown in Fig.2.
<p align="center">
<img src="../images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the deepspeech2 offline model
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/ds2offlineModel.png" width=800>
<br/>Fig.2 The architecture of the deepspeech2 offline model
</p>
For data preparation and the decoder, the deepspeech2 offline model is the same as the deepspeech2 online model.
The code of encoder and decoder for deepspeech2 offline model is in:
```
vi deepspeech/models/ds2/deepspeech2.py
vi paddlespeech/s2t/models/ds2/deepspeech2.py
```
The training and testing processes of the deepspeech2 offline model are very similar to those of the deepspeech2 online model.
Only a few changes should be noticed.
For training and testing, the "model_type" and the "conf_path" must be set accordingly.
```
```
# Training offline
cd examples/aishell/s0
bash run.sh --stage 0 --stop_stage 2 --model_type offline --conf_path conf/deepspeech2.yaml
```
```
```
# Testing offline
cd examples/aishell/s0

@ -0,0 +1,40 @@
# Quick Start of Speech-To-Text
Several shell scripts provided in `./examples/tiny/local` will help you quickly try out most major modules, including data preparation, model training, case inference and model evaluation, with a few public datasets (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead. Besides, if an out-of-memory problem occurs, just reduce `batch_size`.
Let's take a tiny sampled subset of [LibriSpeech dataset](http://www.openslr.org/12/) for instance.
- Go to directory
```bash
cd examples/tiny
```
Notice that this is only a toy example with a tiny sampled subset of LibriSpeech. If you would like to try with the complete dataset (would take several days for training), please go to `examples/librispeech` instead.
- Source env
```bash
source path.sh
```
**You must do this before doing anything else.**
This sets `MAIN_ROOT` to the project dir and uses the default `deepspeech2` model as `MODEL`; you can change this in the script.
- Main entrypoint
```bash
bash run.sh
```
This is just a demo; please make sure every `step` works well before moving on to the next `step`.
More detailed information is provided in the following sections. Wish you a happy journey with the *DeepSpeech on PaddlePaddle* ASR engine!
## Training a model
The key steps of training for Mandarin are the same as those for English, and we have also provided an example for Mandarin training with Aishell in ```examples/aishell/local```. As mentioned above, please execute ```sh data.sh```, ```sh train.sh``` and ```sh test.sh``` to do data preparation, training, and testing respectively.
## Evaluate a Model
To evaluate a model's performance quantitatively, please run:
```bash
CUDA_VISIBLE_DEVICES=0 bash local/test.sh
```
The error rate (default: word error rate; can be set with `error_rate_type`) will be printed.
We provide two types of CTC decoders: *CTC greedy decoder* and *CTC beam search decoder*. The *CTC greedy decoder* is an implementation of the simple best-path decoding algorithm, selecting at each timestep the most likely token, thus being greedy and locally optimal. The [*CTC beam search decoder*](https://arxiv.org/abs/1408.2873) otherwise utilizes a heuristic breadth-first graph search for reaching a near global optimality; it also requires a pre-trained KenLM language model for better scoring and ranking. The decoder type can be set with argument `decoding_method`.
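As an illustration of the greedy decoder (a sketch, not the toolkit's implementation), the best path is simply the arg-max token at each frame, with repeated tokens collapsed and the blank symbol (index 0 in the default vocabularies) removed:
```python
import numpy as np

def ctc_greedy_decode(log_probs: np.ndarray, blank_id: int = 0) -> list:
    """log_probs: [num_frames, vocab_size] frame-level (log-)posteriors."""
    best_path = np.argmax(log_probs, axis=1)     # most likely token per frame
    decoded, prev = [], None
    for token in best_path:
        if token != prev and token != blank_id:  # collapse repeats, drop blanks
            decoded.append(int(token))
        prev = token
    return decoded
```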

@ -1,8 +0,0 @@
# Reference
We refer these repos to build `model` and `engine`:
* [delta](https://github.com/Delta-ML/delta.git)
* [espnet](https://github.com/espnet/espnet.git)
* [kaldi](https://github.com/kaldi-asr/kaldi.git)
* [wenet](https://github.com/mobvoi/wenet)

@ -1,28 +0,0 @@
# Released Models
## Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
## Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
## Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings

@ -23,11 +23,13 @@
import recommonmark.parser
import sphinx_rtd_theme
autodoc_mock_imports = ["soundfile", "librosa"]
# -- Project information -----------------------------------------------------
project = 'paddle speech'
copyright = '2021, Deepspeech-developers'
author = 'Deepspeech-developers'
copyright = '2021, paddlespeech-developers'
author = 'paddlespeech-developers'
# The full version, including alpha/beta/rc tags
release = '2.1'
@ -46,10 +48,10 @@ pygments_style = 'sphinx'
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx_rtd_theme',
"sphinx_rtd_theme",
'sphinx.ext.mathjax',
'sphinx.ext.autosummary',
'numpydoc',
'sphinx.ext.autosummary',
'myst_parser',
]
@ -76,6 +78,10 @@ smartquotes = False
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_logo = '../images/paddle.png'
html_css_files = [
'custom.css',
]
# -- Extension configuration -------------------------------------------------
# numpydoc_show_class_members = False

@ -0,0 +1,36 @@
# The Dependencies
## By apt-get
### The base dependencies:
```
bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libvorbis-dev libboost-dev swig python3-dev
```
### The dependencies of kenlm:
```
build-essential cmake libboost-system-dev libboost-thread-dev libboost-program-options-dev libboost-test-dev libeigen3-dev zlib1g-dev libbz2-dev liblzma-dev gcc-5 g++-5
```
### The dependencies of sox:
```
libvorbis-dev libmp3lame-dev libmad-ocaml-dev
```
## By make or setup
```
kenlm
sox
mfa
openblas
kaldi
sctk
AutoLog
swig-decoder
python_kaldi_features
```

@ -1,7 +1,7 @@
Welcome to paddle Deepspeech documentation !
Welcome to PaddleSpeech documentation!
==============================================
**Deepspeech** is a Speech toolkits implemented by paddlepaddle.
**PaddleSpeech** is a speech toolkit implemented on the PaddlePaddle platform.
Contents
@ -10,34 +10,44 @@ Contents
.. toctree::
:maxdepth: 1
:caption: Introduction
asr/deepspeech_architecture
introduction
.. toctree::
:maxdepth: 1
:caption: Getting_started
asr/install
asr/getting_started
:caption: Quick Start
install
asr/quick_start
tts/quick_start
.. toctree::
:maxdepth: 1
:caption: More Information
:caption: Speech-To-Text
asr/models_introduction
asr/data_preparation
asr/augmentation
asr/feature_list
asr/ngram_lm
asr/ngram_lm
.. toctree::
:maxdepth: 1
:caption: Released_model
:caption: Text-To-Speech
asr/released_model
tts/basic_usage
tts/advanced_usage
tts/zh_text_frontend
tts/models_introduction
tts/gan_vocoder
tts/demo
tts/demo_2
.. toctree::
:maxdepth: 1
:caption: Released Models
released_model
.. toctree::
:maxdepth: 1
@ -45,3 +55,8 @@ Contents
asr/reference

@ -8,7 +8,7 @@ To avoid the trouble of environment setup, [running in Docker container](#runnin
## Setup (Important)
- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox, and `swig`, e.g. installing them via `apt-get`:
- Make sure these libraries or tools installed: `pkg-config`, `flac`, `ogg`, `vorbis`, `boost`, `sox`, and `swig`, e.g. installing them via `apt-get`:
```bash
sudo apt-get install -y sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
@ -33,9 +33,9 @@ make install
```bash
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pushd tools; make; popd
pushd tools; make virtualenv.done; popd
source tools/venv/bin/activate
bash setup.sh
pip install -e .
```
- Source the venv before doing experiments.
@ -44,6 +44,14 @@ bash setup.sh
source tools/venv/bin/activate
```
## Simple Setup
```bash
git clone https://github.com/PaddlePaddle/DeepSpeech.git
cd DeepSpeech
pip install -e .
```
## Running in Docker Container (optional)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure it is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.

@ -0,0 +1,33 @@
# PaddleSpeech
## What is PaddleSpeech?
PaddleSpeech is an open-source toolkit on the PaddlePaddle platform for two critical tasks in Speech - Speech-To-Text (Automatic Speech Recognition, ASR) and Text-To-Speech Synthesis (TTS), with modules involving state-of-the-art and influential models.
## What can PaddleSpeech do?
### Speech-To-Text
(An introduction to ASR in PaddleSpeech is needed here!)
### Text-To-Speech
TTS mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
- Ready-to-run experiments.
PaddleSpeech TTS provides you with a complete TTS pipeline, including:
- Text FrontEnd
- Rule based Chinese frontend.
- Acoustic Models
- FastSpeech2
- SpeedySpeech
- TransformerTTS
- Tacotron2
- Vocoders
- Multi Band MelGAN
- Parallel WaveGAN
- WaveFlow
- Voice Cloning
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Text-To-Speech helps you to train TTS models with simple commands.

@ -0,0 +1,37 @@
# Reference
We borrowed a lot of code from these repos to build `model` and `engine`; thanks for their great work:
* [espnet](https://github.com/espnet/espnet/blob/master/LICENSE)
- Apache-2.0 License
- python/shell `utils`
- kaldi feat preprocessing
- datapipeline and `transform`
- a lot of TTS models, like `fastspeech2` and GAN-based vocoders
* [wenet](https://github.com/wenet-e2e/wenet/blob/main/LICENSE)
- Apache-2.0 License
- U2 model
- Building TLG based Graph
* [kaldi](https://github.com/kaldi-asr/kaldi/blob/master/COPYING)
- Apache-2.0 License
- shell/perl/python utils.
- feature bins.
- WFST based decoding for LM integration.
* [delta](https://github.com/Delta-ML/delta/blob/master/LICENSE)
- Apache-2.0 License
- `engine` arch
* [speechbrain](https://github.com/speechbrain/speechbrain/blob/develop/LICENSE)
- Apache-2.0 License
- ECAPA-TDNN SV model
* [chainer](https://github.com/chainer/chainer/blob/master/LICENSE)
- MIT License
- Updater, Trainer and more utils.
* [librosa](https://github.com/librosa/librosa/blob/main/LICENSE.md)
- ISC License
- Audio feature

@ -0,0 +1,55 @@
# Released Models
## Speech-To-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :---------
[Ds2 Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds_online.5rnn.debug.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.0824 |-| 151 h
[Ds2 Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s0/aishell.s0.ds2.offline.cer6p65.release.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.065 |-| 151 h
[Conformer Online Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention + CTC | 0.0594 |-| 151 h
[Conformer Offline Aishell Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention | 0.0547 |-| 151 h
[Conformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | Word-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention |-| 0.0325 | 960 h
[Transformer Librispeech Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/transformer.release.tar.gz) | Librispeech Dataset | Word-based | 195 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention |-| 0.0544 | 960 h
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
### Language Model Released
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
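If you want to quickly sanity-check one of these binaries outside the decoder, here is a minimal sketch using the `kenlm` Python package (an assumption on tooling; the released files are standard KenLM 'trie'/'probing' binaries). The Mandarin models are char-based, so tokens are space-separated characters:
```python
import kenlm  # assumes the kenlm Python package is installed

# load one of the released binaries, e.g. the small Mandarin LM above
lm = kenlm.Model("zh_giga.no_cna_cmn.prune01244.klm")

# char-based LM: tokens are space-separated characters
sentence = "今 天 天 气 不 错"
print(lm.score(sentence, bos=True, eos=True))  # log10 probability of the sentence
print(lm.perplexity(sentence))
```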
## Text-To-Speech Models
### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)
SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)
FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_vctk_ckpt_0.5.zip)
### Vocoders
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_vctk_ckpt_0.5.zip)
### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----
GE2E| AISHELL-3, etc. |[ge2e](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)
GE2E + Tacotron2| AISHELL-3 |[ge2e-tacotron2-aishell3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)
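The pretrained checkpoints above are plain zip archives; a minimal sketch of fetching and unpacking one of them (using the FastSpeech2 CSMSC archive from the table above) could look like this:
```python
import urllib.request
import zipfile

# URL taken from the table above (FastSpeech2 trained on CSMSC)
url = "https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip"
archive = "fastspeech2_nosil_baker_ckpt_0.4.zip"

urllib.request.urlretrieve(url, archive)  # download the archive
with zipfile.ZipFile(archive) as zf:
    zf.extractall(".")  # yields default.yaml, snapshot_iter_*.pdz, *_stats.npy, ...
```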

@ -2,10 +2,11 @@
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.
<div align="center">
<img src="docs/images/logo.png" width=300 /> <br>
<img src="../../images/logo.png" width=300 /> <br>
</div>
## News <img src="./docs/images/news_icon.png" width="40"/>
## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).

@ -1,6 +1,5 @@
# Advanced Usage
This sections covers how to extend parakeet by implementing your own models and experiments. Guidelines on implementation are also elaborated.
This section covers how to extend PaddleSpeech TTS by implementing your own models and experiments. Guidelines on implementation are also elaborated.
For the general deep learning experiment, there are several parts to deal with:
1. Preprocess the data according to the needs of the model, and iterate the dataset by batch.
@ -8,7 +7,7 @@ For the general deep learning experiment, there are several parts to deal with:
3. Write out the training process (generally including forward / backward calculation, parameter update, log recording, visualization, periodic evaluation, etc.).
5. Configure and run the experiment.
## Parakeet's Model Components
## PaddleSpeech TTS's Model Components
In order to balance the reusability and function of models, we divide models into several types according to their characteristics.
For the commonly used modules that can be used as part of other larger models, we try to implement them as simply and universally as possible, because they will be reused. Modules with trainable parameters are generally implemented as subclasses of `paddle.nn.Layer`. Modules without trainable parameters can be directly implemented as a function whose inputs and outputs are `paddle.Tensor`s.
@ -68,11 +67,11 @@ There are two common ways to define a model which consists of several modules.
```
When a model is complicated and made up of several components, each of which has a separate functionality and can be replaced by other components with the same functionality, we prefer to define it in this way.
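A minimal sketch of this compositional style (hypothetical module names and sizes, not code from `paddlespeech.t2s`):
```python
import paddle
from paddle import nn

class ComposedModel(nn.Layer):
    """A model assembled from replaceable components with fixed roles."""

    def __init__(self, encoder: nn.Layer, decoder: nn.Layer):
        super().__init__()
        self.encoder = encoder  # any component mapping inputs to hidden states
        self.decoder = decoder  # any component mapping hidden states to outputs

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        return self.decoder(self.encoder(x))

# components can be swapped freely as long as the interfaces match
model = ComposedModel(encoder=nn.Linear(80, 256), decoder=nn.Linear(256, 80))
out = model(paddle.randn([4, 80]))
```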
In the directory structure of Parakeet, modules with high reusability are placed in `parakeet.modules`, but models for specific tasks are placed in `parakeet.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
In the directory structure of PaddleSpeech TTS, modules with high reusability are placed in `paddlespeech.t2s.modules`, while models for specific tasks are placed in `paddlespeech.t2s.models`. When developing a new model, developers need to consider the feasibility of splitting the modules, and the degree of generality of the modules, and place them in appropriate directories.
## Parakeet's Data Components
## PaddleSpeech TTS's Data Components
Another critical component for a deep learning project is data.
Parakeet uses the following methods for training data:
PaddleSpeech TTS uses the following methods for training data:
1. Preprocess the data.
2. Load the preprocessed data for training.
@ -94,7 +93,7 @@ Then we need to select a format for saving metadata to the hard disk. There are
Meanwhile, `cache` is added here, and a multi-process Manager is used to share memory between multiple processes. When `num_workers` is used, it is guaranteed that each subprocess will not cache a separate copy.
The implementation of `DataTable` can be found in `parakeet/datasets/data_table.py`.
The implementation of `DataTable` can be found in `paddlespeech/t2s/datasets/data_table.py`.
```python
class DataTable(Dataset):
"""Dataset to load and convert data for general purpose.
@ -154,7 +153,7 @@ def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
return example
```
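As a rough, self-contained illustration of the idea (a simplification for this guide, not the actual `DataTable` implementation): metadata records are kept in memory and converter functions are applied per field when an example is requested.
```python
from typing import Any, Callable, Dict, List, Optional

from paddle.io import Dataset


class MiniDataTable(Dataset):
    """Simplified DataTable-like dataset: stores metadata records and applies
    per-field converter functions when an example is requested."""

    def __init__(self,
                 data: List[Dict[str, Any]],
                 converters: Optional[Dict[str, Callable[[Any], Any]]] = None):
        self.data = data
        self.converters = converters or {}

    def _convert(self, meta_datum: Dict[str, Any]) -> Dict[str, Any]:
        # fields without a converter are passed through unchanged
        return {key: self.converters.get(key, lambda v: v)(value)
                for key, value in meta_datum.items()}

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        return self._convert(self.data[idx])

    def __len__(self) -> int:
        return len(self.data)


# e.g. load the waveform lazily from the path stored in the metadata:
# table = MiniDataTable(metadata, converters={"wave": np.load})
```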
## Parakeet's Training Components
## PaddleSpeech TTS's Training Components
A typical training process includes the following processes:
1. Iterate the dataset.
2. Process batch data.
@ -164,7 +163,7 @@ A typical training process includes the following processes:
6. Write logs, visualize, and in some cases save necessary intermediate results.
7. Save the state of the model and optimizer.
Here, we mainly introduce the training related components of Parakeet and why we designed it like this.
Here, we mainly introduce the training-related components of PaddleSpeech TTS and why we designed them like this.
### Global Reporter
When training and modifying Deep Learning models, logging is often needed, and it has even become key to model debugging and modification. We usually use various visualization tools, such as `visualdl` in `paddle`, `tensorboard` in `tensorflow`, and `visdom`, `wnb`, etc. Besides, `logging` and `print` are usually used for different purposes.
@ -180,9 +179,9 @@ We think this method is a little ugly. We prefer to return the necessary informa
It takes advantage of the globality of Python's module level variables and the effect of context manager.
There is a module level variable in `parakeet/training/reporter.py` `OBSERVATIONS`which is a `Dict` to store key-value.
There is a module-level variable `OBSERVATIONS` in `paddlespeech/t2s/training/reporter.py`, which is a `Dict` that stores key-value pairs.
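The mechanism can be sketched as a small self-contained toy version (a simplification for illustration, not the contents of `reporter.py`; an excerpt of the real file follows):
```python
import contextlib
from typing import Dict, Optional

OBSERVATIONS: Optional[Dict] = None  # module-level storage for reported values


@contextlib.contextmanager
def scope(observations: Dict):
    """Temporarily install `observations` as the active dict."""
    global OBSERVATIONS
    old = OBSERVATIONS
    OBSERVATIONS = observations
    try:
        yield
    finally:
        OBSERVATIONS = old


def report(name: str, value) -> None:
    """Called inside components; the value lands in whatever dict the caller scoped."""
    if OBSERVATIONS is not None:
        OBSERVATIONS[name] = value


# the caller decides where reported values go
observation = {}
with scope(observation):
    report("loss", 0.25)
print(observation)  # {'loss': 0.25}
```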
```python
# parakeet/training/reporter.py
# paddlespeech/t2s/training/reporter.py
@contextlib.contextmanager
def scope(observations):
@ -245,7 +244,7 @@ def test_reporter_scope():
In this way, when we write modular components, we can directly call `report`. The caller decides where to report: once an observation `Dict` is ready, it opens a `scope` with it and calls the component within this `scope`.
The `Trainer` in Parakeet report the information in this way.
The `Trainer` in PaddleSpeech TTS reports the information in this way.
```python
while True:
self.observation = {}
@ -269,7 +268,7 @@ We made an abstraction for these intermediate processes, that is, `Updater`, whi
### Visualizer
Because we choose observation as the communication mode, we can simply write the values in the observation into the `visualizer`.
## Parakeet's Configuration Components
## PaddleSpeech TTS's Configuration Components
Deep learning experiments often have many options to configure. These configurations can be roughly divided into several categories.
1. Data source and data processing mode configuration.
2. Save path configuration of experimental results.
@ -293,28 +292,26 @@ The following is the basic `ArgumentParser`:
3. `--output-dir` is the directory to save the training results. (If there are checkpoints in `checkpoints/` of `--output-dir`, the newest checkpoint is reloaded by default to resume training.)
4. `--device` and `--nprocs` determine the operation mode. `--device` specifies the type of running device: whether to run on `cpu` or `gpu`. `--nprocs` refers to the number of training processes; if `nprocs` > 1, multi-process parallel training is used. (Note: currently only GPU multi-card multi-process training is supported.)
Developers can refer to the examples in `Parakeet/examples` to write the default configuration file when adding new experiments.
Developers can refer to the examples in `examples` to write the default configuration file when adding new experiments.
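A minimal sketch of such a parser, assuming standard `argparse` and the option names described above (`--config` is included here as an assumed option for the yaml config):
```python
import argparse


def default_argument_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="default training arguments")
    parser.add_argument("--config", type=str,
                        help="path to the default yaml config (assumed option)")
    parser.add_argument("--output-dir", type=str, default="exp/default",
                        help="dir to save training results; the newest checkpoint "
                             "in <output-dir>/checkpoints is reloaded if present")
    parser.add_argument("--device", type=str, default="gpu", choices=["cpu", "gpu"],
                        help="type of running device")
    parser.add_argument("--nprocs", type=int, default=1,
                        help="number of training processes; >1 enables "
                             "multi-process (GPU only) parallel training")
    return parser


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print(args)
```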
## Parakeet's Experiment template
## PaddleSpeech TTS's Experiment template
The experimental codes in Parakeet are generally organized as follows:
The experimental codes in PaddleSpeech TTS are generally organized as follows:
```text
├── conf
│ └── default.yaml (defalut config)
├── README.md (help information)
├── batch_fn.py (organize metadata into batch)
├── config.py (code to read default config)
├── *_updater.py (Updater of a specific model)
├── preprocess.py (data preprocessing code)
├── preprocess.sh (script to call data preprocessing.py)
├── synthesis.py (synthesis from metadata)
├── synthesis.sh (script to call synthesis.py)
├── synthesis_e2e.py (synthesis from raw text)
├── synthesis_e2e.sh (script to call synthesis_e2e.py)
├── train.py (train code)
└── run.sh (script to call train.py)
.
├── README.md (help information)
├── conf
│ └── default.yaml (default config)
├── local
│ ├── preprocess.sh (script to call preprocess.py)
│ ├── synthesize.sh (script to call synthesis.py)
│ ├── synthesize_e2e.sh (script to call synthesis_e2e.py)
│ └── train.sh (script to call train.py)
├── path.sh (script that includes paths to be sourced)
└── run.sh (script to call the scripts in local)
```
The `*.py` files called by the above `*.sh` scripts are located in `${BIN_DIR}/`.
We add a named argument `--output-dir` to each training script to specify the output directory. The directory structure is as follows; it's best for developers to follow this specification:
```text
@ -330,4 +327,4 @@ exp/default/
└── test/ (output dir of synthesis results)
```
You can view the examples we provide in `Parakeet/examples`. These experiments are provided to users as examples which can be run directly. Users are welcome to add new models and experiments and contribute code to Parakeet.
You can view the examples we provide in `examples`. These experiments are provided to users as examples which can be run directly. Users are welcome to add new models and experiments and contribute code to PaddleSpeech.

@ -1,115 +0,0 @@
# Basic Usage
This section shows how to use pretrained models provided by parakeet and make inference with them.
Pretrained models in v0.4 are provided in a archive. Extract it to get a folder like this:
```
checkpoint_name/
├──default.yaml
├──snapshot_iter_76000.pdz
├──speech_stats.npy
└──phone_id_map.txt
```
`default.yaml` stores the config used to train the model.
`snapshot_iter_N.pdz` is the chechpoint file, where `N` is the steps it has been trained.
`*_stats.npy` is the stats file of feature if it has been normalized before training.
`phone_id_map.txt` is the map of phonemes to phoneme_ids.
The example code below shows how to use the models for prediction.
## Acoustic Models (text to spectrogram)
The code below show how to use a `FastSpeech2` model. After loading the pretrained model, use it and normalizer object to construct a prediction objectthen use fastspeech2_inferencet(phone_ids) to generate spectrograms, which can be further used to synthesize raw audio with a vocoder.
```python
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from parakeet.models.fastspeech2 import FastSpeech2
from parakeet.models.fastspeech2 import FastSpeech2Inference
from parakeet.modules.normalizer import ZScore
# Parakeet/examples/fastspeech2/baker/frontend.py
from frontend import Frontend
# load the pretrained model
checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4")
with open(checkpoint_dir / "phone_id_map.txt", "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
with open(checkpoint_dir / "default.yaml") as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
# load stats file
stat = np.load(checkpoint_dir / "speech_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
# construct a prediction object
fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
# load Chinese Frontend
frontend = Frontend(checkpoint_dir / "phone_id_map.txt")
# text to spectrogram
sentence = "你好吗?"
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
# The output of Chinese text frontend is segmented
for part_phone_ids in phone_ids:
with paddle.no_grad():
temp_mel = fastspeech2_inference(part_phone_ids)
if flags == 0:
mel = temp_mel
flags = 1
else:
mel = paddle.concat([mel, temp_mel])
```
## Vocoder (spectrogram to wave)
The code below show how to use a ` Parallel WaveGAN` model. Like the example above, after loading the pretrained model, use it and normalizer object to construct a prediction objectthen use pwg_inference(mel) to generate raw audio (in wav format).
```python
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
# load the pretrained model
checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")
with open(checkpoint_dir / "pwg_default.yaml") as f:
pwg_config = CfgNode(yaml.safe_load(f))
vocoder = PWGGenerator(**pwg_config["generator_params"])
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
# load stats file
stat = np.load(checkpoint_dir / "pwg_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
# construct a prediction object
pwg_inference = PWGInference(pwg_normalizer, vocoder)
# spectrogram to wave
wav = pwg_inference(mel)
sf.write(
audio_path,
wav.numpy(),
samplerate=fastspeech2_config.fs)
```

File diff suppressed because it is too large

@ -0,0 +1,287 @@
Audio Sample (PaddleSpeech TTS VS ESPnet TTS)
==============================================
This is an audio demo page to contrast PaddleSpeech TTS and ESPnet TTS. We use their respective modules (Text Frontend, Acoustic Model and Vocoder) here.
We use ESPnet's released models here.
FastSpeech2 + Parallel WaveGAN in CSMSC
.. raw:: html
<div class="table">
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Text </th>
<th align="center"> Espent TTS </th>
<th align="center"> PaddleSpeech TTS </th>
</tr>
<tr>
<td>早上好今天是2020/10/29最低温度是-3°C。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>你好我的编号是37249很高兴为您服务。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我们公司有37249个人。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我出生于2005年10月8日。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我们习惯在12:30吃中午饭。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>只要有超过3/4的人投票同意你就会成为我们的新班长。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我要买一只价值999.9元的手表。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>我的手机号是18544139121欢迎来电。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>明天有62%的概率降雨。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>手表厂有五种好产品。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/010.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>跑马场有五百匹很勇敢的千里马。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/011.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/011.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/012.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/012.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>史小姐拿着小雨伞去找她的老保姆了。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/013.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/013.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>不要相信这个老奶奶说的话,她一点儿也不好。</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/014.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/014.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
</div>

@ -0,0 +1,9 @@
# GAN Vocoders
This is a brief introduction to GAN vocoders; we mainly introduce the losses of different vocoders here.
Model | Generator Loss |Discriminator Loss
:-------------:| :------------:| :-----
Parallel Wave GAN| adversarial loss <br> Feature Matching | Multi-Scale Discriminator |
Mel GAN |adversarial loss <br> Multi-resolution STFT loss | adversarial loss|
Multi-Band Mel GAN | adversarial loss <br> full band Multi-resolution STFT loss <br> sub band Multi-resolution STFT loss |Multi-Scale Discriminator|
HiFi GAN |adversarial loss <br> Feature Matching <br> Mel-Spectrogram Loss | Multi-Scale Discriminator <br> Multi-Period Discriminator |
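As a rough illustration of the adversarial terms above, one common (LSGAN-style) formulation is sketched below; the actual vocoders combine this with their own auxiliary losses (multi-resolution STFT, feature matching, mel-spectrogram loss), so this is not the exact code of any of them.
```python
import paddle
import paddle.nn.functional as F


def generator_adversarial_loss(d_fake: paddle.Tensor) -> paddle.Tensor:
    # LSGAN-style: push the discriminator output on generated audio towards 1
    return F.mse_loss(d_fake, paddle.ones_like(d_fake))


def discriminator_adversarial_loss(d_real: paddle.Tensor,
                                   d_fake: paddle.Tensor) -> paddle.Tensor:
    # push D(real) towards 1 and D(fake) towards 0
    real_loss = F.mse_loss(d_real, paddle.ones_like(d_real))
    fake_loss = F.mse_loss(d_fake, paddle.zeros_like(d_fake))
    return real_loss + fake_loss
```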

@ -1,45 +0,0 @@
.. parakeet documentation master file, created by
sphinx-quickstart on Fri Sep 10 14:22:24 2021.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Parakeet
====================================
``parakeet`` is a deep learning based text-to-speech toolkit built upon ``paddlepaddle`` framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
``parakeet`` mainly consists of components below.
#. Implementation of models and commonly used neural network layers.
#. Dataset abstraction and common data preprocessing pipelines.
#. Ready-to-run experiments.
.. toctree::
:maxdepth: 1
:caption: Introduction
introduction
.. toctree::
:maxdepth: 1
:caption: Getting started
install
basic_usage
advanced_usage
cn_text_frontend
released_models
.. toctree::
:maxdepth: 1
:caption: Demos
demo
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

@ -1,47 +0,0 @@
# Installation
## Install PaddlePaddle
Parakeet requires PaddlePaddle as its backend. Note that 2.1.2 or newer versions of paddle is required.
Since paddlepaddle has multiple packages depending on the device (cpu or gpu) and the dependency libraries, it is recommended to install a proper package of paddlepaddle with respect to the device and dependency library versons via `pip`.
Installing paddlepaddle with conda or build paddlepaddle from source is also supported. Please refer to [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) for more details.
Example instruction to install paddlepaddle via pip is listed below.
### PaddlePaddle with GPU
```python
# PaddlePaddle for CUDA10.1
python -m pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# PaddlePaddle for CUDA10.2
python -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
# PaddlePaddle for CUDA11.0
python -m pip install paddlepaddle-gpu==2.1.2.post110 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
# PaddlePaddle for CUDA11.2
python -m pip install paddlepaddle-gpu==2.1.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
```
### PaddlePaddle with CPU
```python
python -m pip install paddlepaddle==2.1.2 -i https://mirror.baidu.com/pypi/simple
```
## Install libsndfile
Experimemts in parakeet often involve audio and spectrum processing, thus `librosa` and `soundfile` are required. `soundfile` requires a extra C library `libsndfile`, which is not always handled by pip.
For Windows and Mac users, `libsndfile` is also installed when installing `soundfile` via pip, but for Linux users, installing `libsndfile` via system package manager is required. Example commands for popular distributions are listed below.
```bash
# ubuntu, debian
sudo apt-get install libsndfile1
# centos, fedora
sudo yum install libsndfile
# openSUSE
sudo zypper in libsndfile
```
For any problem with installtion of soundfile, please refer to [SoundFile](https://pypi.org/project/SoundFile/).
## Install Parakeet
There are two ways to install parakeet according to the purpose of using it.
1. If you want to run experiments provided by parakeet or add new models and experiments, it is recommended to clone the project from github (Parakeet), and install it in editable mode.
```python
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

@ -1,27 +0,0 @@
# Parakeet - PAddle PARAllel text-to-speech toolKIT
## What is Parakeet?
Parakeet is a deep learning based text-to-speech toolkit built upon paddlepaddle framework. It aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by Baidu Research and other research groups.
## What can Parakeet do?
Parakeet mainly consists of components below:
- Implementation of models and commonly used neural network layers.
- Dataset abstraction and common data preprocessing pipelines.
- Ready-to-run experiments.
Parakeet provides you with a complete TTS pipeline, including:
- Text FrontEnd
- Rule based Chinese frontend.
- Acoustic Models
- FastSpeech2
- SpeedySpeech
- TransformerTTS
- Tacotron2
- Vocoders
- Parallel WaveGAN
- WaveFlow
- Voice Cloning
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
- GE2E
Parakeet helps you to train TTS models with simple commands.

@ -1,12 +1,12 @@
# Released Models
TTS system mainly includes three modules: `text frontend`, `Acoustic model` and `Vocoder`. We introduce a rule based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable models.
# Models introduction
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We introduce a rule-based Chinese text frontend in [cn_text_frontend.md](./cn_text_frontend.md). Here, we will introduce acoustic models and vocoders, which are trainable models.
The main processes of TTS include:
1. Convert the original text into characters/phonemes, through `text frontend` module.
2. Convert characters/phonemes into acoustic features, such as linear spectrogram, mel spectrogram, LPC features, etc., through `Acoustic models`.
3. Convert acoustic features into waveforms through `Vocoders`.
A simple text frontend module can be implemented by rules. Acoustic models and vocoders need to be trained. The models provided by Parakeet are acoustic models and vocoders.
A simple text frontend module can be implemented by rules. Acoustic models and vocoders need to be trained. The models provided by PaddleSpeech TTS are acoustic models and vocoders.
## Acoustic Models
### Modeling Objectives of Acoustic Models
@ -27,14 +27,14 @@ At present, there are two mainstream acoustic model structures.
- Acoustic decoder (N Frames - > N Frames).
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/frame_level_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/frame_level_am.png" width=500 /> <br>
</div>
- Sequence to sequence acoustic model:
- M Tokens - > N Frames.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/seq2seq_am.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/seq2seq_am.png" width=500 /> <br>
</div>
### Tacotron2
@ -54,7 +54,7 @@ At present, there are two mainstream acoustic model structures.
- CBHG postprocess.
- Vocoder: Griffin-Lim.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/tacotron.png" width=700 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron.png" width=700 /> <br>
</div>
**Advantage of Tacotron:**
@ -89,10 +89,10 @@ At present, there are two mainstream acoustic model structures.
- The alignment matrix of the previous time step is considered at step `t` of the decoder.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/tacotron2.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/tacotron2.png" width=500 /> <br>
</div>
You can find Parakeet's tacotron2 example at `Parakeet/examples/tacotron2`.
You can find PaddleSpeech TTS's tacotron2 with LJSpeech dataset example at [examples/ljspeech/tts0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts0).
### TransformerTTS
**Disadvantages of the Tacotrons:**
@ -118,7 +118,7 @@ Transformer TTS is a combination of Tacotron2 and Transformer.
- Positional Encoding.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/transformer.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer.png" width=500 /> <br>
</div>
#### Transformer TTS
@ -138,7 +138,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- Uniform scale position encoding may have a negative impact on input or output sequences.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/transformer_tts.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/transformer_tts.png" width=500 /> <br>
</div>
**Disadvantages of Transformer TTS:**
@ -146,7 +146,7 @@ Transformer TTS is a seq2seq acoustic model based on Transformer and Tacotron2.
- The ability to perceive local information is weak, and local information is more related to pronunciation.
- Stability is worse than Tacotron2.
You can find Parakeet's Transformer TTS example at `Parakeet/examples/transformer_tts`.
You can find PaddleSpeech TTS's Transformer TTS with LJSpeech dataset example at [examples/ljspeech/tts1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/tts1).
### FastSpeech2
@ -184,14 +184,14 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
• Can be generated in parallel (decoding time is less affected by sequence length)
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastspeech.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech.png" width=800 /> <br>
</div>
#### FastPitch
[FastPitch](https://arxiv.org/abs/2006.06873) follows FastSpeech. A single pitch value is predicted for every temporal location, which improves the overall quality of synthesized speech.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastpitch.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastpitch.png" width=500 /> <br>
</div>
#### FastSpeech2
@ -209,10 +209,10 @@ Instead of using the encoder-attention-decoder based architecture as adopted by
FastSpeech2 is similar to FastPitch but introduces more variation information of speech.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/fastspeech2.png" width=800 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/fastspeech2.png" width=800 /> <br>
</div>
You can find Parakeet's FastSpeech2/FastPitch example at `Parakeet/examples/fastspeech2`, We use token-averaged pitch and energy values introduced in FastPitch rather than frame level ones in FastSpeech2.
You can find PaddleSpeech TTS's FastSpeech2/FastPitch with CSMSC dataset example at [examples/csmsc/tts3](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts3). We use token-averaged pitch and energy values introduced in FastPitch rather than frame-level ones in FastSpeech2.
### SpeedySpeech
[SpeedySpeech](https://arxiv.org/abs/2008.03802) simplify the teacher-student architecture of FastSpeech and provide a fast and stable training procedure.
@ -223,10 +223,10 @@ You can find Parakeet's FastSpeech2/FastPitch example at `Parakeet/examples/fast
- Describe a simple data augmentation technique that can be used early in the training to make the teacher network robust to sequential error propagation.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/speedyspeech.png" width=500 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/speedyspeech.png" width=500 /> <br>
</div>
You can find Parakeet's SpeedySpeech example at `Parakeet/examples/speedyspeech/baker`.
You can find PaddleSpeech TTS's SpeedySpeech with CSMSC dataset example at [examples/csmsc/tts2](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/tts2).
## Vocoders
In speech synthesis, the main task of the vocoder is to convert the spectral parameters predicted by the acoustic model into the final speech waveform.
@ -276,7 +276,7 @@ Here, we introduce a Flow-based vocoder WaveFlow and a GAN-based vocoder Paralle
- It is a small-footprint flow-based model for raw audio. It has only 5.9M parameters, which is 15x smaller than WaveGlow (87.9M).
- It is directly trained with maximum likelihood without probability density distillation and auxiliary losses as used in [Parallel WaveNet](https://arxiv.org/abs/1711.10433) and [ClariNet](https://openreview.net/pdf?id=HklY120cYm), which simplifies the training pipeline and reduces the cost of development.
You can find Parakeet's WaveFlow example at `Parakeet/examples/waveflow`.
You can find PaddleSpeech TTS's WaveFlow with LJSpeech dataset example at [examples/ljspeech/voc0](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/ljspeech/voc0).
### Parallel WaveGAN
[Parallel WaveGAN](https://arxiv.org/abs/1910.11480) trains a non-autoregressive WaveNet variant as a generator in a GAN based training method.
@ -286,10 +286,10 @@ You can find Parakeet's WaveFlow example at `Parakeet/examples/waveflow`.
- Use non-causal convolution instead of causal convolution.
- The input is random Gaussian white noise.
- The model is non-autoregressive both in training and prediction, which is fast
- Multi-resolution STFT loss.
- Multi-resolution STFT loss.
<div align="left">
<img src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/images/pwg.png" width=600 /> <br>
<img src="https://raw.githubusercontent.com/PaddlePaddle/DeepSpeech/develop/docs/images/pwg.png" width=600 /> <br>
</div>
You can find Parakeet's Parallel WaveGAN example at `Parakeet/examples/parallelwave_gan/baker`.
You can find PaddleSpeech TTS's Parallel WaveGAN with CSMSC example at [examples/csmsc/voc1](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1).

@ -0,0 +1,193 @@
# Quick Start of Text-To-Speech
The examples in PaddleSpeech are mainly classified by datasets; the TTS datasets we mainly use are:
* CSMSC (Mandarin single speaker)
* AISHELL3 (Mandarin multiple speakers)
* LJSpeech (English single speaker)
* VCTK (English multiple speakers)
The models in PaddleSpeech TTS have the following mapping relationship:
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* vc0 - Tacotron2 Voice Clone with GE2E
## Quick Start
Let's take FastSpeech2 + Parallel WaveGAN with the CSMSC dataset as an example ([examples/csmsc](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc)).
### Train Parallel WaveGAN with CSMSC
- Go to directory
```bash
cd examples/csmsc/voc1
```
- Source env
```bash
source path.sh
```
**Must do this before you start to do anything.**
This sets `MAIN_ROOT` as the project dir and uses the `parallelwave_gan` model as `MODEL`.
- Main entrypoint
```bash
bash run.sh
```
This is just a demo; please make sure the source data has been prepared well and every `step` works well before moving to the next `step`.
### Train FastSpeech2 with CSMSC
- Go to directory
```bash
cd examples/csmsc/tts3
```
- Source env
```bash
source path.sh
```
**Must do this before you start to do anything.**
This sets `MAIN_ROOT` as the project dir and uses the `fastspeech2` model as `MODEL`.
- Main entrypoint
```bash
bash run.sh
```
This is just a demo; please make sure the source data has been prepared well and every `step` works well before moving to the next `step`.
The steps in `run.sh` mainly include:
- source path.
- preprocess the dataset.
- train the model.
- synthesize waveform from metadata.jsonl.
- synthesize waveform from a text file (for acoustic models).
- inference using static model. (optional)
For more details, you can see the `README.md` in each example.
## Pipeline of TTS
This section shows how to use pretrained models provided by TTS and make inference with them.
Pretrained models in TTS are provided in an archive. Extract it to get a folder like this:
**Acoustic Models:**
```text
checkpoint_name
├── default.yaml
├── snapshot_iter_*.pdz
├── speech_stats.npy
├── phone_id_map.txt
├── spk_id_map.txt (optional)
└── tone_id_map.txt (optional)
```
**Vocoders:**
```text
checkpoint_name
├── default.yaml
├── snapshot_iter_*.pdz
└── stats.npy
```
- `default.yaml` stores the config used to train the model.
- `snapshot_iter_*.pdz` is the checkpoint file, where `*` is the number of steps the model has been trained for.
- `*_stats.npy` is the stats file of the feature if it has been normalized before training.
- `phone_id_map.txt` is the map of phonemes to phoneme_ids.
- `tone_id_map.txt` is the map of tones to tone_ids, used when you split tones and phones before training acoustic models (for example, in our csmsc/speedyspeech example).
- `spk_id_map.txt` is the map of speakers to spk_ids in multi-speaker acoustic models (for example, in our aishell3/fastspeech2 example).
The example code below shows how to use the models for prediction.
### Acoustic Models (text to spectrogram)
The code below shows how to use a `FastSpeech2` model. After loading the pretrained model, use it and the normalizer object to construct a prediction object, then use `fastspeech2_inference(phone_ids)` to generate spectrograms, which can be further used to synthesize raw audio with a vocoder.
```python
from pathlib import Path
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Inference
from paddlespeech.t2s.modules.normalizer import ZScore
# examples/fastspeech2/baker/frontend.py
from frontend import Frontend
# load the pretrained model
checkpoint_dir = Path("fastspeech2_nosil_baker_ckpt_0.4")
with open(checkpoint_dir / "phone_id_map.txt", "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
with open(checkpoint_dir / "default.yaml") as f:
fastspeech2_config = CfgNode(yaml.safe_load(f))
odim = fastspeech2_config.n_mels
model = FastSpeech2(
idim=vocab_size, odim=odim, **fastspeech2_config["model"])
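# `args.fastspeech2_checkpoint` is assumed to be the path to the
# `snapshot_iter_*.pdz` file from the checkpoint archive described above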
model.set_state_dict(
paddle.load(args.fastspeech2_checkpoint)["main_params"])
model.eval()
# load stats file
stat = np.load(checkpoint_dir / "speech_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
fastspeech2_normalizer = ZScore(mu, std)
# construct a prediction object
fastspeech2_inference = FastSpeech2Inference(fastspeech2_normalizer, model)
# load Chinese Frontend
frontend = Frontend(checkpoint_dir / "phone_id_map.txt")
# text to spectrogram
sentence = "你好吗?"
input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
phone_ids = input_ids["phone_ids"]
flags = 0
# The output of Chinese text frontend is segmented
for part_phone_ids in phone_ids:
with paddle.no_grad():
temp_mel = fastspeech2_inference(part_phone_ids)
if flags == 0:
mel = temp_mel
flags = 1
else:
mel = paddle.concat([mel, temp_mel])
```
### Vocoder (spectrogram to wave)
The code below shows how to use a `Parallel WaveGAN` model. Like the example above, after loading the pretrained model, use it and the normalizer object to construct a prediction object, then use `pwg_inference(mel)` to generate raw audio (in wav format).
```python
from pathlib import Path
import numpy as np
import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode
from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator
from paddlespeech.t2s.models.parallel_wavegan import PWGInference
from paddlespeech.t2s.modules.normalizer import ZScore
# load the pretrained model
checkpoint_dir = Path("parallel_wavegan_baker_ckpt_0.4")
with open(checkpoint_dir / "pwg_default.yaml") as f:
pwg_config = CfgNode(yaml.safe_load(f))
vocoder = PWGGenerator(**pwg_config["generator_params"])
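# `args.pwg_params` is assumed to be the path to the Parallel WaveGAN
# `snapshot_iter_*.pdz` checkpoint from the archive described above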
vocoder.set_state_dict(paddle.load(args.pwg_params))
vocoder.remove_weight_norm()
vocoder.eval()
# load stats file
stat = np.load(checkpoint_dir / "pwg_stats.npy")
mu, std = stat
mu = paddle.to_tensor(mu)
std = paddle.to_tensor(std)
pwg_normalizer = ZScore(mu, std)
# construct a prediction object
pwg_inference = PWGInference(pwg_normalizer, vocoder)
# spectrogram to wave
wav = pwg_inference(mel)
sf.write(
audio_path,
wav.numpy(),
samplerate=fastspeech2_config.fs)
```

@ -0,0 +1,14 @@
001 早上好今天是2020/10/29最低温度是-3°C。
002 你好我的编号是37249很高兴为您服务。
003 我们公司有37249个人。
004 我出生于2005年10月8日。
005 我们习惯在12:30吃中午饭。
006 只要有超过3/4的人投票同意你就会成为我们的新班长。
007 我要买一只价值999.9元的手表。
008 我的手机号是18544139121欢迎来电。
009 明天有62%的概率降雨。
010 手表厂有五种好产品。
011 跑马场有五百匹很勇敢的千里马。
012 有一天,我看到了一栋楼,我顿感不妙,因为我看不清里面有没有人。
013 史小姐拿着小雨伞去找她的老保姆了。
014 不要相信这个老奶奶说的话,她一点儿也不好。

@ -1,5 +1,5 @@
# Chinese Rule Based Text Frontend
TTS system mainly includes three modules: `text frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in Parakeet, see exapmle in `Parakeet/examples/text_frontend/`.
A TTS system mainly includes three modules: `Text Frontend`, `Acoustic model` and `Vocoder`. We provide a complete Chinese text frontend module in PaddleSpeech TTS; see the example in [examples/other/text_frontend/](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/text_frontend).
A text frontend module mainly includes:
- Text Segmentation

@ -5,7 +5,8 @@
## Data
| Data Subset | Duration in Seconds |
| data/manifest.train | 1.23 ~ 14.53125 |
| data/manifest.dev | 1.645 ~ 12.533 |
| data/manifest.test | 1.859125 ~ 14.6999375 |
| Data Subset | Duration in Seconds |
| ------------------- | --------------------- |
| data/manifest.train | 1.23 ~ 14.53125 |
| data/manifest.dev | 1.645 ~ 12.533 |
| data/manifest.test | 1.859125 ~ 14.6999375 |

@ -4,9 +4,7 @@
| Model | Params | Release | Config | Test set | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 6.016139030456543 | 0.066549 |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 58.4M | 7181e427 | conf/deepspeech2.yaml + spec aug | test | 5.71956205368042 | 0.064287 |
| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.71956205368042 | 0.064287 |
| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
| DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |

@ -41,7 +41,7 @@ model:
use_gru: True
share_rnn_weights: False
blank_id: 0
ctc_grad_norm_type: instance
ctc_grad_norm_type: instance
training:
n_epoch: 80

@ -43,7 +43,7 @@ model:
fc_layers_size_list: -1,
use_gru: False
blank_id: 0
ctc_grad_norm_type: instance
ctc_grad_norm_type: null
training:
n_epoch: 50

@ -13,7 +13,7 @@ ckpt_prefix=$2
model_type=$3
# download language model
bash local/download_lm_ch.sh
bash local/download_lm_ch.sh > /dev/null 2>&1
if [ $? -ne 0 ]; then
exit 1
fi

@ -13,7 +13,7 @@ jit_model_export_path=$2
model_type=$3
# download language model
bash local/download_lm_ch.sh
bash local/download_lm_ch.sh > /dev/null 2>&1
if [ $? -ne 0 ]; then
exit 1
fi

@ -11,4 +11,4 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin

@ -77,7 +77,7 @@ model:
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -72,7 +72,7 @@ model:
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

@ -0,0 +1,47 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix audio_file"
exit -1
fi
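# count the comma-separated device ids in CUDA_VISIBLE_DEVICES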
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_prefix=$2
audio_file=$3
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in attention_rescoring; do
echo "decoding ${type}"
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test_hub.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} \
--audio_file ${audio_file}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0

@ -12,7 +12,7 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# model exp
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
# srilm

@ -1,6 +1,6 @@
#!/bin/bash
set -e
source path.sh
set -e
stage=0
stop_stage=100
@ -13,6 +13,8 @@ avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
audio_file="data/tmp.wav"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
@ -46,5 +48,10 @@ fi
# Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# train lm and build TLG
./local/tlg.sh --corpus aishell --lmtype srilm
./local/tlg.sh --corpus aishell --lmtype srilm
fi
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=3 ./local/test_hub.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi

@ -1,4 +1,11 @@
# Aishell3
* tts0 - fastspeech2
* vc0 - tactron2 voice clone
* tts0 - Tacotron2
* tts1 - TransformerTTS
* tts2 - SpeedySpeech
* tts3 - FastSpeech2
* voc0 - WaveFlow
* voc1 - Parallel WaveGAN
* voc2 - MelGAN
* voc3 - MultiBand MelGAN
* vc0 - Tacotron2 Voice Clone with GE2E

@ -17,13 +17,24 @@ tar zxvf data_aishell3.tgz -C data_aishell3
```
### Get MFA result of AISHELL-3 and Extract it
We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2.
You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model reference to [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) (use MFA1.x now) of our repo.
### Preprocess the dataset
You can download it from here: [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/other/use_mfa) (which uses MFA1.x for now) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/data_aishell3`.
Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`.
Run the command below to preprocess the dataset.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
    - synthesize waveform from `metadata.jsonl`.
    - synthesize waveform from a text file.
```bash
./run.sh
```
### Preprocess the dataset
```bash
./preprocess.sh
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
@ -47,17 +58,17 @@ The dataset is split into 3 parts, namely `train`, `dev` and` test`, each of whi
There is also a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, text_lengths, speech_lengths, durations, paths of the speech, pitch and energy features, the speaker, and the id of each utterance.
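A quick way to inspect that metadata is to iterate over the jsonl file. In the sketch below, the path and the field names are assumptions based on the description above, and it uses the third-party `jsonlines` package.
```python
import jsonlines  # third-party: pip install jsonlines

# Path and field names are assumptions based on the description above.
with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
    for record in reader:
        # one record per utterance: phones, lengths, durations, feature paths, speaker, id
        print(record["utt_id"], record["speaker"], record["speech_lengths"])
        break
```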
## Train the model
`./run.sh` calls `../train.py`.
### Train the model
`./local/train.sh` calls `${BIN_DIR}/train.py`.
```bash
./run.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
[--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
[--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT]
Train a FastSpeech2 model.
@ -70,8 +81,7 @@ optional arguments:
dev data.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--nprocs NPROCS number of processes.
--ngpu NGPU if ngpu=0, use cpu.
--verbose VERBOSE verbose.
--phones-dict PHONES_DICT
phone vocabulary file.
@ -81,25 +91,12 @@ optional arguments:
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
6. `--phones-dict` is the path of the phone vocabulary file.
7. `--speaker-dict`is the path of the speaker id map file when training a multi-speaker FastSpeech2.
## Pretrained Model
Pretrained FastSpeech2 model with no silence in the edge of audios. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
4. `--ngpu` is the number of gpus to use; if `ngpu` == 0, the CPU is used.
5. `--phones-dict` is the path of the phone vocabulary file.
6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2.
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
## Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
### Synthesize
We use [parallel wavegan](https://github.com/PaddlePaddle/DeepSpeech/tree/develop/examples/csmsc/voc1) as the neural vocoder.
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
```bash
unzip pwg_baker_ckpt_0.4.zip
@ -111,9 +108,9 @@ pwg_baker_ckpt_0.4
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
```
`synthesize.sh` calls `synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
./synthesize.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
@ -123,7 +120,7 @@ usage: synthesize.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT] [--speaker-dict SPEAKER_DICT]
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
[--device DEVICE] [--verbose VERBOSE]
[--ngpu NGPU] [--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
@ -150,25 +147,25 @@ optional arguments:
test metadata.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--verbose VERBOSE verbose.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose
```
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/multi_spk_synthesize_e2e.py`, which can synthesize waveform from a text file.
```bash
./synthesize_e2e.sh
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize_e2e.py [-h] [--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--device DEVICE]
[--verbose VERBOSE]
usage: multi_spk_synthesize_e2e.py [-h]
[--fastspeech2-config FASTSPEECH2_CONFIG]
[--fastspeech2-checkpoint FASTSPEECH2_CHECKPOINT]
[--fastspeech2-stat FASTSPEECH2_STAT]
[--pwg-config PWG_CONFIG]
[--pwg-checkpoint PWG_CHECKPOINT]
[--pwg-stat PWG_STAT]
[--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--text TEXT]
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with fastspeech2 & parallel wavegan.
@ -194,7 +191,7 @@ optional arguments:
--text TEXT text to synthesize, a 'utt_id sentence' pair per line.
--output-dir OUTPUT_DIR
output dir.
--device DEVICE device type to use.
--ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
```
1. `--fastspeech2-config`, `--fastspeech2-checkpoint`, `--fastspeech2-stat`, `--phones-dict` and `--speaker-dict` are arguments for fastspeech2, which correspond to the 5 files in the fastspeech2 pretrained model.
@ -202,26 +199,39 @@ optional arguments:
3. `--test-metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
4. `--text` is the text file, which contains sentences to synthesize.
5. `--output-dir` is the directory to save synthesized audio files.
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
6. `--ngpu` is the number of gpus to use; if `ngpu` == 0, the CPU is used.
You can use the following scripts to synthesize for `../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
## Pretrained Model
Pretrained FastSpeech2 model with no silence at the edges of audios: [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
FastSpeech2 checkpoint contains files listed below.
```text
fastspeech2_nosil_aishell3_ckpt_0.4
├── default.yaml # default config used to train fastspeech2
├── phone_id_map.txt # phone vocabulary file when training fastspeech2
├── snapshot_iter_96400.pdz # model parameters and optimizer states
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
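These files can be consumed the same way as the vocoder files in the synthesis snippet earlier on this page. The brief sketch below omits the model construction itself, and unpacking `speech_stats.npy` into mean/std mirrors that snippet rather than a documented API.
```python
from pathlib import Path

import numpy as np
import paddle
import yaml
from yacs.config import CfgNode

ckpt_dir = Path("fastspeech2_nosil_aishell3_ckpt_0.4")
# training config used for the pretrained FastSpeech2 model
with open(ckpt_dir / "default.yaml") as f:
    fastspeech2_config = CfgNode(yaml.safe_load(f))
# statistics used to normalize the spectrogram during training
mu, std = np.load(ckpt_dir / "speech_stats.npy")
mu, std = paddle.to_tensor(mu), paddle.to_tensor(std)
print(fastspeech2_config.fs, mu.shape)
```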
You can use the following script to synthesize sentences from `${BIN_DIR}/../sentences.txt` using the pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 synthesize_e2e.py \
python3 ${BIN_DIR}/multi_spk_synthesize_e2e.py \
--fastspeech2-config=fastspeech2_nosil_aishell3_ckpt_0.4/default.yaml \
--fastspeech2-checkpoint=fastspeech2_nosil_aishell3_ckpt_0.4/snapshot_iter_96400.pdz \
--fastspeech2-stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--text=../sentences.txt \
--text=${BIN_DIR}/../sentences.txt \
--output-dir=exp/default/test_e2e \
--device="gpu" \
--phones-dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \
--speaker-dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt
```
## Future work
A multi-speaker vocoder is needed.

Some files were not shown because too many files have changed in this diff.
