Merge branch 'develop' into develop

TianYuan 4 years ago committed by GitHub
commit 7c98bae859

@ -161,6 +161,8 @@ paddlespeech cls --input input.wav
paddlespeech asr --lang zh --input input_16k.wav
```
**Speech Translation** (English to Chinese)
(not supported on Windows yet)
```shell
paddlespeech st --input input_16k.wav
```
@ -170,7 +172,8 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!
```
- web demo for Text to Speech is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: https://huggingface.co/spaces/akhaliq/paddlespeech
If you want to try more functions like training and tuning, please have a look at documents of [Speech-to-Text](./docs/source/asr/quick_start.md) and [Text-to-Speech](./docs/source/tts/quick_start.md).
If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
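Beyond the shell commands above, the same executors can be used from Python via `paddlespeech.cli`. A minimal sketch (the keyword argument names below are assumptions; see the quick start docs for the exact API):
```python
# A hedged sketch of programmatic use; argument names are assumptions.
from paddlespeech.cli import ASRExecutor, TTSExecutor

asr = ASRExecutor()
text = asr(audio_file="input_16k.wav", lang="zh")  # assumed keywords
print(text)

tts = TTSExecutor()
tts(text="你好,欢迎使用百度飞桨深度学习框架!", output="output.wav")  # assumed keywords
```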
## Model List
@ -258,15 +261,15 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
<table>
<thead>
<tr>
<th> Text-to-Speech Module Type <img width="110" height="1"> </th>
<th> Model Type </th>
<th> <img width="50" height="1"> Dataset <img width="50" height="1"> </th>
<th> <img width="101" height="1"> Link <img width="105" height="1"> </th>
<th> Text-to-Speech Module Type </th>
<th> Model Type </th>
<th> Dataset </th>
<th> Link </th>
</tr>
</thead>
<tbody>
<tr>
<td> Text Frontend</td>
<td> Text Frontend </td>
<td colspan="2"> &emsp; </td>
<td>
<a href = "./examples/other/tn">tn</a> / <a href = "./examples/other/g2p">g2p</a>
@ -352,10 +355,10 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
<table style="width:100%">
<thead>
<tr>
<th> <img width="150" height="1">Task <img width="150" height="1"></th>
<th> <img width="110" height="1">Dataset <img width="110" height="1"></th>
<th> <img width="110" height="1">Model Type <img width="110" height="1"></th>
<th> <img width="110" height="1">Link <img width="110" height="1"></th>
<th> Task </th>
<th> Dataset </th>
<th> Model Type </th>
<th> Link </th>
</tr>
</thead>
<tbody>

@ -25,6 +25,7 @@ import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack

@ -25,6 +25,7 @@ import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack

@ -27,6 +27,7 @@ import os
from multiprocessing.pool import Pool
import soundfile
from utils.utility import download
from utils.utility import unpack

@ -26,6 +26,7 @@ import os
from multiprocessing.pool import Pool
import soundfile
from utils.utility import download
from utils.utility import unpack

@ -28,6 +28,7 @@ import json
import os
import soundfile
from utils.utility import download
from utils.utility import unpack

@ -28,6 +28,7 @@ import json
import os
import soundfile
from utils.utility import download
from utils.utility import unzip

@ -26,6 +26,7 @@ from multiprocessing.pool import Pool
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack

@ -27,6 +27,7 @@ import string
from pathlib import Path
import soundfile
from utils.utility import unzip
URL_ROOT = ""

@ -27,6 +27,7 @@ import shutil
import subprocess
import soundfile
from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack

@ -19,7 +19,7 @@ Here are sample files for this demo that can be downloaded:
wget https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
### 3. Usage
### 3. Usage (not supported on Windows yet)
- Command Line (Recommended)
```bash
paddlespeech st --input ./en.wav

@ -1,8 +1,8 @@
# Installation
There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, the 3 ways can be divided into `Easy`, `Medium` and `Hard`.
There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, the 3 ways can be divided into **Easy**, **Medium** and **Hard**.
## Easy: Get the Basic Funcition Without Your Own Mechine
If you are a newer of `PaddleSpeech` and want to experience it easily without your own mechine. We recommand you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step tutorial for `PaddleSpeech` and you can use the basic function of `PaddleSpeech` with a free machine.
## Easy: Get the Basic Function without Your Own Machine
If you are new to `PaddleSpeech` and want to experience it easily without your own machine, we recommend you use [AI Studio](https://aistudio.baidu.com/aistudio/index). There is a step-by-step tutorial for `PaddleSpeech`, and you can use the basic functions of `PaddleSpeech` with a free machine.
## Prerequisites for Medium and Hard
- Python >= 3.7
@ -10,11 +10,11 @@ If you are a newer of `PaddleSpeech` and want to experience it easily without yo
- Only Linux is supported
- Tip: Do not use the command `sh` instead of the command `bash`
## Medium: Get the Basic Funciton on Your Mechine
If you want to install `paddlespeech` on your own mechine. There are 3 steps you need to do.
## Medium: Get the Basic Function on Your Machine
If you want to install `paddlespeech` on your own machine, there are 3 steps you need to complete.
### Install the Conda
Conda is environment management system. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) to select a version (py>=3.7) and install it by yourself or you can use the following command:
### Install Conda
Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) to select a version (py>=3.7) and install it yourself, or you can use the following commands:
```bash
# download the miniconda
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
@ -35,7 +35,14 @@ conda activate tools/venv
```
Install the conda dependencies for `paddlespeech`:
```bash
conda install -y -c conda-forge sox libsndfile swig bzip2 gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
conda install -y -c conda-forge sox libsndfile swig bzip2
```
Do not forget to install `gcc` and `gxx` on your system.
If you use Linux, you can use the script below to install them
(Tip: do not use this script if you want to install by the **Hard** way):
```bash
conda install -y -c conda-forge gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
```
### Install PaddlePaddle
For example, for CUDA 10.2 and cuDNN 7.5, install paddle 2.2.0:
@ -43,30 +50,27 @@ For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0:
python3 -m pip install paddlepaddle-gpu==2.2.0
```
### Install PaddleSpeech
To Install `paddlespeech`, there are two methods. You can use the following command:
To install `paddlespeech`, there are two methods. You can use the following command:
```bash
pip install paddlespeech
```
If you install `paddlespeech` by `pip`, you can use it to help you build your own model. However, you can not use the `ready-made `examples in paddlespeech.
If you install `paddlespeech` by `pip`, you can use it to help you build your model. However, you cannot use the `ready-made` examples in paddlespeech.
If you want to use the` ready-made `examples in `paddlespeech`, you need to clone this repository and install `paddlespeech` by the foll
If you want to use the `ready-made` examples in `paddlespeech`, you need to clone this repository and install `paddlespeech` by the following commands:
```bash
git clone https://github.com/PaddlePaddle/PaddleSpeech.git
cd PaddleSpeech
pip install .
```
## Hard: Get the Full Funciton on Your Mechine
## Hard: Get the Full Function on Your Machine
### Prerequisites
- Choice 1: Working with an `Ubuntu` Docker Container.
or
- Choice 2: Working on `Ubuntu` with `root` privilege.
To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise If you work on `Ubuntu` with `root` privilege, you can skip the next step.
To avoid the trouble of environment setup, [running in Docker container](#running-in-docker-container) is highly recommended. Otherwise, if you work on `Ubuntu` with `root` privilege, you can skip the next step.
### Choice 1: Running in Docker Container (Recommended)
Docker is an open source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project has been provided in [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires the support of NVIDIA GPU, so please make sure its availiability and the [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.
Docker is an open-source tool to build, ship, and run distributed applications in an isolated environment. A Docker image for this project is provided on [hub.docker.com](https://hub.docker.com) with all the dependencies installed. This Docker image requires NVIDIA GPU support, so please make sure a GPU is available and that [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) has been installed.
Take several steps to launch the Docker image:
- Download the Docker image
@ -115,7 +119,7 @@ For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0:
```bash
python3 -m pip install paddlepaddle-gpu==2.2.0
```
### Get the Funcition for Developing PaddleSpeech
### Get the Function for Developing PaddleSpeech
```bash
pip install .[develop]
```

@ -2,32 +2,31 @@
## Speech-to-Text Models
### Acoustic Model Released in paddle 2.X
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | example link
### Speech Recognition Model
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :-----------
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell S0 Example](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell S0 Example](../../examples/aishell/asr0)
[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell S1 Example](../../examples/aishell/s1)
[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell S1 Example](../../examples/aishell/s1)
[Conformer Librispeech ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/librispeech/s1/conformer.release.tar.gz) | Librispeech Dataset | subword-based | 287 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0325 | 960 h | [Conformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0410 | 960 h | [Transformer Librispeech S1 example](../../example/librispeech/s1)
[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.024 | 960 h | [Transformer Librispeech S2 example](../../example/librispeech/s2)
### Acoustic Model Transformed from paddle 1.8
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech
:-------------:| :------------:| :-----: | -----: | :----------------- | :---------- | :---------- | :---------
[Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz)|Aishell Dataset| Char-based| 234 MB| 2 Conv + 3 bidirectional GRU layers| 0.0804 |-| 151 h|
[Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz)|Librispeech Dataset| Word-based| 307 MB| 2 Conv + 3 bidirectional sharing weight RNN layers |-| 0.0685| 960 h|
[Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz)|Baidu Internal English Dataset| Word-based| 273 MB| 2 Conv + 3 bidirectional GRU layers |-| 0.0541 | 8628 h|
### Language Model Released
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0538 |-| 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/conformer.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1)
[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1)
[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2)
### Language Model based on NGram
Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :-----------------
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
### Speech Translation Models
| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link |
| ------------------------------------------------------------ | ------------- | ----------- | ---- | ------------------------------------------------------------ | ----- | ------------------------------------------------------------ |
| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | Ted-En-Zh | Spm | | Encoder:Transformer, Decoder:Transformer, <br />Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) |
## Text-to-Speech Models
@ -69,8 +68,10 @@ PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset
PANN | ESC-50 |[pann-esc50](./examples/esc50/cls0)|[panns_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz), [panns_cnn10](https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz), [panns_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz)
## Speech Translation Models
## Speech Recognition Model from paddle 1.8
Model Type | Dataset| Example Link | Pretrained Models | Model Size
:-------------:| :------------:| :-----: | :-----: | :-----:
FAT-ST | TED En-Zh |[FAT + Transformer+ASR MTL](./examples/ted_en_zh/st1)|[fat_st_ted-en-zh.tar.gz](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | 50.26M
| Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech |
| :----------------------------------------------------------: | :----------------------------: | :---------: | -----: | :------------------------------------------------- | :----- | :----- | :-------------- |
| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | - | 151 h |
| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | - | 0.0685 | 960 h |
| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers | - | 0.0541 | 8628 h |

@ -455,6 +455,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
<b>CSMSC(Chinese)</b>
<br>
</br>
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> Text </th>
@ -634,6 +635,106 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
</td>
</tr>
</table>
<br>
</br>
<table border="2" cellspacing="1" cellpadding="1">
<tr>
<th align="center"> FastSpeech2-Conformer + ParallelWaveGAN </th>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/001.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/002.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/003.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/004.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/005.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/006.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/007.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/008.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
<tr>
<td>
<audio controls="controls">
<source
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/009.wav"
type="audio/wav">
Your browser does not support the <code>audio</code> element.
</audio>
</td>
</tr>
</table>
</div>
<br>
<br>

@ -1,15 +1,18 @@
# LibriSpeech
## Conformer
train: Epoch 70, 4 V100-32G, best avg: 20
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.738649845123291 | 0.041159 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.738649845123291 | 0.039847 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.738649845123291 | 0.039790 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.738649845123291 | 0.034617 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | attention | 6.433612394332886 | 0.039771 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.433612394332886 | 0.040342 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.433612394332886 | 0.040342 |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug | test-clean | attention_rescoring | 6.433612394332886 | 0.033761 |
## Chunk Conformer
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.11 | 0.063193 |
@ -20,7 +23,7 @@
## Transformer
train: Epoch 120, 4 V100-32G, 27 Day, avg: 10
train: Epoch 120, 4 V100-32G, 27 Day, best avg: 10
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |

@ -1,41 +1,3 @@
# https://yaml.org/type/float.html
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5
  max_input_len: 30.0
  min_output_len: 0.0
  max_output_len: 400.0
  min_output_input_ratio: 0.05
  max_output_input_ratio: 100.0
collator:
  vocab_filepath: data/lang_char/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2
# network architecture
model:
  cmvn_file:
@ -80,6 +42,39 @@ model:
    length_normalized_loss: false
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
collator:
  vocab_filepath: data/lang_char/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2
training:
  n_epoch: 240
  accum_grad: 8

@ -1,41 +1,3 @@
# https://yaml.org/type/float.html
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.5  # second
  max_input_len: 30.0  # second
  min_output_len: 0.0  # tokens
  max_output_len: 400.0  # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 100.0
collator:
  vocab_filepath: data/lang_char/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2
# network architecture
model:
  cmvn_file:
@ -73,6 +35,37 @@ model:
    length_normalized_loss: false
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
collator:
  vocab_filepath: data/lang_char/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/preprocess.yaml
  batch_size: 64
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2
training:
  n_epoch: 120
  accum_grad: 1

@ -1,41 +1,3 @@
# https://yaml.org/type/float.html
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test-clean
  min_input_len: 0.5  # seconds
  max_input_len: 30.0  # seconds
  min_output_len: 0.0  # tokens
  max_output_len: 400.0  # tokens
  min_output_input_ratio: 0.05
  max_output_input_ratio: 100.0
collator:
  vocab_filepath: data/lang_char/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2
# network architecture
model:
  cmvn_file:
@ -76,8 +38,40 @@ model:
    length_normalized_loss: false
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test-clean
collator:
  vocab_filepath: data/lang_char/vocab.txt
  unit_type: 'spm'
  spm_model_prefix: 'data/lang_char/bpe_unigram_5000'
  mean_std_filepath: ""
  augmentation_config: conf/preprocess.yaml
  batch_size: 16
  raw_wav: True  # use raw_wav or kaldi feature
  spectrum_type: fbank  # linear, mfcc, fbank
  feat_dim: 80
  delta_delta: False
  dither: 1.0
  target_sample_rate: 16000
  max_freq: None
  n_fft: None
  stride_ms: 10.0
  window_ms: 25.0
  use_dB_normalization: True
  target_dB: -20
  random_seed: 0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 2
training:
  n_epoch: 120
  n_epoch: 70
  accum_grad: 8
  global_grad_clip: 3.0
  optim: adam
@ -98,13 +92,7 @@ decoding:
  batch_size: 64
  error_rate_type: wer
  decoding_method: attention  # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
  lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
  alpha: 2.5
  beta: 0.3
  beam_size: 10
  cutoff_prob: 1.0
  cutoff_top_n: 0
  num_proc_bsearch: 8
  ctc_weight: 0.5  # ctc weight for attention rescoring decode mode.
  decoding_chunk_size: -1  # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.

@ -17,6 +17,8 @@
```
## Speech Translation (English to Chinese)
(not supported on Windows yet)
```bash
paddlespeech st --input input_16k.wav
```

@ -11,9 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import _locale
from .asr import ASRExecutor
from .base_commands import BaseCommand
from .base_commands import HelpCommand
from .cls import CLSExecutor
from .st import STExecutor
from .tts import TTSExecutor
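# force the default locale to en_US/UTF-8 so the CLI handles text
# consistently regardless of the system locale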
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

@ -405,8 +405,6 @@ class TTSExecutor(BaseExecutor):
with open(self.voc_config) as f:
self.voc_config = CfgNode(yaml.safe_load(f))
# Enter the path of model root
with open(self.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
@ -463,11 +461,12 @@ class TTSExecutor(BaseExecutor):
am_std = paddle.to_tensor(am_std)
am_normalizer = ZScore(am_mu, am_std)
self.am_inference = am_inference_class(am_normalizer, am)
self.am_inference.eval()
print("acoustic model done!")
# vocoder
# model: {model_name}_{dataset}
voc_name = '_'.join(voc.split('_')[:-1])
voc_name = voc[:voc.rindex('_')]
voc_class = dynamic_import(voc_name, model_alias)
voc_inference_class = dynamic_import(voc_name + '_inference',
model_alias)
@ -480,6 +479,7 @@ class TTSExecutor(BaseExecutor):
voc_std = paddle.to_tensor(voc_std)
voc_normalizer = ZScore(voc_mu, voc_std)
self.voc_inference = voc_inference_class(voc_normalizer, voc)
self.voc_inference.eval()
print("voc done!")
def preprocess(self, input: Any, *args, **kwargs):
@ -501,10 +501,10 @@ class TTSExecutor(BaseExecutor):
"""
Model inference and result stored in self.output.
"""
model_name = am[:am.rindex('_')]
dataset = am[am.rindex('_') + 1:]
am_name = am[:am.rindex('_')]
am_dataset = am[am.rindex('_') + 1:]
get_tone_ids = False
if 'speedyspeech' in model_name:
if am_name == 'speedyspeech':
get_tone_ids = True
if lang == 'zh':
input_ids = self.frontend.get_input_ids(
@ -521,15 +521,14 @@ class TTSExecutor(BaseExecutor):
print("lang should in {'zh', 'en'}!")
# am
if 'speedyspeech' in model_name:
if am_name == 'speedyspeech':
mel = self.am_inference(phone_ids, tone_ids)
# fastspeech2
else:
# multi speaker
if dataset in {"aishell3", "vctk"}:
if am_dataset in {"aishell3", "vctk"}:
mel = self.am_inference(
phone_ids, spk_id=paddle.to_tensor(spk_id))
else:
mel = self.am_inference(phone_ids)
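The slicing above relies on the `{model_name}_{dataset}` naming convention for acoustic model tags (the same convention the vocoder change uses); a small standalone sketch, with an illustrative tag value:
```python
# Illustrative tag; any "{model_name}_{dataset}" string splits the same way.
am = "fastspeech2_aishell3"
am_name = am[:am.rindex('_')]         # "fastspeech2"
am_dataset = am[am.rindex('_') + 1:]  # "aishell3"
assert (am_name, am_dataset) == ("fastspeech2", "aishell3")
```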

@ -16,10 +16,11 @@ import os
import numpy as np
from paddle import inference
from scipy.special import softmax
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax
# yapf: disable
parser = argparse.ArgumentParser()

@ -15,8 +15,8 @@ import argparse
import os
import paddle
from paddleaudio.datasets import ESC50
from paddleaudio.datasets import ESC50
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier

@ -16,11 +16,11 @@ import argparse
import numpy as np
import paddle
import paddle.nn.functional as F
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import LogMelSpectrogram
from paddleaudio.features import melspectrogram
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier

@ -15,11 +15,11 @@ import argparse
import os
import paddle
from paddleaudio.datasets import ESC50
from paddleaudio.features import LogMelSpectrogram
from paddleaudio.utils import logger
from paddleaudio.utils import Timer
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier

@ -15,6 +15,7 @@ import os
import paddle.nn as nn
import paddle.nn.functional as F
from paddleaudio.utils.download import load_state_dict_from_url
from paddleaudio.utils.env import MODEL_HOME

@ -356,7 +356,7 @@ class AudioSegment():
# sox, slow
try:
import soxbindings as sox
except:
except ImportError:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package = "sox"
@ -364,8 +364,9 @@ class AudioSegment():
package = "soxbindings"
dynamic_pip_install.install(package)
import soxbindings as sox
except:
raise RuntimeError("Can not install soxbindings on your system." )
except Exception:
raise RuntimeError(
"Can not install soxbindings on your system.")
tfm = sox.Transformer()
tfm.set_globals(multithread=False)
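Since this change is split across two hunks, here is the whole import-or-install fallback as one standalone sketch (names mirror the code above):
```python
try:
    import soxbindings as sox
except ImportError:
    # fall back to installing the packages at runtime, then retry the import
    try:
        from paddlespeech.s2t.utils import dynamic_pip_install
        dynamic_pip_install.install("sox")
        dynamic_pip_install.install("soxbindings")
        import soxbindings as sox
    except Exception:
        raise RuntimeError("Can not install soxbindings on your system.")
```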

@ -102,9 +102,11 @@ def read_manifest(
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
feat_len = json_data["input"][0]["shape"][
0] if "input" in json_data and "shape" in json_data["input"][0] else 1.0
0] if "input" in json_data and "shape" in json_data["input"][
0] else 1.0
token_len = json_data["output"][0]["shape"][
0] if "output" in json_data and "shape" in json_data["output"][0] else 1.0
0] if "output" in json_data and "shape" in json_data["output"][
0] else 1.0
conditions = [
feat_len >= min_input_len,
feat_len <= max_input_len,
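The rewrapped conditional expressions above are easier to read in isolation; a small sketch with made-up shape values (the field names come from the hunk):
```python
# One manifest entry: 320 frames of 80-dim features, 12 output tokens.
json_data = {"input": [{"shape": [320, 80]}], "output": [{"shape": [12]}]}

# Fall back to 1.0 when the entry lacks shape information.
feat_len = (json_data["input"][0]["shape"][0]
            if "input" in json_data and "shape" in json_data["input"][0] else 1.0)
token_len = (json_data["output"][0]["shape"][0]
             if "output" in json_data and "shape" in json_data["output"][0] else 1.0)
assert (feat_len, token_len) == (320, 12)
```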

@ -20,13 +20,13 @@ from paddle.io import DistributedBatchSampler
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
"SortagradDistributedBatchSampler",
"SortagradBatchSampler",
]
logger = Log(__name__).getlog()
def _batch_shuffle(indices, batch_size, epoch, clipped=False):
"""Put similarly-sized instances into minibatches for better efficiency

@ -17,11 +17,11 @@ from paddlespeech.s2t.utils import dynamic_pip_install
try:
import swig_decoders
except:
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
dynamic_pip_install.install(package_name)
except:
except Exception:
raise RuntimeError(
"Can not install package paddlespeech_ctcdecoders on your system. \
The DeepSpeech2 model is not supported for your system")

@ -129,7 +129,7 @@ class DeepSpeech2Model(nn.Layer):
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
ctc_grad_norm_type=None,))
ctc_grad_norm_type=None, ))
if config is not None:
config.merge_from_other_cfg(default)
return default

@ -17,11 +17,11 @@ from paddlespeech.s2t.utils import dynamic_pip_install
try:
import swig_decoders
except:
except ImportError:
try:
package_name = 'paddlespeech_ctcdecoders'
dynamic_pip_install.install(package_name)
except:
except Exception:
raise RuntimeError(
"Can not install package paddlespeech_ctcdecoders on your system. \
The DeepSpeech2 model is not supported for your system")

@ -28,7 +28,7 @@ try:
from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401
from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import ctc_greedy_decoder # noqa: F401
from paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper import Scorer # noqa: F401
except:
except ImportError:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package_name = 'paddlespeech_ctcdecoders'

@ -221,6 +221,8 @@ class Trainer():
if hasattr(self.train_loader, "batch_sampler"):
batch_sampler = self.train_loader.batch_sampler
if isinstance(batch_sampler, paddle.io.DistributedBatchSampler):
logger.debug(
f"train_loader.batch_sample set epoch: {self.epoch}")
batch_sampler.set_epoch(self.epoch)
def before_train(self):

@ -147,7 +147,7 @@ class SpeedPerturbationSox():
try:
import soxbindings as sox
except:
except ImportError:
try:
from paddlespeech.s2t.utils import dynamic_pip_install
package = "sox"
@ -155,8 +155,10 @@ class SpeedPerturbationSox():
package = "soxbindings"
dynamic_pip_install.install(package)
import soxbindings as sox
except:
raise RuntimeError("Can not install soxbindings on your system." )
except Exception:
raise RuntimeError(
"Can not install soxbindings on your system.")
self.sox = sox
if utt2ratio is not None:
self.utt2ratio = {}
@ -200,7 +202,7 @@ class SpeedPerturbationSox():
else:
ratio = self.state.uniform(self.lower, self.upper)
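# note: use the module handle saved in __init__ (self.sox); the bare
# 'sox' name from the import fallback is not bound in this method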
tfm = sox.Transformer()
tfm = self.sox.Transformer()
tfm.set_globals(multithread=False)
tfm.speed(ratio)
y = tfm.build_array(input_array=x, sample_rate_in=self.sr)

@ -137,6 +137,10 @@ class Frontend():
phones_list.append(phones)
if merge_sentences:
merge_list = sum(phones_list, [])
# remove the last 'sp' to avoid noise at the end, because
# there is no 'sp' at the end in the training data
if merge_list[-1] == 'sp':
merge_list = merge_list[:-1]
phones_list = []
phones_list.append(merge_list)
return phones_list

@ -46,7 +46,6 @@ requirements = {
"paddleaudio",
"paddlespeech_feat",
"praatio~=4.1",
"pypi-kenlm",
"pypinyin",
"python-dateutil",
"pyworld",
@ -71,6 +70,7 @@ requirements = {
"phkit",
"Pillow",
"pybind11",
"pypi-kenlm",
"snakeviz",
"sox",
"soxbindings",

@ -5,6 +5,7 @@ import functools
from pathlib import Path
import jsonlines
from utils.utility import add_arguments
from utils.utility import print_arguments
