Merge pull request #1341 from Jackwaterveg/r0.1

[Version]r0.1.1
Jackwaterveg committed by GitHub
commit 3d5aac6a94

.gitignore

@ -15,6 +15,7 @@
build build
docs/build/ docs/build/
docs/topic/ctc/warp-ctc/
tools/venv tools/venv
tools/kenlm tools/kenlm

@ -32,6 +32,12 @@ pull_request_rules:
actions: actions:
label: label:
remove: ["conflicts"] remove: ["conflicts"]
- name: "auto add label=Dataset"
conditions:
- files~=^dataset/
actions:
label:
add: ["Dataset"]
- name: "auto add label=S2T" - name: "auto add label=S2T"
conditions: conditions:
- files~=^paddlespeech/s2t/ - files~=^paddlespeech/s2t/
@ -50,18 +56,30 @@ pull_request_rules:
actions: actions:
label: label:
add: ["Audio"] add: ["Audio"]
- name: "auto add label=TextProcess" - name: "auto add label=Vector"
conditions:
- files~=^paddlespeech/vector/
actions:
label:
add: ["Vector"]
- name: "auto add label=Text"
conditions: conditions:
- files~=^paddlespeech/text/ - files~=^paddlespeech/text/
actions: actions:
label: label:
add: ["TextProcess"] add: ["Text"]
- name: "auto add label=Example" - name: "auto add label=Example"
conditions: conditions:
- files~=^examples/ - files~=^examples/
actions: actions:
label: label:
add: ["Example"] add: ["Example"]
- name: "auto add label=CLI"
conditions:
- files~=^paddlespeech/cli
actions:
label:
add: ["CLI"]
- name: "auto add label=Demo" - name: "auto add label=Demo"
conditions: conditions:
- files~=^demos/ - files~=^demos/
@ -70,13 +88,13 @@ pull_request_rules:
add: ["Demo"] add: ["Demo"]
- name: "auto add label=README" - name: "auto add label=README"
conditions: conditions:
- files~=README.md - files~=(README.md|README_cn.md)
actions: actions:
label: label:
add: ["README"] add: ["README"]
- name: "auto add label=Documentation" - name: "auto add label=Documentation"
conditions: conditions:
- files~=^docs/ - files~=^(docs/|CHANGELOG.md|paddleaudio/CHANGELOG.md)
actions: actions:
label: label:
add: ["Documentation"] add: ["Documentation"]
@ -88,10 +106,16 @@ pull_request_rules:
add: ["CI"] add: ["CI"]
- name: "auto add label=Installation" - name: "auto add label=Installation"
conditions: conditions:
- files~=^(tools/|setup.py|setup.sh) - files~=^(tools/|setup.py|setup.cfg|setup_audio.py)
actions: actions:
label: label:
add: ["Installation"] add: ["Installation"]
- name: "auto add label=Test"
conditions:
- files~=^(tests/)
actions:
label:
add: ["Test"]
- name: "auto add label=mergify" - name: "auto add label=mergify"
conditions: conditions:
- files~=^.mergify.yml - files~=^.mergify.yml

@ -0,0 +1,11 @@
# Changelog
Date: 2022-1-10, Author: Jackwaterveg.
Add features to CLI:
- Support English (librispeech/asr1/transformer).
- Support choosing `decode_method` for conformer and transformer models.
- Refactor the config, using the unified config.
- PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297
***
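As a quick, hedged illustration of the new English CLI support described above (the first command follows the ASR demo later in this diff; the `--decode_method` flag is an assumption based on PR #1297 and should be confirmed with `paddlespeech asr --help`):
```bash
# English ASR with the transformer model trained on LibriSpeech (see the ASR demo README in this PR)
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
# Hypothetical: choosing a decode method for conformer/transformer models, assuming the option is exposed as --decode_method
paddlespeech asr --model conformer_wenetspeech --decode_method attention_rescoring --input ./zh.wav
```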

@ -7,7 +7,7 @@
<h3> <h3>
<a href="#quick-start"> Quick Start </a> <a href="#quick-start"> Quick Start </a>
| <a href="#tutorials"> Tutorials </a> | <a href="#documents"> Documents </a>
| <a href="#model-list"> Models List </a> | <a href="#model-list"> Models List </a>
</div> </div>
@ -25,14 +25,6 @@
<a href="https://huggingface.co/spaces"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue"></a> <a href="https://huggingface.co/spaces"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue"></a>
</p> </p>
<!---
from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readmes-readable.md
1.What is this repo or project? (You can reuse the repo description you used earlier because this section doesnt have to be long.)
2.How does it work?
3.Who will use this repo or project?
4.What is the goal of this project?
-->
**PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-the-art and influential models. **PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-the-art and influential models.
@ -61,7 +53,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
</td> </td>
<td>我认为跑步最重要的就是给我带来了身体健康。</td> <td>我认为跑步最重要的就是给我带来了身体健康。</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
@ -95,7 +86,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
<table style="width:100%"> <table style="width:100%">
<thead> <thead>
<tr> <tr>
<th><img width="200" height="1"> Input Text <img width="200" height="1"> </th> <th width="550" > Input Text</th>
<th>Synthetic Audio</th> <th>Synthetic Audio</th>
</tr> </tr>
</thead> </thead>
@ -114,6 +105,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br> <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td> </td>
</tr> </tr>
<tr>
<td >季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。</td>
<td align = "center">
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/jijiji.wav" rel="nofollow">
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td>
</tr>
</tbody> </tbody>
</table> </table>
@ -121,7 +119,39 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html). For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html).
### Features: ##### Punctuation Restoration
<div align = "center">
<table style="width:100%">
<thead>
<tr>
<th width="390"> Input Text </th>
<th width="390"> Output Text </th>
</tr>
</thead>
<tbody>
<tr>
<td>今天的天气真不错啊你下午有空吗我想约你一起去吃饭</td>
<td>今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。</td>
</tr>
</tbody>
</table>
</div>
### ⭐ Examples
- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate a virtual human's voice.**
<div align="center"><a href="https://www.bilibili.com/video/BV1cL411V71o?share_source=copy_web"><img src="https://ai-studio-static-online.cdn.bcebos.com/06fd746ab32042f398fb6f33f873e6869e846fe63c214596ae37860fe8103720" width="500px" /></a></div>
### 🔥 Hot Activities
- 2021.12.21~12.24
4-Day Live Courses: In-depth interpretation of PaddleSpeech!
**Courses videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130**
### Features
Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features: Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features:
- 📦 **Ease of Use**: low barriers to install, and [CLI](#quick-start) is available to quick-start your journey. - 📦 **Ease of Use**: low barriers to install, and [CLI](#quick-start) is available to quick-start your journey.
@ -132,8 +162,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. - 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details.
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
### Recent Update:
<!--- <!---
2021.12.14: We would like to have an online courses to introduce basics and research of speech, as well as code practice with `paddlespeech`. Please pay attention to our [Calendar](https://www.paddlepaddle.org.cn/live). 2021.12.14: We would like to have an online courses to introduce basics and research of speech, as well as code practice with `paddlespeech`. Please pay attention to our [Calendar](https://www.paddlepaddle.org.cn/live).
@ -141,25 +170,22 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos on Hugging Face Spaces are available! - 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos on Hugging Face Spaces are available!
- 👏🏻 2021.12.10: PaddleSpeech CLI is available for Audio Classification, Automatic Speech Recognition, Speech Translation (English to Chinese) and Text-to-Speech. - 👏🏻 2021.12.10: PaddleSpeech CLI is available for Audio Classification, Automatic Speech Recognition, Speech Translation (English to Chinese) and Text-to-Speech.
### Communication ### Community
If you are in China, we recommend you to join our WeChat group to contact directly with our team members! - Scan the QR code below with your WeChat (reply 【语音】 after the friend request is approved) to join the official technical exchange group. We look forward to your participation.
<div align="center"> <div align="center">
<img src="./docs/images/wechat_group.png" width = "400" /> <img src="https://raw.githubusercontent.com/yt605155624/lanceTest/main/images/wechat_4.jpg" width = "300" />
</div> </div>
## Installation ## Installation
We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7*, where `paddlespeech` can be easily installed with `pip`: We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7*.
```python Up to now, **Linux** supports the CLI for all our tasks; **Mac OSX** and **Windows** only support the PaddleSpeech CLI for Audio Classification, Speech-to-Text and Text-to-Speech. To install `PaddleSpeech`, please see [installation](./docs/source/install.md).
pip install paddlepaddle paddlespeech
```
Up to now, **Linux** supports CLI for the all our tasks, **Mac OSX and Windows** only supports PaddleSpeech CLI for Audio Classification, Speech-to-Text and Text-to-Speech. Please see [installation](./docs/source/install.md) for other alternatives.
<a name="quickstart"></a>
## Quick Start ## Quick Start
Developers can try our models with [PaddleSpeech Command Line](./demos/README.md). Change `--input` to test your own audio/text. Developers can try our models with [PaddleSpeech Command Line](./paddlespeech/cli/README.md). Change `--input` to test your own audio/text.
**Audio Classification** **Audio Classification**
```shell ```shell
@ -177,10 +203,19 @@ paddlespeech st --input input_16k.wav
``` ```
**Text-to-Speech** **Text-to-Speech**
```shell ```shell
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav
``` ```
- web demo for Text to Speech is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: [TTS Demo](https://huggingface.co/spaces/akhaliq/paddlespeech) - web demo for Text to Speech is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: [TTS Demo](https://huggingface.co/spaces/akhaliq/paddlespeech)
**Text Postprocessing**
- Punctuation Restoration
```bash
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
```
For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md). If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
@ -190,10 +225,6 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
**Speech-to-Text** contains *Acoustic Model*, *Language Model*, and *Speech Translation*, with the following details: **Speech-to-Text** contains *Acoustic Model*, *Language Model*, and *Speech Translation*, with the following details:
<!---
The current hyperlinks redirect to [Previous Parakeet](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples).
-->
<table style="width:100%"> <table style="width:100%">
<thead> <thead>
<tr> <tr>
@ -313,7 +344,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
</td> </td>
</tr> </tr>
<tr> <tr>
<td rowspan="3">Vocoder</td> <td rowspan="5">Vocoder</td>
<td >WaveFlow</td> <td >WaveFlow</td>
<td >LJSpeech</td> <td >LJSpeech</td>
<td> <td>
@ -334,6 +365,20 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
<a href = "./examples/csmsc/voc3">Multi Band MelGAN-csmsc</a> <a href = "./examples/csmsc/voc3">Multi Band MelGAN-csmsc</a>
</td> </td>
</tr> </tr>
<tr>
<td >Style MelGAN</td>
<td >CSMSC</td>
<td>
<a href = "./examples/csmsc/voc4">Style MelGAN-csmsc</a>
</td>
</tr>
<tr>
<td >HiFiGAN</td>
<td >CSMSC</td>
<td>
<a href = "./examples/csmsc/voc5">HiFiGAN-csmsc</a>
</td>
</tr>
<tr> <tr>
<td rowspan="3">Voice Cloning</td> <td rowspan="3">Voice Cloning</td>
<td>GE2E</td> <td>GE2E</td>
@ -383,11 +428,37 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
</tbody> </tbody>
</table> </table>
**Punctuation Restoration**
<table style="width:100%">
<thead>
<tr>
<th> Task </th>
<th> Dataset </th>
<th> Model Type </th>
<th> Link </th>
</tr>
</thead>
<tbody>
<tr>
<td>Punctuation Restoration</td>
<td>IWSLT2012_zh</td>
<td>Ernie Linear</td>
<td>
<a href = "./examples/iwslt2012/punc0">iwslt2012-punc0</a>
</td>
</tr>
</tbody>
</table>
## Documents ## Documents
Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](https://paperswithcode.com/area/audio) and [Music SoTA](https://paperswithcode.com/area/music) give you an overview of the hot academic topics in the related area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](https://paperswithcode.com/area/audio) and [Music SoTA](https://paperswithcode.com/area/music) give you an overview of the hot academic topics in the related area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas.
- [Installation](./docs/source/install.md) - [Installation](./docs/source/install.md)
- [Quick Start](#quickstart)
- [Some Demos](./demos/README.md)
- Tutorials - Tutorials
- [Automatic Speech Recognition](./docs/source/asr/quick_start.md) - [Automatic Speech Recognition](./docs/source/asr/quick_start.md)
- [Introduction](./docs/source/asr/models_introduction.md) - [Introduction](./docs/source/asr/models_introduction.md)
@ -399,9 +470,12 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht
- [Advanced Usage](./docs/source/tts/advanced_usage.md) - [Advanced Usage](./docs/source/tts/advanced_usage.md)
- [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md) - [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md)
- [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
- Audio Classification - [Audio Classification](./demos/audio_tagging/README.md)
- Speech Translation - [Speech Translation](./demos/speech_translation/README.md)
- [Released Models](./docs/source/released_model.md) - [Released Models](./docs/source/released_model.md)
- [Community](#Community)
- [Welcome to contribute](#contribution)
- [License](#License)
The Text-to-Speech module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components. The Text-to-Speech module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components.
@ -416,7 +490,7 @@ howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}},
year={2021} year={2021}
} }
``` ```
<a name="contribution"></a>
## Contribute to PaddleSpeech ## Contribute to PaddleSpeech
You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we highly appreciate if you are willing to contribute to this project! You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we highly appreciate if you are willing to contribute to this project!
@ -460,13 +534,16 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
## Acknowledgement ## Acknowledgement
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling) for years of attention, constructive advice and great help.
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
- Many thanks to [AK391](https://github.com/AK391) for TTS web demo on Huggingface Spaces using Gradio. - Many thanks to [AK391](https://github.com/AK391) for TTS web demo on Huggingface Spaces using Gradio.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.
- Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function.
- Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model.
Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
<a name="License"></a>
## License ## License
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).

@ -1,4 +1,4 @@
(简体中文|[English](./README.md)) (简体中文|[English](./README.md))
<p align="center"> <p align="center">
<img src="./docs/images/PaddleSpeech_logo.png" /> <img src="./docs/images/PaddleSpeech_logo.png" />
</p> </p>
@ -6,7 +6,7 @@
<h3> <h3>
<a href="#quick-start"> 快速开始 </a> <a href="#quick-start"> 快速开始 </a>
| <a href="#documents"> 教程 </a> | <a href="#documents"> 教程文档 </a>
| <a href="#model-list"> 模型列表 </a> | <a href="#model-list"> 模型列表 </a>
</div> </div>
@ -30,7 +30,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
4.What is the goal of this project? 4.What is the goal of this project?
--> -->
**PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 深度学习开源框架平台上的一个开源模型库,用于语音和音频中的各种关键任务的开发,包含大量前沿和有影响力的模型,一些典型的应用示例如下: **PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发,包含大量基于深度学习前沿和有影响力的模型,一些典型的应用示例如下:
##### 语音识别 ##### 语音识别
<div align = "center"> <div align = "center">
@ -90,7 +90,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
<table style="width:100%"> <table style="width:100%">
<thead> <thead>
<tr> <tr>
<th><img width="200" height="1"> 输入文本 <img width="200" height="1"> </th> <th width="550">输入文本</th>
<th>合成音频</th> <th>合成音频</th>
</tr> </tr>
</thead> </thead>
@ -109,6 +109,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br> <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td> </td>
</tr> </tr>
<tr>
<td >季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。</td>
<td align = "center">
<a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/jijiji.wav" rel="nofollow">
<img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
</td>
</tr>
</tbody> </tbody>
</table> </table>
@ -116,7 +123,39 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
更多合成音频,可以参考 [PaddleSpeech 语音合成音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)。 更多合成音频,可以参考 [PaddleSpeech 语音合成音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)。
### 特性: ##### 标点恢复
<div align = "center">
<table style="width:100%">
<thead>
<tr>
<th width="390"> 输入文本 </th>
<th width="390"> 输出文本 </th>
</tr>
</thead>
<tbody>
<tr>
<td>今天的天气真不错啊你下午有空吗我想约你一起去吃饭</td>
<td>今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。</td>
</tr>
</tbody>
</table>
</div>
### ⭐ 应用案例
- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。**
<div align="center"><a href="https://www.bilibili.com/video/BV1cL411V71o?share_source=copy_web"><img src="https://ai-studio-static-online.cdn.bcebos.com/06fd746ab32042f398fb6f33f873e6869e846fe63c214596ae37860fe8103720" / width="500px"></a></div>
### 🔥 热门活动
- 2021.12.21~12.24
4 日直播课: 深度解读 PaddleSpeech 语音技术!
**直播回放与课件资料: https://aistudio.baidu.com/aistudio/education/group/info/25130**
### 特性
本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括
- 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 - 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。
@ -127,7 +166,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
- 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC详情请见 [模型列表](#model-list)。 - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC详情请见 [模型列表](#model-list)。
- 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。
### 近期更新: ### 近期更新
<!--- <!---
2021.12.14: We would like to have an online courses to introduce basics and research of speech, as well as code practice with `paddlespeech`. Please pay attention to our [Calendar](https://www.paddlepaddle.org.cn/live). 2021.12.14: We would like to have an online courses to introduce basics and research of speech, as well as code practice with `paddlespeech`. Please pay attention to our [Calendar](https://www.paddlepaddle.org.cn/live).
@ -135,21 +174,17 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
- 🤗 2021.12.14: 我们在 Hugging Face Spaces 上的 [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) 以及 [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos 上线啦! - 🤗 2021.12.14: 我们在 Hugging Face Spaces 上的 [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) 以及 [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos 上线啦!
- 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。 - 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。
### 交流 ### 技术交流
欢迎加入以下微信群,直接和 PaddleSpeech 团队成员进行交流! 微信扫描二维码(好友申请通过后回复【语音】)加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
<div align="center"> <div align="center">
<img src="./docs/images/wechat_group.png" width = "400" /> <img src="https://raw.githubusercontent.com/yt605155624/lanceTest/main/images/wechat_4.jpg" width = "300" />
</div> </div>
## 安装 ## 安装
我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。这种情况下安装 `paddlespeech` 只需要一条 `pip` 命令: 我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。
```python 目前为止,**Linux** 支持声音分类、语音识别、语音合成和语音翻译四种功能,**Mac OSX、 Windows** 下暂不支持语音翻译功能。 想了解具体安装细节,可以参考[安装文档](./docs/source/install_cn.md)。
pip install paddlepaddle paddlespeech
```
目前为止,**Linux** 支持声音分类、语音识别、语音合成和语音翻译四种功能,**Mac OSX、 Windows** 下暂不支持语音翻译功能。 想了解更多安装细节,可以参考[安装文档](./docs/source/install_cn.md)。
## 快速开始 ## 快速开始
@ -171,22 +206,26 @@ paddlespeech st --input input_16k.wav
```shell ```shell
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
``` ```
- 语音合成的 web demo 已经集成进了 [Huggingface Spaces](https://huggingface.co/spaces). 请参考: [TTS Demo](https://huggingface.co/spaces/akhaliq/paddlespeech)
**文本后处理**
- 标点恢复
```bash
paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
```
更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
> Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md) [语音合成](./docs/source/tts/quick_start.md)。 > Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md) [语音合成](./docs/source/tts/quick_start.md)。
## 模型列表 ## 模型列表
PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。 PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。
PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识别语言模型和语音翻译, 详情如下: PaddleSpeech 的 **语音转文本** 包含语音识别声学模型、语音识别语言模型和语音翻译, 详情如下:
<!---
The current hyperlinks redirect to [Previous Parakeet](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples).
-->
<table style="width:100%"> <table style="width:100%">
<thead> <thead>
<tr> <tr>
<th>语音识别模块种类</th> <th>语音转文本模块类型</th>
<th>数据集</th> <th>数据集</th>
<th>模型种类</th> <th>模型种类</th>
<th>链接</th> <th>链接</th>
@ -254,6 +293,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
</tbody> </tbody>
</table> </table>
<a name="语音合成模型"></a>
PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声学模型和声码器。声学模型和声码器模型如下: PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声学模型和声码器。声学模型和声码器模型如下:
<table> <table>
@ -261,8 +301,8 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
<tr> <tr>
<th> 语音合成模块类型 </th> <th> 语音合成模块类型 </th>
<th> 模型种类 </th> <th> 模型种类 </th>
<th> <img width="50" height="1"> 数据集 <img width="50" height="1"> </th> <th> 数据集 </th>
<th> <img width="101" height="1"> 链接 <img width="105" height="1"> </th> <th> 链接 </th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
@ -302,7 +342,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
</td> </td>
</tr> </tr>
<tr> <tr>
<td rowspan="3">声码器</td> <td rowspan="5">声码器</td>
<td >WaveFlow</td> <td >WaveFlow</td>
<td >LJSpeech</td> <td >LJSpeech</td>
<td> <td>
@ -323,6 +363,20 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
<a href = "./examples/csmsc/voc3">Multi Band MelGAN-csmsc</a> <a href = "./examples/csmsc/voc3">Multi Band MelGAN-csmsc</a>
</td> </td>
</tr> </tr>
<tr>
<td >Style MelGAN</td>
<td >CSMSC</td>
<td>
<a href = "./examples/csmsc/voc4">Style MelGAN-csmsc</a>
</td>
</tr>
<tr>
<td >HiFiGAN</td>
<td >CSMSC</td>
<td>
<a href = "./examples/csmsc/voc5">HiFiGAN-csmsc</a>
</td>
</tr>
<tr> <tr>
<td rowspan="3">声音克隆</td> <td rowspan="3">声音克隆</td>
<td>GE2E</td> <td>GE2E</td>
@ -348,6 +402,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
</tbody> </tbody>
</table> </table>
<a name="声音分类模型"></a>
**声音分类** **声音分类**
<table style="width:100%"> <table style="width:100%">
@ -373,25 +428,62 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
</tbody> </tbody>
</table> </table>
## 文档 **标点恢复**
[语音 SoTA](https://paperswithcode.com/area/speech)、[声音 SoTA](https://paperswithcode.com/area/audio)、[音乐 SoTA](https://paperswithcode.com/area/music) 概述了相关领域的热门学术话题。对于 PaddleSpeech 的所关注的任务,以下指南有助于掌握核心思想。 <table style="width:100%">
<thead>
<tr>
<th> 任务 </th>
<th> 数据集 </th>
<th> 模型种类 </th>
<th> 链接 </th>
</tr>
</thead>
<tbody>
- [安装](./docs/source/install.md) <tr>
- 教程 <td>标点恢复</td>
- [语音识别](./docs/source/asr/quick_start.md) <td>IWLST2012_zh</td>
<td>Ernie Linear</td>
<td>
<a href = "./examples/iwslt2012/punc0">iwslt2012-punc0</a>
</td>
</tr>
</tbody>
</table>
## 教程文档
对于 PaddleSpeech 所关注的任务,以下指南有助于开发者快速入门,了解语音相关核心思想。
- [下载安装](./docs/source/install_cn.md)
- [快速开始](#快速开始)
- Notebook基础教程
- [声音分类](./docs/tutorial/cls/cls_tutorial.ipynb)
- [语音识别](./docs/tutorial/asr/tutorial_transformer.ipynb)
- [语音翻译](./docs/tutorial/st/st_tutorial.ipynb)
- [声音合成](./docs/tutorial/tts/tts_tutorial.ipynb)
- [示例Demo](./demos/README.md)
- 进阶文档
- [语音识别自定义训练](./docs/source/asr/quick_start.md)
- [简介](./docs/source/asr/models_introduction.md) - [简介](./docs/source/asr/models_introduction.md)
- [数据准备](./docs/source/asr/data_preparation.md) - [数据准备](./docs/source/asr/data_preparation.md)
- [数据增强](./docs/source/asr/augmentation.md) - [数据增强](./docs/source/asr/augmentation.md)
- [Ngram 语言模型](./docs/source/asr/ngram_lm.md) - [Ngram 语言模型](./docs/source/asr/ngram_lm.md)
- [语音合成](./docs/source/tts/quick_start.md) - [语音合成自定义训练](./docs/source/tts/quick_start.md)
- [简介](./docs/source/tts/models_introduction.md) - [简介](./docs/source/tts/models_introduction.md)
- [进阶用法](./docs/source/tts/advanced_usage.md) - [进阶用法](./docs/source/tts/advanced_usage.md)
- [中文文本前端](./docs/source/tts/zh_text_frontend.md) - [中文文本前端](./docs/source/tts/zh_text_frontend.md)
- [音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - [测试语音样本](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
- 声音分类 - [声音分类](./demos/audio_tagging/README_cn.md)
- 语音翻译 - [语音翻译](./demos/speech_translation/README_cn.md)
- [模型](./docs/source/released_model.md) - [模型列表](#模型列表)
- [语音识别](#语音识别模型)
- [语音合成](#语音合成模型)
- [声音分类](#声音分类模型)
- [技术交流群](#技术交流群)
- [欢迎贡献](#欢迎贡献)
- [License](#License)
语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。 语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。
@ -408,9 +500,9 @@ year={2021}
} }
``` ```
<a name="欢迎贡献"></a>
## 参与 PaddleSpeech 的开发 ## 参与 PaddleSpeech 的开发
热烈欢迎您在[Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) 中提交问题,并在[Issues](https://github.com/PaddlePaddle/PaddleSpeech/issues) 中指出发现的 bug。此外我们非常希望您参与到 PaddleSpeech 的开发中! 热烈欢迎您在[Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) 中提交问题,并在[Issues](https://github.com/PaddlePaddle/PaddleSpeech/issues) 中指出发现的 bug。此外我们非常希望您参与到 PaddleSpeech 的开发中!
### 贡献者 ### 贡献者
@ -452,9 +544,11 @@ year={2021}
## 致谢 ## 致谢
- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。
- 非常感谢 [AK391](https://github.com/AK391) 在 Huggingface Spaces 上使用 Gradio 对我们的语音合成功能进行网页版演示。 - 非常感谢 [AK391](https://github.com/AK391) 在 Huggingface Spaces 上使用 Gradio 对我们的语音合成功能进行网页版演示。
- 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。
- 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。
- 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。
此外PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 此外PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。

@ -1,10 +1,15 @@
# Speech Application based on PaddleSpeech # Speech Application based on PaddleSpeech
([简体中文](./README_cn.md)|English)
The directory contains many speech applications in multiple scenarios. The directory contains many speech applications in multiple scenarios.
* audio tagging - tag audio label in vedio * audio tagging - multi-label tagging of an audio file
* automatic_video_subtitiles - generate subtitles from a video
* metaverse - 2D AR with TTS * metaverse - 2D AR with TTS
* speech recognition - video understanding * punctuation_restoration - restore punctuation from raw text
* speech recognition - recognize text of an audio file
* speech translation - end to end speech translation * speech translation - end to end speech translation
* story talker - book reader based on OCR and TTS * story talker - book reader based on OCR and TTS
* style_fs2 - multi style control for FastSpeech2 model * style_fs2 - multi style control for FastSpeech2 model
* text_to_speech - convert text into speech

@ -0,0 +1,15 @@
# PaddleSpeech 语音应用 Demo
(简体中文|[English](./README.md))
该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo
* 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。
* 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。
* 元宇宙 - 基于语音合成的 2D 增强现实。
* 标点恢复 - 通常作为语音识别的文本后处理任务,为一段无标点的纯文本添加相应的标点符号。
* 语音识别 - 识别一段音频中包含的语音文字。
* 语音翻译 - 实时识别音频中的语言,并同时翻译成目标语言。
* 会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书。
* 个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成。
* 语音合成 - 基于给定的文本生成语音音频。

@ -9,9 +9,9 @@ This demo is an implementation to tag an audio file with 527 [AudioSet](https://
## Usage ## Usage
### 1. Installation ### 1. Installation
```bash see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
pip install paddlespeech
``` You can choose one way from easy, medium and hard to install paddlespeech.
### 2. Prepare Input File ### 2. Prepare Input File
The input of this demo should be a WAV file(`.wav`). The input of this demo should be a WAV file(`.wav`).

@ -9,9 +9,10 @@
## 使用方法 ## 使用方法
### 1. 安装 ### 1. 安装
```bash 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
pip install paddlespeech
``` 你可以从 easy、medium、hard 三种方式中选择一种方式安装。
### 2. 准备输入 ### 2. 准备输入
这个 demo 的输入应该是一个 WAV 文件(`.wav` 这个 demo 的输入应该是一个 WAV 文件(`.wav`

@ -8,9 +8,9 @@ This demo is an implementation to automatic video subtitles from a video file. I
## Usage ## Usage
### 1. Installation ### 1. Installation
```bash see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
pip install paddlespeech
``` You can choose one way from easy, medium and hard to install paddlespeech.
### 2. Prepare Input ### 2. Prepare Input
Get a video file with the speech of the specific language: Get a video file with the speech of the specific language:

@ -6,9 +6,10 @@
这个 demo 是一个为视频自动生成字幕的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 这个 demo 是一个为视频自动生成字幕的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。
## 使用方法 ## 使用方法
### 1. 安装 ### 1. 安装
```bash 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
pip install paddlespeech
``` 你可以从 easy、medium、hard 三种方式中选择一种方式安装。
### 2. 准备输入 ### 2. 准备输入
获取包含特定语言语音的视频文件: 获取包含特定语言语音的视频文件:
```bash ```bash

@ -37,17 +37,20 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# run tts # run tts
CUDA_VISIBLE_DEVICES=${gpus} \ CUDA_VISIBLE_DEVICES=${gpus} \
python3 ${BIN_DIR}/synthesize_e2e.py \ python3 ${BIN_DIR}/../synthesize_e2e.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ --am=fastspeech2_csmsc \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ --am_config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --am_ckpt=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ --am_stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --voc=pwgan_csmsc \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ --voc_config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=sentences.txt \ --text=sentences.txt \
--output-dir=output/wavs \ --output_dir=output/wavs \
--inference-dir=output/inference \ --inference_dir=output/inference \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt --phones_dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
# output/inference is not needed here, which save the static models # output/inference is not needed here, which save the static models
rm -rf output/inference rm -rf output/inference
fi fi

@ -7,9 +7,10 @@ This demo is an implementation to restore punctuation from raw text. It can be d
## Usage ## Usage
### 1. Installation ### 1. Installation
```bash see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
pip install paddlespeech
``` You can choose one way from easy, medium and hard to install paddlespeech.
### 2. Prepare Input ### 2. Prepare Input
The input of this demo should be a text of the specific language that can be passed via argument. The input of this demo should be a text of the specific language that can be passed via argument.

@ -9,9 +9,10 @@
## 使用方法 ## 使用方法
### 1. 安装 ### 1. 安装
```bash 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
pip install paddlespeech
``` 你可以从 easy、medium、hard 三种方式中选择一种方式安装。
### 2. 准备输入 ### 2. 准备输入
这个 demo 的输入是通过参数传递的特定语言的文本。 这个 demo 的输入是通过参数传递的特定语言的文本。

@ -8,9 +8,9 @@ This demo is an implementation to recognize text from a specific audio file. It
## Usage ## Usage
### 1. Installation ### 1. Installation
```bash see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
pip install paddlespeech
``` You can choose one way from easy, medium and hard to install paddlespeech.
### 2. Prepare Input File ### 2. Prepare Input File
The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model.
@ -23,8 +23,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
### 3. Usage ### 3. Usage
- Command Line(Recommended) - Command Line(Recommended)
```bash ```bash
# Chinese
paddlespeech asr --input ./zh.wav paddlespeech asr --input ./zh.wav
# English
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
``` ```
(It doesn't matter if the package `paddlespeech-ctcdecoders` is not found; it is optional.)
Usage: Usage:
```bash ```bash
paddlespeech asr --help paddlespeech asr --help
@ -36,11 +41,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- `sample_rate`: Sample rate of the model. Default: `16000`. - `sample_rate`: Sample rate of the model. Default: `16000`.
- `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
- `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
- `yes`: No additional argument is required. Once this flag is set, it means you accept all requests from the program by default, including automatically converting the audio sample rate (see the hedged example after this list). Default: `False`.
- `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
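A hedged usage sketch for the `yes` option above (the exact flag spelling is an assumption; confirm it with `paddlespeech asr --help`):
```bash
# Accept the program's prompts (e.g. automatic resampling of the input audio) without interaction
paddlespeech asr --input ./zh.wav --yes
```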
Output: Output:
```bash ```bash
# Chinese
[2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康
# English
[2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building
``` ```
- Python API - Python API
@ -56,6 +65,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
config=None, # Set `config` and `ckpt_path` to None to use pretrained model. config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
ckpt_path=None, ckpt_path=None,
audio_file='./zh.wav', audio_file='./zh.wav',
force_yes=False,
device=paddle.get_device()) device=paddle.get_device())
print('ASR Result: \n{}'.format(text)) print('ASR Result: \n{}'.format(text))
``` ```
@ -73,4 +83,4 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by
| Model | Language | Sample Rate | Model | Language | Sample Rate
| :--- | :---: | :---: | | :--- | :---: | :---: |
| conformer_wenetspeech| zh| 16000 | conformer_wenetspeech| zh| 16000
| transformer_aishell| zh| 16000 | transformer_librispeech| en| 16000

@ -2,14 +2,15 @@
# 语音识别 # 语音识别
## 介绍 ## 介绍
语音识别解决让计算机程序自动转录语音的问题 语音识别是一项用计算机程序自动转录语音的技术
这个 demo 是一个从给定音频文件识别文本的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 这个 demo 是一个从给定音频文件识别文本的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。
## 使用方法 ## 使用方法
### 1. 安装 ### 1. 安装
```bash 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
pip install paddlespeech
``` 你可以从 easy、medium、hard 三种方式中选择一种方式安装。
### 2. 准备输入 ### 2. 准备输入
这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
@ -20,8 +21,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
### 3. 使用方法 ### 3. 使用方法
- 命令行 (推荐使用) - 命令行 (推荐使用)
```bash ```bash
# 中文
paddlespeech asr --input ./zh.wav paddlespeech asr --input ./zh.wav
# 英文
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
``` ```
(如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error没有关系这个包是非必须的。)
使用方法: 使用方法:
```bash ```bash
paddlespeech asr --help paddlespeech asr --help
@ -33,11 +39,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
- `sample_rate`:音频采样率,默认值:`16000`。 - `sample_rate`:音频采样率,默认值:`16000`。
- `config`ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `config`ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。
- `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。
- `yes`:不需要设置额外的参数,一旦设置了该参数,说明你默认同意程序的所有请求,其中包括自动转换输入音频的采样率。默认值:`False`。
- `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。 - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。
输出: 输出:
```bash ```bash
# 中文
[2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康
# 英文
[2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building
``` ```
- Python API - Python API
@ -53,6 +63,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
config=None, # Set `config` and `ckpt_path` to None to use pretrained model. config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
ckpt_path=None, ckpt_path=None,
audio_file='./zh.wav', audio_file='./zh.wav',
force_yes=False,
device=paddle.get_device()) device=paddle.get_device())
print('ASR Result: \n{}'.format(text)) print('ASR Result: \n{}'.format(text))
``` ```
@ -69,4 +80,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
| 模型 | 语言 | 采样率 | 模型 | 语言 | 采样率
| :--- | :---: | :---: | | :--- | :---: | :---: |
| conformer_wenetspeech| zh| 16000 | conformer_wenetspeech| zh| 16000
| transformer_aishell| zh| 16000 | transformer_librispeech| en| 16000

@ -7,9 +7,10 @@ This demo is an implementation to recognize text from a specific audio file and
## Usage ## Usage
### 1. Installation ### 1. Installation
```bash see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
pip install paddlespeech
``` You can choose one way from easy, medium and hard to install paddlespeech.
### 2. Prepare Input File ### 2. Prepare Input File
The input of this demo should be a WAV file(`.wav`). The input of this demo should be a WAV file(`.wav`).

@ -8,9 +8,10 @@
## 使用方法 ## 使用方法
### 1. 安装 ### 1. 安装
```bash 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
pip install paddlespeech
``` 你可以从 easy、medium、hard 三种方式中选择一种方式安装。
### 2. 准备输入 ### 2. 准备输入
这个 Demo 的输入是 WAV(`.wav`) 语音文件 这个 Demo 的输入是 WAV(`.wav`) 语音文件

@ -37,17 +37,20 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# run tts # run tts
CUDA_VISIBLE_DEVICES=${gpus} \ CUDA_VISIBLE_DEVICES=${gpus} \
python3 ${BIN_DIR}/synthesize_e2e.py \ python3 ${BIN_DIR}/../synthesize_e2e.py \
--fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ --am=fastspeech2_csmsc \
--fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ --am_config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
--fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --am_ckpt=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
--pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ --am_stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --voc=pwgan_csmsc \
--pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ --voc_config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=output/sentences.txt \ --text=output/sentences.txt \
--output-dir=output/wavs \ --output_dir=output/wavs \
--inference-dir=output/inference \ --inference_dir=output/inference \
--phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt --phones_dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
# output/inference is not needed here, which save the static models # output/inference is not needed here, which save the static models
rm -rf output/inference rm -rf output/inference
fi fi

@ -8,9 +8,9 @@ This demo is an implementation to generate audio from the given text. It can be
## Usage ## Usage
### 1. Installation ### 1. Installation
```bash see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
pip install paddlespeech
``` You can choose one way from easy, medium and hard to install paddlespeech.
### 2. Prepare Input ### 2. Prepare Input
The input of this demo should be a text of the specific language that can be passed via argument. The input of this demo should be a text of the specific language that can be passed via argument.

@ -9,9 +9,10 @@
## 使用方法 ## 使用方法
### 1. 安装 ### 1. 安装
```bash 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
pip install paddlespeech
``` 你可以从 easy、medium、hard 三种方式中选择一种方式安装。
### 2. 准备输入 ### 2. 准备输入
这个 demo 的输入是通过参数传递的特定语言的文本。 这个 demo 的输入是通过参数传递的特定语言的文本。

@ -0,0 +1,128 @@
# Customize Dataset for Audio Classification
Following this tutorial you can customize your dataset for audio classification task by using `paddlespeech` and `paddleaudio`.
A base class of classification dataset is `paddleaudio.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`.
Assuming you have some wave files stored in a directory of your own, you should prepare a meta file with the information of file paths and labels. For example, its absolute path is `/PATH/TO/META_FILE.txt`:
```
/PATH/TO/WAVE_FILE/1.wav cat
/PATH/TO/WAVE_FILE/2.wav cat
/PATH/TO/WAVE_FILE/3.wav dog
/PATH/TO/WAVE_FILE/4.wav dog
```
Here is an example to build your custom dataset in `custom_dataset.py`:
```python
from paddleaudio.datasets.dataset import AudioClassificationDataset
class CustomDataset(AudioClassificationDataset):
    meta_file = '/PATH/TO/META_FILE.txt'
    # List all the class labels
    label_list = [
        'cat',
        'dog',
    ]

    def __init__(self, **kwargs):
        files, labels = self._get_data()
        super(CustomDataset, self).__init__(
            files=files, labels=labels, feat_type='raw', **kwargs)

    def _get_data(self):
        '''
        This method offers the information of wave files and labels.
        '''
        files = []
        labels = []
        with open(self.meta_file) as f:
            for line in f:
                file, label_str = line.strip().split(' ')
                files.append(file)
                labels.append(self.label_list.index(label_str))
        return files, labels
```
Then you can build dataset and data loader from `CustomDataset`:
```python
import paddle
from paddleaudio.features import LogMelSpectrogram
from custom_dataset import CustomDataset
# Feature config should be aligned with the pretrained model
sample_rate = 32000
feat_conf = {
    'sr': sample_rate,
    'n_fft': 1024,
    'hop_length': 320,
    'window': 'hann',
    'win_length': 1024,
    'f_min': 50.0,
    'f_max': 14000.0,
    'n_mels': 64,
}

train_ds = CustomDataset(sample_rate=sample_rate)
feature_extractor = LogMelSpectrogram(**feat_conf)

train_sampler = paddle.io.DistributedBatchSampler(
    train_ds, batch_size=4, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
    train_ds,
    batch_sampler=train_sampler,
    return_list=True,
    use_buffer_reader=True)
```
Train a model with `CustomDataset`:
```python
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier

backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(train_ds.label_list))
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-6, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()

steps_per_epoch = len(train_sampler)
epochs = 10
for epoch in range(1, epochs + 1):
    model.train()
    for batch_idx, batch in enumerate(train_loader):
        waveforms, labels = batch
        # Need a padding when lengths of waveforms differ in a batch.
        feats = feature_extractor(waveforms)
        feats = paddle.transpose(feats, [0, 2, 1])
        logits = model(feats)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        if isinstance(optimizer._learning_rate,
                      paddle.optimizer.lr.LRScheduler):
            optimizer._learning_rate.step()
        optimizer.clear_grad()

        # Calculate loss
        avg_loss = loss.numpy()[0]

        # Calculate metrics
        preds = paddle.argmax(logits, axis=1)
        num_corrects = (preds == labels).numpy().sum()
        num_samples = feats.shape[0]
        avg_acc = num_corrects / num_samples

        print_msg = 'Epoch={}/{}, Step={}/{}'.format(
            epoch, epochs, batch_idx + 1, steps_per_epoch)
        print_msg += ' loss={:.4f}'.format(avg_loss)
        print_msg += ' acc={:.4f}'.format(avg_acc)
        print_msg += ' lr={:.6f}'.format(optimizer.get_lr())
        print(print_msg)
```
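The padding mentioned in the comment above is not shown in the loop. A minimal sketch of a `collate_fn` that zero-pads waveforms, assuming each dataset item is a `(waveform, label)` pair of 1-D numpy arrays, might look like this:
```python
import numpy as np
import paddle

def pad_collate(batch):
    # Hypothetical collate_fn: zero-pad every waveform to the longest one in the batch.
    waveforms, labels = zip(*batch)
    max_len = max(w.shape[-1] for w in waveforms)
    padded = np.stack([np.pad(w, (0, max_len - w.shape[-1])) for w in waveforms])
    return (paddle.to_tensor(padded, dtype='float32'),
            paddle.to_tensor(np.array(labels), dtype='int64'))

# It could then be passed to the data loader built earlier, e.g.
# train_loader = paddle.io.DataLoader(
#     train_ds, batch_sampler=train_sampler, collate_fn=pad_collate, return_list=True)
```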
If you want to save model checkpoints and evaluate on a specific dataset, please see `paddlespeech/cli/exp/panns/train.py` for more details.
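For the full checkpoint-saving and evaluation logic, see the script referenced above. A minimal sketch of saving a checkpoint at the end of an epoch (the file names and directory layout here are illustrative, not the official ones) could be:
```python
import os
import paddle

ckpt_dir = os.path.join('checkpoint', 'epoch_{}'.format(epoch))
os.makedirs(ckpt_dir, exist_ok=True)
# Save model weights and optimizer state separately
paddle.save(model.state_dict(), os.path.join(ckpt_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(ckpt_dir, 'model.pdopt'))
```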

@ -0,0 +1,51 @@
# Quick Start of Audio Classification
Several shell scripts provided in `./examples/esc50/cls0` will help us quickly give it a try, covering most major modules, including data preparation, model training, and model evaluation, with the [ESC50](https://github.com/karolpiczak/ESC-50) dataset.
Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead.
Let's start an audio classification task with the following steps:
- Go to the directory
```bash
cd examples/esc50/cls0
```
- Source env
```bash
source path.sh
```
- Main entry point
```bash
CUDA_VISIBLE_DEVICES=0 ./run.sh 1
```
This demo includes fine-tuning, evaluating, and deploying an audio classification model. More detailed information is provided in the following sections.
## Fine-tuning a model
PANNs ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are models pretrained on [Audioset](https://research.google.com/audioset/). They can be easily used to extract audio embeddings for the audio classification task.
To start a model fine-tuning, please run:
```bash
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
feat_backend=numpy
./local/train.sh ${ngpu} ${feat_backend}
```
## Deploy a model
Once you have saved a model checkpoint, you can export it to a static graph and deploy it with Python scripts:
- Export to a static graph
```bash
./local/export.sh ${ckpt_dir} ./export
```
The argument `ckpt_dir` should be a directory in which a model checkpoint is stored, for example `checkpoint/epoch_50`.
The static graph will be exported to `./export`.
- Inference
```bash
./local/static_model_infer.sh ${infer_device} ./export ${audio_file}
```
The argument `infer_device` can be `cpu` or `gpu`, and it specifies which device is used for inference. `audio_file` should be a wave file named `*.wav`.

@ -6,16 +6,17 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t
|:---- |:----------------------------------------------------------- |:----| |:---- |:----------------------------------------------------------- |:----|
| Easy | (1) Use command-line functions of PaddleSpeech. <br> (2) Experience PaddleSpeech on Ai Studio. | Linux, Mac (not support M1 chip), Windows | | Easy | (1) Use command-line functions of PaddleSpeech. <br> (2) Experience PaddleSpeech on Ai Studio. | Linux, Mac (not support M1 chip), Windows |
| Medium | Support major functions such as using the` ready-made `examples and using PaddleSpeech to train your model. | Linux | | Medium | Support major functions such as using the` ready-made `examples and using PaddleSpeech to train your model. | Linux |
| Hard | Support full function of Paddlespeechincluding training n-gram language model, Montreal-Forced-Aligner, and so on. And you are more able to be a developer! | Ubuntu | | Hard | Support full function of Paddlespeech, including using join ctc decoder with kaldi, training n-gram language model, Montreal-Forced-Aligner, and so on. And you are more able to be a developer! | Ubuntu |
## Prerequisites ## Prerequisites
- Python >= 3.7 - Python >= 3.7
- PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
- C++ compilation environment - C++ compilation environment
- Tip: For Linux and Mac, do not use the command `sh` instead of `bash` in the installation document. - Tip: For Linux and Mac, do not use the command `sh` instead of `bash` in the installation document.
- Tip: We recommend you install `paddlepaddle` from https://mirror.baidu.com/pypi/simple and `paddlespeech` from https://pypi.tuna.tsinghua.edu.cn/simple.
## Easy: Get the Basic Function (Support Linux, Mac, and Windows) ## Easy: Get the Basic Function (Support Linux, Mac, and Windows)
- If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step tutorial for `PaddleSpeech` and you can use the basic function of `PaddleSpeech` with a free machine. - If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use the basic function of `PaddleSpeech` with a free machine.
- If you want to use the command line function of Paddlespeech, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command line function, you can see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli). - If you want to use the command line function of Paddlespeech, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command line function, you can see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli).
### Install Conda ### Install Conda
Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda. Conda is an environment management system. You can go to [miniconda](https://docs.conda.io/en/latest/miniconda.html) (select a version with py>=3.7) to download and install conda.
@ -29,6 +30,10 @@ conda install -y -c conda-forge sox libsndfile bzip2
#### Windows #### Windows
You need to install `Visual Studio` to make the C++ compilation environment. You need to install `Visual Studio` to make the C++ compilation environment.
https://visualstudio.microsoft.com/visual-cpp-build-tools/
You can also see [#1195](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1195) for more help.
#### Mac #### Mac
```bash ```bash
brew install gcc brew install gcc
@ -47,10 +52,19 @@ sudo apt install build-essential
conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
``` ```
### Install PaddleSpeech ### Install PaddleSpeech
You can use the following command: Some users may fail to install `kaldiio` due to the default download source; in that case, install `pytest-runner` first:
```bash ```bash
pip install paddlepaddle paddlespeech pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
Then you can use the following commands:
```bash
pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple
```
> If you encounter problems downloading **nltk_data** while using paddlespeech, it may be due to a poor network connection; we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) we provide and extract it to your `${HOME}`.
> If you fail to install paddlespeech-ctcdecoders, don't worry; it does not affect normal usage.
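If you prefer to fetch **nltk_data** programmatically, here is a small sketch; the URL is the one given above and the target directory is `${HOME}` as stated:
```python
import os
import tarfile
import urllib.request

url = 'https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz'
home = os.path.expanduser('~')
archive = os.path.join(home, 'nltk_data.tar.gz')

urllib.request.urlretrieve(url, archive)   # download the prepared archive
with tarfile.open(archive) as tar:
    tar.extractall(home)                   # extract nltk_data into ${HOME}
```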
## Medium: Get the Major Functions (Support Linux) ## Medium: Get the Major Functions (Support Linux)
If you want to get the major functions of `paddlespeech`, you need to do the following steps: If you want to get the major functions of `paddlespeech`, you need to do the following steps:
### Git clone PaddleSpeech ### Git clone PaddleSpeech
@ -105,13 +119,15 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
### Install PaddlePaddle ### Install PaddlePaddle
You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.2.0: You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.2.0:
```bash ```bash
python3 -m pip install paddlepaddle-gpu==2.2.0 python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple
``` ```
### Install PaddleSpeech ### Install PaddleSpeech
You can install `paddlespeech` with the following commands; then you can use the `ready-made` examples in `paddlespeech`: You can install `paddlespeech` with the following commands; then you can use the `ready-made` examples in `paddlespeech`:
```bash ```bash
# Some users may fail to install `kaldiio` due to the default download source; install `pytest-runner` first
pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
# Make sure you are in the root directory of PaddleSpeech # Make sure you are in the root directory of PaddleSpeech
pip install . pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
## Hard: Get the Full Function (Support Ubuntu) ## Hard: Get the Full Function (Support Ubuntu)
@ -175,14 +191,17 @@ conda activate tools/venv
conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc
``` ```
### Install PaddlePaddle ### Install PaddlePaddle
Some users may fail to install `kaldiio` due to the default download source; in that case, install `pytest-runner` first:
```bash
pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
```
Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0: Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0:
```bash ```bash
python3 -m pip install paddlepaddle-gpu==2.2.0 python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple
``` ```
### Install PaddleSpeech in Developing Mode ### Install PaddleSpeech in Developing Mode
```bash ```bash
pip install -e .[develop] pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
### Install the Kaldi (Optional) ### Install the Kaldi (Optional)
```bash ```bash

@ -5,16 +5,18 @@
| :--- | :----------------------------------------------------------- | :------------------ | | :--- | :----------------------------------------------------------- | :------------------ |
| 简单 | (1) 使用 PaddleSpeech 的命令行功能. <br> (2) 在 Aistudio上体验 PaddleSpeech. | Linux, Mac(不支持M1芯片)Windows | | 简单 | (1) 使用 PaddleSpeech 的命令行功能. <br> (2) 在 Aistudio上体验 PaddleSpeech. | Linux, Mac(不支持M1芯片)Windows |
| 中等 | 支持 PaddleSpeech 主要功能,比如使用已有 examples 中的模型和使用 PaddleSpeech 来训练自己的模型. | Linux | | 中等 | 支持 PaddleSpeech 主要功能,比如使用已有 examples 中的模型和使用 PaddleSpeech 来训练自己的模型. | Linux |
| 困难 | 支持 PaddleSpeech 的各项功能,包含训练语言模型,使用强制对齐等。并且你更能成为一名开发者! | Ubuntu | | 困难 | 支持 PaddleSpeech 的各项功能,包含结合kaldi使用 join ctc decoder 方式解码,训练语言模型,使用强制对齐等。并且你更能成为一名开发者! | Ubuntu |
## 先决条件 ## 先决条件
- Python >= 3.7 - Python >= 3.7
- 最新版本的 PaddlePaddle (请看 [安装向导](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - 最新版本的 PaddlePaddle (请看 [安装向导](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
- C++ 编译环境 - C++ 编译环境
- 提示: 对于 Linux 和 Mac请不要使用 `sh` 代替安装文档中的 `bash` - 提示: 对于 Linux 和 Mac请不要使用 `sh` 代替安装文档中的 `bash`
- 提示: 我们建议在安装 `paddlepaddle` 的时候使用百度源 https://mirror.baidu.com/pypi/simple ,而在安装 `paddlespeech` 的时候使用清华源 https://pypi.tuna.tsinghua.edu.cn/simple 。
## 简单: 获取基本功能(支持 LinuxMac 和 Windows) ## 简单: 获取基本功能(支持 LinuxMac 和 Windows)
- 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目。我们建议你 体验一下[AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在AI Studio上面建立了一个让你一步一步运行体验来使用`PaddleSpeech`的教程。 - 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目。我们建议你体验一下 [AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在 AI Studio上面建立了一个让你一步一步运行体验来使用 `PaddleSpeech` [教程](https://aistudio.baidu.com/aistudio/education/group/info/25130)
- 如果你想使用 `PaddleSpeech` 的命令行功能,你需要跟随下面的步骤来安装 `PaddleSpeech`。如果你想了解更多关于使用 `PaddleSpeech` 命令行功能的信息,你可以参考 [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli)。 - 如果你想使用 `PaddleSpeech` 的命令行功能,你需要跟随下面的步骤来安装 `PaddleSpeech`。如果你想了解更多关于使用 `PaddleSpeech` 命令行功能的信息,你可以参考 [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli)。
### 安装Conda ### 安装 Conda
Conda是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda请下载 py>=3.7 的版本)。 Conda是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda请下载 py>=3.7 的版本)。
然后你需要安装 `paddlespeech` 的 conda 依赖: 然后你需要安装 `paddlespeech` 的 conda 依赖:
```bash ```bash
@ -24,6 +26,11 @@ conda install -y -c conda-forge sox libsndfile bzip2
(如果你系统上已经安装了 C++ 编译环境,请忽略这一步。) (如果你系统上已经安装了 C++ 编译环境,请忽略这一步。)
#### Windows #### Windows
对于 Windows 系统,需要安装 `Visual Studio` 来完成 C++ 编译环境的安装。 对于 Windows 系统,需要安装 `Visual Studio` 来完成 C++ 编译环境的安装。
https://visualstudio.microsoft.com/visual-cpp-build-tools/
你可以前往讨论区[#1195](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1195)获取更多帮助。
#### Mac #### Mac
```bash ```bash
brew install gcc brew install gcc
@ -42,19 +49,27 @@ sudo apt install build-essential
conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
``` ```
### 安装 PaddleSpeech ### 安装 PaddleSpeech
你可以使用如下命令: 部分用户系统由于默认源的问题安装中会出现 kaldiio 安装出错的问题建议首先安装 pytest-runner:
```bash
pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
```
然后你可以使用如下命令:
```bash ```bash
pip install paddlepaddle paddlespeech pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple
pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
> 如果您在使用 paddlespeech 的过程中遇到关于下载 **nltk_data** 的问题,可能是您的网络不佳,我们建议您下载我们提供的 [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) 并解压缩到您的 `${HOME}` 目录下。
> 如果出现 paddlespeech-ctcdecoders 无法安装的问题,无须担心,这不影响使用。
## 中等: 获取主要功能(支持 Linux ## 中等: 获取主要功能(支持 Linux
如果你想要使用` paddlespeech` 的主要功能。你需要完成以下几个步骤 如果你想要使用 `paddlespeech` 的主要功能。你需要完成以下几个步骤
### Git clone PaddleSpeech ### Git clone PaddleSpeech
你需要先git clone本仓库 你需要先 git clone 本仓库
```bash ```bash
git clone https://github.com/PaddlePaddle/PaddleSpeech.git git clone https://github.com/PaddlePaddle/PaddleSpeech.git
cd PaddleSpeech cd PaddleSpeech
``` ```
### 安装 Conda ### 安装 Conda
Conda 是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda请下载 py>=3.7 的版本)。你可以尝试自己安装,或者使用以下的命令: Conda 是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda请下载 py>=3.7 的版本)。你可以尝试自己安装,或者使用以下的命令:
```bash ```bash
@ -98,12 +113,15 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
### 安装 PaddlePaddle ### 安装 PaddlePaddle
你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2 CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0 你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2 CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0
```bash ```bash
python3 -m pip install paddlepaddle-gpu==2.2.0 python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple
``` ```
### 安装 PaddleSpeech ### 安装 PaddleSpeech
最后安装 `paddlespeech`,这样你就可以使用 `paddlespeech`中已有的 examples 最后安装 `paddlespeech`,这样你就可以使用 `paddlespeech`中已有的 examples
```bash ```bash
pip install . # 部分用户系统由于默认源的问题安装中会出现 kaldiio 安装出错的问题建议首先安装 pytest-runner:
pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
# 请确保目前处于PaddleSpeech项目的根目录
pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
## 困难: 获取所有功能(支持 Ubuntu ## 困难: 获取所有功能(支持 Ubuntu
### 先决条件 ### 先决条件
@ -164,11 +182,16 @@ conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc
### 安装 PaddlePaddle ### 安装 PaddlePaddle
请确认你系统是否有 GPU并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0 请确认你系统是否有 GPU并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0
```bash ```bash
python3 -m pip install paddlepaddle-gpu==2.2.0 python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple
``` ```
### 用开发者模式安装 PaddleSpeech ### 用开发者模式安装 PaddleSpeech
部分用户系统由于默认源的问题安装中会出现 kaldiio 安装出错的问题建议首先安装 pytest-runner:
```bash
pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
```
然后安装 PaddleSpeech
```bash ```bash
pip install -e .[develop] pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple
``` ```
### 安装 Kaldi可选 ### 安装 Kaldi可选
```bash ```bash

@ -4,19 +4,18 @@
### Speech Recognition Model ### Speech Recognition Model
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link
:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----:
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0538 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) [Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1)
[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/conformer.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1) [Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1)
[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1) [Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2)
[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2)
### Language Model based on NGram ### Language Model based on NGram
Language Model | Training Data | Token-based | Size | Descriptions Language Model | Training Data | Token-based | Size | Descriptions
:-------------:| :------------:| :-----: | -----: | :----------------- :------------:| :------------:|:------------: | :------------: | :------------:
[English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8' [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1; <br/> About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
@ -24,14 +23,13 @@ Language Model | Training Data | Token-based | Size | Descriptions
### Speech Translation Models ### Speech Translation Models
| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | | Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link |
| ------------------------------------------------------------ | ------------- | ----------- | ---- | ------------------------------------------------------------ | ----- | ------------------------------------------------------------ | | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | Ted-En-Zh | Spm | | Encoder:Transformer, Decoder:Transformer, <br />Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | | (only for CLI)[Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz) | Ted-En-Zh| Spm| | Encoder:Transformer, Decoder:Transformer, <br />Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) |
## Text-to-Speech Models ## Text-to-Speech Models
### Acoustic Models ### Acoustic Models
Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----: :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)|||
TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
@ -43,14 +41,16 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa
FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)|||
### Vocoders ### Vocoders
Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (static)
:-------------:| :------------:| :-----: | :-----:| :-----:| :-----: :-----:| :-----:| :-----: | :-----:| :-----:| :-----:
WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)|||
Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB|
Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|||
Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)|||
Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|||
|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) <br>[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
### Voice Cloning ### Voice Cloning
Model Type | Dataset| Example Link | Pretrained Models Model Type | Dataset| Example Link | Pretrained Models
@ -64,14 +64,18 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/
Model Type | Dataset| Example Link | Pretrained Models Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----: :-------------:| :------------:| :-----: | :-----:
PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams),[panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams),[panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams)
PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[panns_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz), [panns_cnn10](https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz), [panns_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz) PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz)
## Punctuation Restoration Models
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
Ernie Linear | IWLST2012_zh |[iwslt2012_punc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/iwslt2012/punc0)|[ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip)
## Speech Recognition Model from paddle 1.8 ## Speech Recognition Model from paddle 1.8
| Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | | Acoustic Model |Training Data| Token-based | Size | Descriptions | CER | WER | Hours of speech |
| :----------------------------------------------------------: | :----------------------------: | :---------: | -----: | :------------------------------------------------- | :----- | :----- | :-------------- | | :-----:| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | - | 151 h | | [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | — | 151 h |
| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | - | 0.0685 | 960 h | | [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | — | 0.0685 | 960 h |
| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers | - | 0.0541 | 8628 h | | [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers |— | 0.0541 | 8628 h|

@ -0,0 +1,42 @@
# TTS Papers
## Text Frontend
### Polyphone
- [【g2pM】g2pM: A Neural Grapheme-to-Phoneme Conversion Package for Mandarin Chinese Based on a New Open Benchmark Dataset](https://arxiv.org/abs/2004.03136)
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)
### G2P
#### English
- [cmusphinx/g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq)
## Acoustic Models
- [【AdaSpeech3】AdaSpeech 3: Adaptive Text to Speech for Spontaneous Style](https://arxiv.org/abs/2107.02530)
- [【AdaSpeech2】AdaSpeech 2: Adaptive Text to Speech with Untranscribed Data](https://arxiv.org/abs/2104.09715)
- [【AdaSpeech】AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/abs/2103.00993)
- [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
- [【FastPitch】FastPitch: Parallel Text-to-speech with Pitch Prediction](https://arxiv.org/abs/2006.06873)
- [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
- [【FastSpeech】FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263)
- [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
- [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
## Vocoders
- [【RefineGAN】RefineGAN: Universally Generating Waveform Better than Ground Truth with Highly Accurate Pitch and Intensity Responses](https://arxiv.org/abs/2111.00962)
- [【Fre-GAN】Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297)
- [【StyleMelGAN】StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization](https://arxiv.org/abs/2011.01557)
- [【Multi-band MelGAN】Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106)
- [【HiFi-GAN】HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646)
- [【VocGAN】VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network](https://arxiv.org/abs/2007.15256)
- [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
- [【MelGAN】MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis](https://arxiv.org/abs/1910.06711)
- [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- [【LPCNet】LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://arxiv.org/abs/1810.11846)
- [【WaveRNN】Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435)
## GAN TTS
- [【GAN TTS】High Fidelity Speech Synthesis with Adversarial Networks](https://arxiv.org/abs/1909.11646)
## Voice Cloning
- [【SV2TTS】Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/abs/1806.04558)
- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)

@ -0,0 +1,520 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ff6ff1e0",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "33af5f76",
"metadata": {},
"outputs": [],
"source": [
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "9b566b73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'warp-ctc'...\n",
"remote: Enumerating objects: 829, done.\u001b[K\n",
"remote: Total 829 (delta 0), reused 0 (delta 0), pack-reused 829\u001b[K\n",
"Receiving objects: 100% (829/829), 388.85 KiB | 140.00 KiB/s, done.\n",
"Resolving deltas: 100% (419/419), done.\n",
"Checking connectivity... done.\n"
]
}
],
"source": [
"!git clone https://github.com/SeanNaren/warp-ctc.git"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4a087a09",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n"
]
}
],
"source": [
"%cd warp-ctc"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f55dc29a",
"metadata": {},
"outputs": [],
"source": [
"mkdir -p build"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "fe79f4cf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n"
]
}
],
"source": [
"cd build"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3d25c718",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- The C compiler identification is GNU 5.4.0\n",
"-- The CXX compiler identification is GNU 5.4.0\n",
"-- Check for working C compiler: /usr/bin/cc\n",
"-- Check for working C compiler: /usr/bin/cc -- works\n",
"-- Detecting C compiler ABI info\n",
"-- Detecting C compiler ABI info - done\n",
"-- Detecting C compile features\n",
"-- Detecting C compile features - done\n",
"-- Check for working CXX compiler: /usr/bin/c++\n",
"-- Check for working CXX compiler: /usr/bin/c++ -- works\n",
"-- Detecting CXX compiler ABI info\n",
"-- Detecting CXX compiler ABI info - done\n",
"-- Detecting CXX compile features\n",
"-- Detecting CXX compile features - done\n",
"-- Looking for pthread.h\n",
"-- Looking for pthread.h - found\n",
"-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n",
"-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed\n",
"-- Looking for pthread_create in pthreads\n",
"-- Looking for pthread_create in pthreads - not found\n",
"-- Looking for pthread_create in pthread\n",
"-- Looking for pthread_create in pthread - found\n",
"-- Found Threads: TRUE \n",
"-- Found CUDA: /usr/local/cuda (found suitable version \"10.2\", minimum required is \"6.5\") \n",
"-- cuda found TRUE\n",
"-- Building shared library with GPU support\n",
"-- Configuring done\n",
"-- Generating done\n",
"-- Build files have been written to: /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n"
]
}
],
"source": [
"!cmake .."
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "7a4238f1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[ 11%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_reduce.cu.o\u001b[0m\n",
"[ 22%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_ctc_entrypoint.cu.o\u001b[0m\n",
"\u001b[35m\u001b[1mScanning dependencies of target warpctc\u001b[0m\n",
"[ 33%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n",
"[ 33%] Built target warpctc\n",
"[ 44%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/test_gpu.dir/tests/test_gpu_generated_test_gpu.cu.o\u001b[0m\n",
"\u001b[35m\u001b[1mScanning dependencies of target test_cpu\u001b[0m\n",
"[ 55%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/test_cpu.cpp.o\u001b[0m\n",
"[ 66%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/random.cpp.o\u001b[0m\n",
"[ 77%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n",
"[ 77%] Built target test_cpu\n",
"\u001b[35m\u001b[1mScanning dependencies of target test_gpu\u001b[0m\n",
"[ 88%] \u001b[32mBuilding CXX object CMakeFiles/test_gpu.dir/tests/random.cpp.o\u001b[0m\n",
"[100%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n",
"[100%] Built target test_gpu\n"
]
}
],
"source": [
"!make -j"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "31761a31",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n"
]
}
],
"source": [
"cd .."
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f53316f6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding\n"
]
}
],
"source": [
"cd pytorch_binding"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "084f1e49",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"running install\n",
"running bdist_egg\n",
"running egg_info\n",
"creating warpctc_pytorch.egg-info\n",
"writing warpctc_pytorch.egg-info/PKG-INFO\n",
"writing dependency_links to warpctc_pytorch.egg-info/dependency_links.txt\n",
"writing top-level names to warpctc_pytorch.egg-info/top_level.txt\n",
"writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n",
"writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n",
"installing library code to build/bdist.linux-x86_64/egg\n",
"running install_lib\n",
"running build_py\n",
"creating build\n",
"creating build/lib.linux-x86_64-3.9\n",
"creating build/lib.linux-x86_64-3.9/warpctc_pytorch\n",
"copying warpctc_pytorch/__init__.py -> build/lib.linux-x86_64-3.9/warpctc_pytorch\n",
"running build_ext\n",
"building 'warpctc_pytorch._warp_ctc' extension\n",
"creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9\n",
"creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src\n",
"Emitting ninja build file /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/build.ninja...\n",
"Compiling objects...\n",
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
"[1/1] c++ -MMD -MF /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o.d -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -I/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/TH -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/include/python3.9 -c -c /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/src/binding.cpp -o /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -std=c++14 -fPIC -DWARPCTC_ENABLE_GPU -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE=\"_gcc\"' '-DPYBIND11_STDLIB=\"_libstdcpp\"' '-DPYBIND11_BUILD_ABI=\"_cxxabi1011\"' -DTORCH_EXTENSION_NAME=_warp_ctc -D_GLIBCXX_USE_CXX11_ABI=0\n",
"g++ -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -shared -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -L/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/lib -L/usr/local/cuda/lib64 -lwarpctc -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n",
"creating build/bdist.linux-x86_64\n",
"creating build/bdist.linux-x86_64/egg\n",
"creating build/bdist.linux-x86_64/egg/warpctc_pytorch\n",
"copying build/lib.linux-x86_64-3.9/warpctc_pytorch/__init__.py -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n",
"copying build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n",
"byte-compiling build/bdist.linux-x86_64/egg/warpctc_pytorch/__init__.py to __init__.cpython-39.pyc\n",
"creating stub loader for warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so\n",
"byte-compiling build/bdist.linux-x86_64/egg/warpctc_pytorch/_warp_ctc.py to _warp_ctc.cpython-39.pyc\n",
"creating build/bdist.linux-x86_64/egg/EGG-INFO\n",
"copying warpctc_pytorch.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n",
"copying warpctc_pytorch.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n",
"copying warpctc_pytorch.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n",
"copying warpctc_pytorch.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n",
"writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n",
"zip_safe flag not set; analyzing archive contents...\n",
"warpctc_pytorch.__pycache__._warp_ctc.cpython-39: module references __file__\n",
"creating dist\n",
"creating 'dist/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n",
"removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n",
"Processing warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n",
"removing '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' (and everything under it)\n",
"creating /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n",
"Extracting warpctc_pytorch-0.1-py3.9-linux-x86_64.egg to /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages\n",
"warpctc-pytorch 0.1 is already the active version in easy-install.pth\n",
"\n",
"Installed /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n",
"Processing dependencies for warpctc-pytorch==0.1\n",
"Finished processing dependencies for warpctc-pytorch==0.1\n"
]
}
],
"source": [
"!python setup.py install"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ee4ca9e3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python 3.9.5\r\n"
]
}
],
"source": [
"!python -V"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "59255ed8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n"
]
}
],
"source": [
"cd .."
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1dae09b9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n"
]
}
],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import warpctc_pytorch as wp\n",
"import paddle.nn as pn\n",
"import paddle"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "83d0762e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'1.10.0+cu102'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.__version__"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "62501e2c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.2.0'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"paddle.__version__"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9e8e0f40",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([2, 1, 5])\n",
"2.4628584384918213\n",
"[[[ 0.17703122 -0.70812464 0.17703122 0.17703122 0.17703122]]\n",
"\n",
" [[ 0.17703122 0.17703122 -0.70812464 0.17703122 0.17703122]]]\n"
]
}
],
"source": [
"probs = torch.FloatTensor([[\n",
" [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n",
" ]]).transpose(0, 1).contiguous()\n",
"print(probs.size())\n",
"labels = torch.IntTensor([1, 2])\n",
"label_sizes = torch.IntTensor([2])\n",
"probs_sizes = torch.IntTensor([2])\n",
"probs.requires_grad_(True)\n",
"bs = probs.size(1)\n",
"\n",
"ctc_loss = wp.CTCLoss(size_average=False, length_average=False)\n",
"cost = ctc_loss(probs, labels, probs_sizes, label_sizes)\n",
"cost = cost.sum() / bs\n",
"print(cost.item())\n",
"cost.backward()\n",
"print(probs.grad.numpy())"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2cd46569",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.4628584384918213\n",
"[[[ 0.1770312 -0.7081248 0.1770312 0.1770312 0.1770312]]\n",
"\n",
" [[ 0.1770312 0.1770312 -0.7081248 0.1770312 0.1770312]]]\n"
]
}
],
"source": [
"probs = torch.FloatTensor([[\n",
" [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n",
" ]]).transpose(0, 1).contiguous()\n",
"labels = torch.IntTensor([1, 2])\n",
"label_sizes = torch.IntTensor([2])\n",
"probs_sizes = torch.IntTensor([2])\n",
"probs.requires_grad_(True)\n",
"bs = probs.size(1)\n",
"\n",
"log_probs = torch.log_softmax(probs, axis=-1)\n",
"\n",
"ctc_loss1 = nn.CTCLoss(reduction='none')\n",
"cost = ctc_loss1(log_probs, labels, probs_sizes, label_sizes)\n",
"cost = cost.sum() / bs\n",
"print(cost.item())\n",
"cost.backward()\n",
"print(probs.grad.numpy())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "85c3461a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2, 1, 5]\n",
"[1, 2]\n",
"2.4628584384918213\n",
"[[[ 0.17703122 -0.70812464 0.17703122 0.17703122 0.17703122]]\n",
"\n",
" [[ 0.17703122 0.17703122 -0.70812464 0.17703122 0.17703122]]]\n"
]
}
],
"source": [
"paddle.set_device('cpu')\n",
"probs = paddle.to_tensor([[\n",
" [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1],\n",
" ]]).transpose([1,0,2])\n",
"print(probs.shape) # (T, B, D)\n",
"labels = paddle.to_tensor([[1, 2]], dtype='int32') #BL)\n",
"print(labels.shape)\n",
"label_sizes = paddle.to_tensor([2], dtype='int64')\n",
"probs_sizes = paddle.to_tensor([2], dtype='int64')\n",
"bs = paddle.shape(probs)[1]\n",
"probs.stop_gradient=False\n",
"\n",
"ctc_loss = pn.CTCLoss(reduction='none')\n",
"cost = ctc_loss(probs, labels, probs_sizes, label_sizes)\n",
"cost = cost.sum() / bs\n",
"print(cost.item())\n",
"cost.backward()\n",
"print(probs.grad.numpy())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d390cd91",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -0,0 +1,174 @@
# g2p 字典设计
<!--
modified from https://zhuanlan.zhihu.com/p/349600439
-->
本文主要讲语音合成的 g2p (grapheme to phoneme) 部分。
代码: [generate_lexicon.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/generate_lexicon.py) (代码可能与此处的描述有些许出入,以代码为准;生成的带 tone 带儿化的 pinyin 字典参考 [simple.lexicon](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/local/simple.lexicon)
## ARPAbet
对于英文 TTS常用的 g2p 是通过查询 CMUDict 来实现,而 CMUDict 注音使用的系统是 ARPAbet具体含义参见 [CMU 发音词典](http://www.speech.cs.cmu.edu/cgi-bin/cmudict/)。
它包含 39 个 phoneme不包含因词汇重音产生的变体:
| Phoneme | Example | Translation |
|:-------------:|:-------:|:-----------:|
| AA | odd | AA D |
| AE | at | AE T |
| AH | hut | HH AH T |
| AO | ought | AO T |
| AW | cow | K AW |
| AY | hide | HH AY D |
| B | be | B IY |
| CH | cheese | CH IY Z |
| D | dee | D IY |
| DH | thee | DH IY |
| EH | Ed | EH D |
| ER | hurt | HH ER T |
| EY | ate | EY T |
| F | fee | F IY |
| G | green | G R IY N |
| HH | he | HH IY |
| IH | it | IH T |
| IY | eat | IY T |
| JH | gee | JH IY |
| K | key | K IY |
| L | lee | L IY |
| M | me | M IY |
| N | knee | N IY |
| NG | ping | P IH NG |
| OW | oat | OW T |
| OY | toy | T OY |
| P | pee | P IY |
| R | read | R IY D |
| S | sea | S IY |
| SH | she | SH IY |
| T | tea | T IY |
| TH | theta | TH EY T AH|
| UH | hood | HH UH D |
| UW | two | T UW |
| V | vee | V IY |
| W | we | W IY |
| Y | yield | Y IY L D |
| Z | zee | Z IY |
| ZH | seizure| S IY ZH ER|
另外还包含三个重音标记,
0 — No stress
1 — Primary stress
2 — Secondary stress
其中重音标记附在元音后面。当只需要音标而不需要重音标记的时候也可以直接省略。
CMUDict 只是一个词典当出现了不在词典中的词时OOV可以求助其他工具可以根据拼写得到对应的发音如:
- [Lexicon Tool](http://www.speech.cs.cmu.edu/tools)
- [g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq)
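As a small illustration of the dictionary lookup described above, the sketch below uses the copy of CMUDict bundled with NLTK; using `nltk` here is an extra assumption for demonstration, not something the original text prescribes:
```python
import nltk
from nltk.corpus import cmudict

nltk.download('cmudict')          # fetch the CMU pronouncing dictionary once
pron = cmudict.dict()

print(pron['speech'])             # [['S', 'P', 'IY1', 'CH']]

# Strip the stress digits when only the bare ARPAbet phonemes are needed.
print([p.rstrip('012') for p in pron['speech'][0]])   # ['S', 'P', 'IY', 'CH']
```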
## 中文注音系统
中文普通话的注音系统存在许多套,比如汉语拼音 (pinyin) 注音符号 (bopomofo) 国语注音符第二式, 威妥玛拼音等。而且有一些并非注音方案,是拉丁化方案,因此为了符号系统的经济性,会做一些互补符号的简并,比如汉语拼音中的 `i` 的代表了三个音位, `e` 代表了两个音位(单用的情况很少, 单用时写作 `ê`);也有一些简写,比如 `bpmf` 后的 `o``uo` 的简写, `ui``uei` 的简写,` iu` 是 `iou` 的简写, `un``uen` 的简写, `ao` 是为了书写避免形近而改掉的 `au` `y``w` 是为了连续书写时作为分隔而产生的零声母, `ü``j``q``x` 后面省略两点(中国大陆使用美式键盘打字的时候,一般只有在“女”、 “律”、“略”和“虐”这一类的字里面用 `v` 代替 `ü`,而在 `j``q``x` 后面的时候则仍用 `u` ),有鼻韵母 `uang` 而没有 `ueng`,但是又有 `weng` 这个音节之类的问题, 有 `ong` 韵母但是又没有单用的情形。其实这些都是汉语拼音作为拉丁化方案而做的一系列的修改。
另外,汉语的声调是用了特殊符号来标调型,用字母记录的时候常用 `12345` 或者 `1234`、轻音不标等手段。
另外还有两个比较突出的问题是**儿化**和**变调**(参考 [zh_text_frontend](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/zh_text_frontend.md))。对于具体的数据集,也可能有不同的标注方案。一般我们为汉字标音是标字调而不标变调,但是**标贝数据集是标变调的**(但是也没有把所有的变调都正确标出来)。儿化在汉语书写和拼音中也是一个比较麻烦的事情,虽然正字法中说到可以用小字号的儿表示儿化,但是这种发音由字号这种排版要素来表达的手法未免过于崎岖,所以鲜见有人真的这么排版,只有在某些书籍中,强调此事的时候见过。另外,在儿化的标音方式上,鼻韵母需要去掉韵尾然后换成 r这么一来如果直接抽取拼音的字符串表示那么可能出现的音节就会超过 1400 甚至进入一种含糊的状态,不清楚一共有多少个有效音节,即使是韵母,也会因此扩展近一倍。
因为存在这样的情形,再考虑到不同的数据集自带的拼音 transcription 的风格可能不同,所以需要考虑进行转换,在内部转成统一的表示。既然这个过程是必要的,那么我们可以大胆设计一个内部方案。
这里设计的原则是:
1. 有效符号集仅切分为声母和韵母,不作声母,介音,韵腹,韵尾的切分;
2. 尽可能把不同的音用不同的符号表示,比如 `i``e` 会被拆分为 3 和 2 个符号, `u``ü` 开头的韵母分开,这是为了 TTS 系统的建议性考虑的,我们选择尽量反映语音的现实情况,而不把注音系统里面的奇怪规则留给模型去学习;
3. 不包含零声母 `y` `w`之类的形式上的符号,因为如果这些符号不发声或者发声极短,那么可以不加入音符序列中,以期待 attention 更对角;
4. 声调和韵母不结合为一个符号,而是分开,这样可以**减少词汇量**,使得符号的 embedding 得到更充分的训练,也更能反映声调语言的特点(数据集少时推荐这么做);
5. 儿化的标音方式采用拆分的方式处理, 但是增设一个特殊符号 `&r` 来表示儿化的 `r`,它和一般的 `er` 不同,以区分实际读音的区别。
6. 更加贴近注音符号,把 `in` 写作 `ien``ing` 写作 `ieng` `un` 写作 `uen` `ong` 写作 `ueng` `iong` 写作 `üeng`。其中 `in``ing` 的转写纯属偏好,无论用什么符号写,都可以被转为一个 index 只要它们的使用情况不发声变化就可以。而 `ong` 写作 `ueng` 则是有实际差别的,如果 `ong` 作为一个韵母,那么 `weng` 经过修改之后会变成 `ueng` 就会同时有 `ueng``ong`。而如果不细究音值上的微妙差异,`ong` 就是 `ung` 的一种奇怪表示, 在注意符号中, 它就记作 `ㄨㄥ`。而 `iong` 则是 `ㄩㄥ`
7. `ui` `iu` 都展开为 `uei``iou` 纯属偏好,对实际结果没有影响。`bpmf `后的 `o` 展开为 `uo`,这个则是为了和单独的 `o` 区分开(哦, 和波里面的韵母的发音其实不同)。
8. 所有的 `ü` 都用 `v` 代替,无论是单独作韵母,还是复韵母和鼻韵母。
9. 把停顿以 `#1` 等方式纳入其中, 把 `<pad>` `<unk>` `<s>` `</s>` 这些为了处理符号系列的特殊符号也加入其中,多一些特殊词汇并不会对 Embedding 产生什么影响。
于是我们可以通过一套规则系统,把标贝的**拼音标注**转换成我们需要的形式。(当然,如果是别的数据集的实际标注不同,那么转换规则也要作一些修改)
在实际使用中文数据集时,我们仅使用其提供的**拼音标注**,而不使用**音素标注**PhoneLabel因为不同的数据集有不同的标注规则而且有的数据集是没有**音素标注**的aishell3
我们的做法和维基百科上的汉语拼音音节列表更接近 [汉语拼音音节列表](https://zh.wikipedia.org/zh-hans/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E9%9F%B3%E8%8A%82%E5%88%97%E8%A1%A8)
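A toy sketch of a few of the rewrites described above (splitting off the tone digit, expanding the abbreviated finals, and writing `ong` as `ueng`); it is illustrative only, and the complete rule set lives in `generate_lexicon.py`:
```python
import re

INITIALS = ['zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l',
            'g', 'k', 'h', 'j', 'q', 'x', 'r', 'z', 'c', 's']

# A few of the rewrites described above -- far from the complete rule set.
FINAL_REWRITES = {'ui': 'uei', 'iu': 'iou', 'un': 'uen',
                  'in': 'ien', 'ing': 'ieng', 'ong': 'ueng'}

def convert(syllable):
    """Split a numbered pinyin syllable, e.g. 'xiu4' -> ['x', 'iou', '4']."""
    match = re.match(r'([a-zv]+)([1-5])', syllable)
    phones, tone = match.group(1), match.group(2)   # tone kept as its own token
    initial = next((i for i in INITIALS if phones.startswith(i)), '')
    final = FINAL_REWRITES.get(phones[len(initial):], phones[len(initial):])
    return [p for p in (initial, final) if p] + [tone]

print(convert('xiu4'))    # ['x', 'iou', '4']
print(convert('zhong1'))  # ['zh', 'ueng', '1']
print(convert('lv3'))     # ['l', 'v', '3']
```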
转换之后,符号列表是:
声母基本没有什么争议,共 21 个:
|声母|
|:--:|
|b|
|p|
|m|
|f|
|d|
|t|
|n|
|l|
|g|
|k|
|h|
|j|
|q|
|x|
|zh|
|ch|
|sh|
|r|
|z|
|c|
|s|
韵母和儿化韵尾(共 41 个):
|韵母|解释|
|:----:|:-----------: |
|ii |`zi``ci` `si` 里面的韵母 `i`|
|iii |`zhi` `chi` `shi` `ri` 里面的韵母 `i`|
|a |啊,卡|
|o |哦|
|e |恶,个|
|ea |ê|
|ai |爱,在|
|ei |诶,薇|
|ao |奥,脑|
|ou |欧,勾|
|an |安,单|
|en |恩,痕|
|ang |盎,刚|
|eng |嗯,更|
|er |儿|
|i |一|
|ia |鸦,家|
|io |哟|
|ie |叶,界|
|iai |崖(台语发音)|
|iao |要,教|
|iou |有,久|
|ian |言,眠|
|ien |因,新|
|iang |样,降|
|ieng |英,晶|
|u |无,卢|
|ua |哇,瓜|
|uo |我,波|
|uai |外,怪|
|uei |位,贵|
|uan |万,乱|
|uen |问,论|
|uang |网,光|
|ueng |翁,共|
|v |玉,曲,`ü`|
|ve |月,却|
|van |源,倦|
|ven |韵,君|
|veng |永,炯|
|&r |erhua coda|

@ -21,7 +21,11 @@
"|FB-RAWs|Filter Bank Random Window Discriminators|\n", "|FB-RAWs|Filter Bank Random Window Discriminators|\n",
"\n", "\n",
"<br></br>\n", "<br></br>\n",
"csmsc 数据集上 GAN Vocoder 整体对比\n", "csmsc 数据集上 GAN Vocoder 整体对比如下, \n ",
"\n",
"测试机器1 x Tesla V100-32G 40 core Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz\n ",
"\n",
"测试环境Python 3.7.0, paddlepaddle 2.2.0\n",
"\n", "\n",
"Model|Date|Input|Generator<br>Loss|Discriminator<br>Loss|Need<br>Finetune|Training<br>Steps|Finetune<br>Steps|Batch<br>Size|ips<br>(gen only)<br>(gen + dis)|Static Model<br>Size (gen)|RTF<br>(GPU)|\n", "Model|Date|Input|Generator<br>Loss|Discriminator<br>Loss|Need<br>Finetune|Training<br>Steps|Finetune<br>Steps|Batch<br>Size|ips<br>(gen only)<br>(gen + dis)|Static Model<br>Size (gen)|RTF<br>(GPU)|\n",
":-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|\n", ":-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|\n",

@ -0,0 +1,173 @@
# Simplifying Installation and Releasing Packages
## Questions
1. [How can the apt-installed system dependencies on Ubuntu be removed?](#replacing-system-dependencies-with-conda)
2. [How can both regular users and developers be supported, while keeping the dependencies required by regular users to a minimum?](#separating-install-mode-and-develop-mode)
3. [How can Python packages be installed dynamically?](#dynamic-installation-of-python-packages)
4. [How is a Python project packaged?](#building-python-packages)
5. [What has to be prepared before a release?](#preparations-before-a-release)
6. [What should be kept in mind when releasing packages that contain C++ code?](#manylinux)
## Replacing system dependencies with conda
conda can stand in for some of the system dependencies normally installed with apt-get, which lets the project run on systems other than Ubuntu.
With conda you can install the dependencies paddlespeech needs, such as sox, libsndfile and swig:
```bash
conda install -y -c conda-forge sox libsndfile
```
Some systems lack the libbzip2 library, which paddlespeech also needs; it can likewise be installed with conda:
```bash
conda install -y -c conda-forge bzip2
```
conda can also install the C++ toolchain needed on Linux:
```bash
conda install -y -c conda-forge gcc_linux-64=8.4.0 gxx_linux-64=8.4.0
```
#### Open issue: building kenlm in a conda environment fails. Compiling kenlm inside a conda environment currently runs into linking errors.
The dependencies known to be required so far:
```bash
conda install -c conda-forge eigen boost cmake
```
## Separating install mode and develop mode
In setup.py, the dependencies can be split into install dependencies (the base requirements) and develop dependencies (extra requirements for developers): `install_requires` in the setup info holds the install dependencies, while the `develop` key of `extras_require` holds the develop dependencies (a sketch follows the commands below).
A regular installation is simply:
```bash
pip install .
```
Installing a released package with pip is also a regular installation:
```bash
pip install paddlespeech
```
Developers can instead install as follows, which pulls in both the install and the develop dependencies, i.e. final dependencies = install dependencies + develop dependencies:
```bash
pip install -e .[develop]
```
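A minimal setup.py sketch of this layout might look as follows; the package name and dependency lists are illustrative, not the actual PaddleSpeech setup.py:

```python
from setuptools import setup, find_packages

# base dependencies needed by every user
install_requires = ["numpy", "pyyaml"]

# extra dependencies only developers need (tests, linting, packaging, ...)
develop_requires = ["pytest", "pre-commit", "twine"]

setup(
    name="mypackage",                      # illustrative name
    version="0.1.0",
    packages=find_packages(),
    install_requires=install_requires,     # installed by `pip install .`
    extras_require={
        "develop": develop_requires,       # installed by `pip install -e .[develop]`
    },
)
```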
## Dynamic installation of Python packages
The pip package itself can be used to install another package at runtime:
```python
import pip

package_name = "soundfile"  # example: the package to install at runtime

if int(pip.__version__.split('.')[0]) > 9:
    from pip._internal import main
else:
    from pip import main

main(['install', package_name])
```
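pip's internal `main` is not an officially supported API, so as an alternative sketch one can shell out to the current interpreter's pip instead (the package name here is just an example):

```python
import subprocess
import sys

def pip_install(package_name):
    """Install a package into the current interpreter's environment via pip."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

pip_install("soundfile")  # illustrative package name
```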
## Building Python packages
#### Create a PyPI account
Register an account on PyPI.
#### Install twine
```bash
pip install twine
```
#### Build the package
Write the package's setup.py, then build a wheel with:
```bash
python setup.py bdist_wheel
```
To build a source distribution instead, use:
```bash
python setup.py sdist
```
#### Upload the package
```bash
twine upload dist/<wheel file>
```
After entering the username and password, the wheel is uploaded.
#### Release metadata for Python packages
See mainly this [document](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/?highlight=find_packages).
## Preparations before a release
#### Create the release branch
A release branch must be created before publishing. For example, to publish the official 0.1.0 release, create an r0.1 branch and put the 0.1.0 tag on that branch. Before branching, you can optionally publish an rc (release candidate) package such as 0.1.0rc0, and only create the branch after the rc package has passed testing. For a 0.1.1 release, merge into the r0.1 branch, tag it, and publish. The overall steps (a command sketch follows the list):
- publish an rc package from the develop branch
- once the rc package passes, create the release branch
- add the tag
- publish the package
- write the release notes
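With these conventions, the commands might look roughly as follows (branch name and version numbers are the examples used above):

```bash
# publish a release candidate from develop first, e.g. 0.1.0rc0
git checkout develop

# once the rc passes, cut the release branch and tag the release
git checkout -b r0.1
git tag 0.1.0
git push origin r0.1 --tags

# then build and upload the package from the tagged commit
python setup.py bdist_wheel && twine upload dist/*
```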
## ManyLinux
To make a pip wheel that contains C++ code usable on more Linux systems, its glibc requirement has to be lowered, which means building the wheel inside a manylinux Docker image. To check a system's glibc version, run `ldd --version`.
### About manylinux
For details, see the documentation of the GitHub project [pypa/manylinux](https://github.com/pypa/manylinux).
manylinux1 targets CentOS 5 and later, manylinux2010 targets CentOS 6 and later, and manylinux2014 targets CentOS 7 and later.
manylinux2010 currently covers essentially every Linux production environment. manylinux1 is not recommended: the base system is old and building on it is difficult.
### Pull the manylinux2010 image
```bash
docker pull quay.io/pypa/manylinux2010_x86_64
```
### Use manylinux2010
Start the manylinux2010 Docker container:
```bash
docker run -it xxxxxx
```
The manylinux2010 Docker image ships with swig and a range of Python versions. Note: do not download conda yourself to set up an environment for building the pip package; use the environments that come with the Docker image.
Select a Python version:
```bash
export PATH="/opt/python/cp37-cp37m/bin/:$PATH"
#export PATH="/opt/python/cp38-cp38/bin/:$PATH"
#export PATH="/opt/python/cp39-cp39/bin/:$PATH"
```
Then build the wheel as usual. Afterwards, use [auditwheel](https://github.com/pypa/auditwheel) to lower the platform requirement of the built wheel.
Show the glibc requirement of a wheel:
```bash
auditwheel show <wheel file>
```
Repair the wheel so that it targets a lower platform:
```bash
auditwheel repair <wheel file>
```
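Putting the steps together, a build inside the manylinux container might look roughly like this; the wheel filename is illustrative, and `-w` selects the output directory:

```bash
# build the wheel with the container's Python
python setup.py bdist_wheel

# inspect the glibc/platform tag of the freshly built wheel
auditwheel show dist/mypackage-0.1.0-cp37-cp37m-linux_x86_64.whl

# rewrite it into a manylinux wheel under ./wheelhouse/
auditwheel repair dist/mypackage-0.1.0-cp37-cp37m-linux_x86_64.whl -w wheelhouse/
```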

@ -2,9 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"<a href=\"https://github.com/PaddlePaddle/PaddleSpeech\"><img style=\"position: absolute; z-index: 999; top: 0; right: 0; border: 0; width: 128px; height: 128px;\" src=\"https://nosir.github.io/cleave.js/images/right-graphite@2x.png\" alt=\"Fork me on GitHub\"></a>\n", "<a href=\"https://github.com/PaddlePaddle/PaddleSpeech\"><img style=\"position: absolute; z-index: 999; top: 0; right: 0; border: 0; width: 128px; height: 128px;\" src=\"https://nosir.github.io/cleave.js/images/right-graphite@2x.png\" alt=\"Fork me on GitHub\"></a>\n",
"\n", "\n",
@ -32,9 +30,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"%%HTML\n", "%%HTML\n",
@ -45,9 +41,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"# 2. 音频和特征提取" "# 2. 音频和特征提取"
] ]
@ -55,9 +49,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"# 环境准备安装paddlespeech和paddleaudio\n", "# 环境准备安装paddlespeech和paddleaudio\n",
@ -67,9 +59,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"import warnings\n", "import warnings\n",
@ -82,9 +72,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"\n", "\n",
"\n", "\n",
@ -98,9 +86,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"# 获取示例音频\n", "# 获取示例音频\n",
@ -111,9 +97,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from paddleaudio import load\n", "from paddleaudio import load\n",
@ -130,9 +114,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"!paddlespeech cls --input ./dog.wav" "!paddlespeech cls --input ./dog.wav"
@ -140,9 +122,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"## 2.2 音频特征提取\n", "## 2.2 音频特征提取\n",
"\n", "\n",
@ -162,21 +142,20 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"import paddle\n", "import paddle\n",
"import numpy as np\n", "import numpy as np\n",
"\n", "\n",
"data, sr = load(file='./dog.wav', sr=32000, mono=True, dtype='float32')\n",
"x = paddle.to_tensor(data)\n", "x = paddle.to_tensor(data)\n",
"n_fft = 1024\n", "n_fft = 1024\n",
"win_length = 1024\n", "win_length = 1024\n",
"hop_length = 512\n", "hop_length = 320\n",
"\n", "\n",
"# [D, T]\n", "# [D, T]\n",
"spectrogram = paddle.signal.stft(x, n_fft=1024, win_length=1024, hop_length=512, onesided=True) \n", "spectrogram = paddle.signal.stft(x, n_fft=n_fft, win_length=win_length, hop_length=hop_length, onesided=True) \n",
"print('spectrogram.shape: {}'.format(spectrogram.shape))\n", "print('spectrogram.shape: {}'.format(spectrogram.shape))\n",
"print('spectrogram.dtype: {}'.format(spectrogram.dtype))\n", "print('spectrogram.dtype: {}'.format(spectrogram.dtype))\n",
"\n", "\n",
@ -190,9 +169,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"### 2.2.2 LogFBank\n", "### 2.2.2 LogFBank\n",
"\n", "\n",
@ -220,13 +197,15 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from paddleaudio.features import LogMelSpectrogram\n", "from paddleaudio.features import LogMelSpectrogram\n",
"\n", "\n",
"f_min=50.0\n",
"f_max=14000.0\n",
"n_mels=64\n",
"\n",
"# - sr: 音频文件的采样率。\n", "# - sr: 音频文件的采样率。\n",
"# - n_fft: FFT样本点个数。\n", "# - n_fft: FFT样本点个数。\n",
"# - hop_length: 音频帧之间的间隔。\n", "# - hop_length: 音频帧之间的间隔。\n",
@ -239,7 +218,9 @@
" hop_length=hop_length, \n", " hop_length=hop_length, \n",
" win_length=win_length, \n", " win_length=win_length, \n",
" window='hann', \n", " window='hann', \n",
" n_mels=64)\n", " f_min=f_min,\n",
" f_max=f_max,\n",
" n_mels=n_mels)\n",
"\n", "\n",
"x = paddle.to_tensor(data).unsqueeze(0) # [B, L]\n", "x = paddle.to_tensor(data).unsqueeze(0) # [B, L]\n",
"log_fbank = feature_extractor2(x) # [B, D, T]\n", "log_fbank = feature_extractor2(x) # [B, D, T]\n",
@ -253,9 +234,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"## 2.3 声音分类方法\n", "## 2.3 声音分类方法\n",
"\n", "\n",
@ -272,9 +251,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"### 2.3.2 深度学习方法\n", "### 2.3.2 深度学习方法\n",
"传统机器学习方法可以捕捉声音特征的差异(例如男声和女声的声音在音高上往往差异较大)并实现分类任务。\n", "传统机器学习方法可以捕捉声音特征的差异(例如男声和女声的声音在音高上往往差异较大)并实现分类任务。\n",
@ -288,9 +265,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"### 2.3.3 Pretrain + Finetune\n", "### 2.3.3 Pretrain + Finetune\n",
"\n", "\n",
@ -315,9 +290,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"# 3. 实践:环境声音分类\n", "# 3. 实践:环境声音分类\n",
"\n", "\n",
@ -361,22 +334,18 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from paddleaudio.datasets import ESC50\n", "from paddleaudio.datasets import ESC50\n",
"\n", "\n",
"train_ds = ESC50(mode='train')\n", "train_ds = ESC50(mode='train', sample_rate=sr)\n",
"dev_ds = ESC50(mode='dev')" "dev_ds = ESC50(mode='dev', sample_rate=sr)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"### 3.1.2 特征提取\n", "### 3.1.2 特征提取\n",
"通过下列代码,用 `paddleaudio.features.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: " "通过下列代码,用 `paddleaudio.features.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: "
@ -385,19 +354,23 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"feature_extractor = LogMelSpectrogram(sr=44100, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)" "feature_extractor = LogMelSpectrogram(\n",
" sr=sr, \n",
" n_fft=n_fft, \n",
" hop_length=hop_length, \n",
" win_length=win_length, \n",
" window='hann', \n",
" f_min=f_min,\n",
" f_max=f_max,\n",
" n_mels=n_mels)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"## 3.2 模型\n", "## 3.2 模型\n",
"\n", "\n",
@ -409,9 +382,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from paddlespeech.cls.models import cnn14\n", "from paddlespeech.cls.models import cnn14\n",
@ -420,9 +391,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"### 3.2.2 构建分类模型\n", "### 3.2.2 构建分类模型\n",
"\n", "\n",
@ -432,9 +401,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"import paddle.nn as nn\n", "import paddle.nn as nn\n",
@ -461,18 +428,14 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"## 3.3 Finetune" "## 3.3 Finetune"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"1. 创建 DataLoader " "1. 创建 DataLoader "
] ]
@ -480,9 +443,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"batch_size = 16\n", "batch_size = 16\n",
@ -492,9 +453,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"2. 定义优化器和 Loss" "2. 定义优化器和 Loss"
] ]
@ -502,9 +461,7 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())\n", "optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())\n",
@ -513,19 +470,15 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"3. 启动模型训练 " "3. 启动模型训练 "
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"from paddleaudio.utils import logger\n", "from paddleaudio.utils import logger\n",
@ -603,9 +556,7 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"## 3.4 音频预测\n", "## 3.4 音频预测\n",
"\n", "\n",
@ -615,16 +566,13 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {},
"collapsed": false
},
"outputs": [], "outputs": [],
"source": [ "source": [
"top_k = 10\n", "top_k = 10\n",
"wav_file = './dog.wav'\n", "wav_file = './dog.wav'\n",
"\n", "\n",
"waveform, sr = load(wav_file)\n", "waveform, _ = load(wav_file, sr)\n",
"feature_extractor = LogMelSpectrogram(sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)\n",
"feats = feature_extractor(paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))\n", "feats = feature_extractor(paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))\n",
"feats = paddle.transpose(feats, [0, 2, 1]) # [B, N, T] -> [B, T, N]\n", "feats = paddle.transpose(feats, [0, 2, 1]) # [B, N, T] -> [B, T, N]\n",
"print(feats.shape)\n", "print(feats.shape)\n",
@ -635,16 +583,14 @@
"sorted_indices = probs[0].argsort()\n", "sorted_indices = probs[0].argsort()\n",
"\n", "\n",
"msg = f'[{wav_file}]\\n'\n", "msg = f'[{wav_file}]\\n'\n",
"for idx in sorted_indices[-top_k:]:\n", "for idx in sorted_indices[-1:-top_k-1:-1]:\n",
" msg += f'{ESC50.label_list[idx]}: {probs[0][idx]:.5f}\\n'\n", " msg += f'{ESC50.label_list[idx]}: {probs[0][idx]:.5f}\\n'\n",
"print(msg)" "print(msg)"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": { "metadata": {},
"collapsed": false
},
"source": [ "source": [
"# 4. 作业\n", "# 4. 作业\n",
"1. 使用开发模式安装 [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech) \n", "1. 使用开发模式安装 [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech) \n",
@ -653,6 +599,7 @@
"1. 在 [MusicSpeech](http://marsyas.info/downloads/datasets.html) 数据集上完成 music/speech 二分类。 \n", "1. 在 [MusicSpeech](http://marsyas.info/downloads/datasets.html) 数据集上完成 music/speech 二分类。 \n",
"2. 在 [GTZAN Genre Collection](http://marsyas.info/downloads/datasets.html) 音乐分类数据集上利用 PANNs 预训练模型实现音乐类别十分类。\n", "2. 在 [GTZAN Genre Collection](http://marsyas.info/downloads/datasets.html) 音乐分类数据集上利用 PANNs 预训练模型实现音乐类别十分类。\n",
"\n", "\n",
"关于如何自定义分类数据集,请参考文档 [PaddleSpeech/docs/source/cls/custom_dataset.md](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/cls/custom_dataset.md)\n",
"\n", "\n",
"# 5. 关注 PaddleSpeech\n", "# 5. 关注 PaddleSpeech\n",
"\n", "\n",
@ -681,9 +628,9 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "py37",
"language": "python", "language": "python",
"name": "py35-paddle1.2.0" "name": "py37"
}, },
"language_info": { "language_info": {
"codemirror_mode": { "codemirror_mode": {
@ -695,7 +642,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.4" "version": "3.7.7"
} }
}, },
"nbformat": 4, "nbformat": 4,

@ -1,8 +1,14 @@
# Aishell-1 # Aishell-1
## Deepspeech2 Streaming
| Model | Number of Params | Release | Config | Test set | Valid Loss | CER |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 45.18M | 2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 |
## Deepspeech2 Non-Streaming ## Deepspeech2 Non-Streaming
| Model | Params | Release | Config | Test set | Loss | CER | | Model | Number of Params | Release | Config | Test set | Valid Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.738585948944092 | 0.064000 | | DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.738585948944092 | 0.064000 |
| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |

@ -1,68 +1,64 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: 27.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 27.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear
stride_ms: 10.0 feat_dim: 161
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 1024 num_conv_layers: 2
use_gru: True num_rnn_layers: 3
share_rnn_weights: False rnn_layer_size: 1024
blank_id: 0 use_gru: True
ctc_grad_norm_type: instance share_rnn_weights: False
blank_id: 0
ctc_grad_norm_type: instance
training: ###########################################
n_epoch: 80 # Training #
accum_grad: 1 ###########################################
lr: 2e-3 n_epoch: 80
lr_decay: 0.83 accum_grad: 1
weight_decay: 1e-06 lr: 2.0e-3
global_grad_clip: 3.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1.0e-6
checkpoint: global_grad_clip: 3.0
log_interval: 100
checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10

@ -1,70 +1,68 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev
max_input_len: 27.0 # second test_manifest: data/manifest.test
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 27.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 64 # one gpu # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 64 # one gpu
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear #linear, mfcc, fbank random_seed: 0
feat_dim: spm_model_prefix:
delta_delta: False spectrum_type: linear #linear, mfcc, fbank
stride_ms: 10.0 feat_dim: 161
window_ms: 20.0 delta_delta: False
n_fft: None stride_ms: 10.0
max_freq: None window_ms: 20.0
target_sample_rate: 16000 n_fft: None
use_dB_normalization: True max_freq: None
target_dB: -20 target_sample_rate: 16000
dither: 1.0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True dither: 1.0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 0 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 5 ############################################
rnn_layer_size: 1024 num_conv_layers: 2
rnn_direction: forward # [forward, bidirect] num_rnn_layers: 5
num_fc_layers: 0 rnn_layer_size: 1024
fc_layers_size_list: -1, rnn_direction: forward # [forward, bidirect]
use_gru: False num_fc_layers: 0
blank_id: 0 fc_layers_size_list: -1,
use_gru: False
blank_id: 0
training: ###########################################
n_epoch: 65 # Training #
accum_grad: 1 ###########################################
lr: 5e-4 n_epoch: 65
lr_decay: 0.93 accum_grad: 1
weight_decay: 1e-06 lr: 5.0e-4
global_grad_clip: 3.0 lr_decay: 0.93
log_interval: 100 weight_decay: 1.0e-6
checkpoint: global_grad_clip: 3.0
log_interval: 100
checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 4.3
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10

@ -0,0 +1,10 @@
chunk_batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 4.3
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10

@ -0,0 +1,10 @@
decode_batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
model_type=$4
# download language model # download language model
bash local/download_lm_ch.sh bash local/download_lm_ch.sh
@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} --model_type ${model_type}

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type"
exit -1 exit -1
fi fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
jit_model_export_path=$2 decode_config_path=$2
model_type=$3 jit_model_export_path=$3
model_type=$4
# download language model # download language model
bash local/download_lm_ch.sh > /dev/null 2>&1 bash local/download_lm_ch.sh > /dev/null 2>&1
@ -21,6 +22,7 @@ fi
python3 -u ${BIN_DIR}/test_export.py \ python3 -u ${BIN_DIR}/test_export.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${jit_model_export_path}.rsl \ --result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \ --export_path ${jit_model_export_path} \
--model_type ${model_type} --model_type ${model_type}

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 4 ];then if [ $# != 5 ];then
echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file"
exit -1 exit -1
fi fi
@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
model_type=$3 ckpt_prefix=$3
audio_file=$4 model_type=$4
audio_file=$5
mkdir -p data mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@ -33,6 +34,7 @@ fi
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.rsl \ --result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--model_type ${model_type} \ --model_type ${model_type} \

@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
model_type=offline # offline or online model_type=offline # offline or online
audio_file=data/demo_01_03.wav audio_file=data/demo_01_03.wav
@ -34,7 +35,7 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
@ -44,11 +45,11 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test export ckpt avg_n # test export ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1
fi fi
# Optionally, you can add LM and test it with runtime. # Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
# test a single .wav file # test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1
fi fi

@ -25,7 +25,7 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- | | --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | | transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.8103787302970886 | 0.056588 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | | transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.8103787302970886 | 0.059932 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | | transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.8103787302970886 | 0.059989 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | | transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.8103787302970886 | 0.052273 |

@ -1,48 +1,11 @@
# https://yaml.org/type/float.html ############################################
data: # Network Architecture #
train_manifest: data/manifest.train ############################################
dev_manifest: data/manifest.dev cmvn_file:
test_manifest: data/manifest.test cmvn_file_type: "json"
min_input_len: 0.5 # encoder related
max_input_len: 20.0 # second encoder: conformer
min_output_len: 0.0 encoder_conf:
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 32
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention output_size: 256 # dimension of attention
attention_heads: 4 attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward linear_units: 2048 # the number of units of position-wise feed forward
@ -52,8 +15,8 @@ model:
attention_dropout_rate: 0.0 attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: True normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15 cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish' activation_type: 'swish'
pos_enc_layer_type: 'rel_pos' pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn' selfattention_layer_type: 'rel_selfattn'
@ -61,10 +24,9 @@ model:
use_dynamic_chunk: true use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false use_dynamic_left_chunk: false
# decoder related
# decoder related decoder: transformer
decoder: transformer decoder_conf:
decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
@ -72,51 +34,62 @@ model:
positional_dropout_rate: 0.1 positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention
# hybrid CTC/attention model_conf:
model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
###########################################
# Data #
###########################################
training: train_manifest: data/manifest.train
n_epoch: 240 dev_manifest: data/manifest.dev
accum_grad: 4 test_manifest: data/manifest.test
global_grad_clip: 5.0
optim: adam
optim_conf: ###########################################
lr: 0.001 # Dataloader #
weight_decay: 1e-6 ###########################################
scheduler: warmuplr
scheduler_conf: vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 64
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 240
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0 lr_decay: 1.0
log_interval: 100 log_interval: 100
checkpoint: checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.

@ -1,47 +1,11 @@
# https://yaml.org/type/float.html ############################################
data: # Network Architecture #
train_manifest: data/manifest.train ############################################
dev_manifest: data/manifest.dev cmvn_file:
test_manifest: data/manifest.test cmvn_file_type: "json"
min_input_len: 0.5 # encoder related
max_input_len: 20.0 # second encoder: conformer
min_output_len: 0.0 encoder_conf:
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention output_size: 256 # dimension of attention
attention_heads: 4 attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward linear_units: 2048 # the number of units of position-wise feed forward
@ -51,15 +15,15 @@ model:
attention_dropout_rate: 0.0 attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: True normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15 cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish' activation_type: 'swish'
pos_enc_layer_type: 'rel_pos' pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn' selfattention_layer_type: 'rel_selfattn'
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
@ -68,50 +32,58 @@ model:
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
training: ###########################################
n_epoch: 240 # Dataloader #
accum_grad: 2 ###########################################
global_grad_clip: 5.0 vocab_filepath: data/lang_char/vocab.txt
optim: adam spm_model_prefix: ''
optim_conf: unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 64
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
n_epoch: 240
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-6 weight_decay: 1.0e-6
scheduler: warmuplr # pytorch v1.1.0+ required scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0 lr_decay: 1.0
log_interval: 100 log_interval: 100
checkpoint: checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -5,7 +5,7 @@ process:
n_mels: 80 n_mels: 80
n_shift: 160 n_shift: 160
win_length: 400 win_length: 400
dither: true dither: 0.1
- type: cmvn_json - type: cmvn_json
cmvn_path: data/mean_std.json cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugument # these three processes are a.k.a. SpecAugument

@ -1,47 +1,11 @@
# https://yaml.org/type/float.html ############################################
data: # Network Architecture #
train_manifest: data/manifest.train ############################################
dev_manifest: data/manifest.dev cmvn_file:
test_manifest: data/manifest.test cmvn_file_type: "json"
min_input_len: 0.5 # encoder related
max_input_len: 20.0 # second encoder: transformer
min_output_len: 0.0 encoder_conf:
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
spm_model_prefix: ''
augmentation_config: conf/preprocess.yaml
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention output_size: 256 # dimension of attention
attention_heads: 4 attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward linear_units: 2048 # the number of units of position-wise feed forward
@ -51,10 +15,9 @@ model:
attention_dropout_rate: 0.0 attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
normalize_before: true normalize_before: true
# decoder related
# decoder related decoder: transformer
decoder: transformer decoder_conf:
decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
@ -63,50 +26,60 @@ model:
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: null
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
###########################################
# Data #
###########################################
# https://yaml.org/type/float.html
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
###########################################
# Dataloader #
###########################################
unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 64
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config: conf/preprocess.yaml
num_workers: 0
subsampling_factor: 1
num_encs: 1
training: ###########################################
n_epoch: 120 # Training #
accum_grad: 2 ###########################################
global_grad_clip: 5.0 n_epoch: 240
optim: adam accum_grad: 2
optim_conf: global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-6 weight_decay: 1.0e-6
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0 lr_decay: 1.0
log_interval: 100 log_interval: 100
checkpoint: checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.

@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
batch_size=1 batch_size=1
output_dir=${ckpt_prefix} output_dir=${ckpt_prefix}
@ -20,9 +21,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
chunk_mode=false chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"

@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then if [ $# != 4 ];then
echo "usage: ${0} config_path ckpt_path_prefix audio_file" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
exit -1 exit -1
fi fi
@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
audio_file=$3 ckpt_prefix=$3
audio_file=$4
mkdir -p data mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
@ -42,10 +43,11 @@ for type in attention_rescoring; do
python3 -u ${BIN_DIR}/test_wav.py \ python3 -u ${BIN_DIR}/test_wav.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} \ --opts decode.decode_batch_size ${batch_size} \
--audio_file ${audio_file} --audio_file ${audio_file}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then

@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/conformer.yaml conf_path=conf/conformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=20 avg_num=20
audio_file=data/demo_01_03.wav audio_file=data/demo_01_03.wav
@ -32,18 +33,18 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
# Optionally, you can add LM and test it with runtime. # Optionally, you can add LM and test it with runtime.
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# test a single .wav file # test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi fi
# Not supported at now!!! # Not supported at now!!!

@ -8,4 +8,5 @@
* voc1 - Parallel WaveGAN * voc1 - Parallel WaveGAN
* voc2 - MelGAN * voc2 - MelGAN
* voc3 - MultiBand MelGAN * voc3 - MultiBand MelGAN
* vc0 - Tactron2 Voice Clone with GE2E * vc0 - Tactron2 Voice Cloning with GE2E
* vc1 - FastSpeech2 Voice Cloning with GE2E

@ -72,8 +72,8 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] [--ngpu NGPU] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
Train a FastSpeech2 model. Train a FastSpeech2 model.
@ -87,11 +87,12 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu=0, use cpu. --ngpu NGPU if ngpu=0, use cpu.
--verbose VERBOSE verbose.
--phones-dict PHONES_DICT --phones-dict PHONES_DICT
phone vocabulary file. phone vocabulary file.
--speaker-dict SPEAKER_DICT --speaker-dict SPEAKER_DICT
speaker id map file for multiple speaker model. speaker id map file for multiple speaker model.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
``` ```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.

@ -67,8 +67,8 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
[--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--run-benchmark RUN_BENCHMARK]
[--profiler_options PROFILER_OPTIONS] [--profiler_options PROFILER_OPTIONS]
Train a ParallelWaveGAN model. Train a ParallelWaveGAN model.
@ -83,7 +83,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
benchmark: benchmark:
arguments related to benchmark. arguments related to benchmark.
@ -113,7 +112,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
[--output-dir OUTPUT_DIR] [--ngpu NGPU] [--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with GANVocoder. Synthesize with GANVocoder.
@ -130,7 +128,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.

@ -72,10 +72,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
########################################################### ###########################################################
batch_size: 8 # Batch size. batch_size: 8 # Batch size.
batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by n_shift. num_workers: 2 # Number of workers in DataLoader.
pin_memory: true # Whether to pin memory in Pytorch DataLoader. num_workers: 2 # Number of workers in DataLoader.
num_workers: 4 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
########################################################### ###########################################################
# OPTIMIZER & SCHEDULER SETTING # # OPTIMIZER & SCHEDULER SETTING #

@ -1,48 +1,47 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.5 dev_manifest: data/manifest.dev
max_input_len: 20.0 # second test_manifest: data/manifest.test
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'char' ###########################################
spm_model_prefix: '' vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/preprocess.yaml unit_type: 'char'
batch_size: 32 spm_model_prefix: ''
raw_wav: True # use raw_wav or kaldi feature preprocess_config: conf/preprocess.yaml
spectrum_type: fbank #linear, mfcc, fbank batch_size: 32
feat_dim: 80 raw_wav: True # use raw_wav or kaldi feature
delta_delta: False spectrum_type: fbank #linear, mfcc, fbank
dither: 1.0 feat_dim: 80
target_sample_rate: 8000 delta_delta: False
max_freq: None dither: 1.0
n_fft: None target_sample_rate: 8000
stride_ms: 10.0 max_freq: None
window_ms: 25.0 n_fft: None
use_dB_normalization: True stride_ms: 10.0
target_dB: -20 window_ms: 25.0
random_seed: 0 use_dB_normalization: True
keep_transcription_text: False target_dB: -20
sortagrad: True random_seed: 0
shuffle_method: batch_shuffle keep_transcription_text: False
num_workers: 2 sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: conformer # encoder related
encoder_conf: encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention output_size: 256 # dimension of attention
attention_heads: 4 attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward linear_units: 2048 # the number of units of position-wise feed forward
@ -62,9 +61,9 @@ model:
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false use_dynamic_left_chunk: false
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
@ -73,48 +72,27 @@ model:
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
###########################################
training: # Training #
n_epoch: 240 ###########################################
accum_grad: 4 n_epoch: 240
global_grad_clip: 5.0 accum_grad: 4
optim: adam global_grad_clip: 5.0
optim_conf: optim: adam
optim_conf:
lr: 0.001 lr: 0.001
weight_decay: 1e-6 weight_decay: 1.0e-6
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0 lr_decay: 1.0
log_interval: 100 log_interval: 100
checkpoint: checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.
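The refactor above flattens the old `data`/`collator`/`model`/`training` sections into a single top-level config. Below is a minimal sketch of reading such a flattened file with plain PyYAML, for illustration only; the repo's own config loader may differ.

```python
# Illustrative only: load the flattened config and read former
# "collator"/"training" keys directly at the top level.
import yaml

with open("conf/conformer.yaml") as f:
    config = yaml.safe_load(f)

print(config["batch_size"])                          # was collator.batch_size
print(config["optim"], config["optim_conf"]["lr"])   # was training.optim / training.optim_conf.lr
```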

@ -1,47 +1,44 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev ###########################################
test_manifest: data/manifest.test train_manifest: data/manifest.train
min_input_len: 0.5 dev_manifest: data/manifest.dev
max_input_len: 20.0 # second test_manifest: data/manifest.test
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.0
max_output_input_ratio: .inf
collator: ###########################################
vocab_filepath: data/lang_char/vocab.txt # Dataloader #
unit_type: 'char' ###########################################
spm_model_prefix: '' vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/preprocess.yaml unit_type: 'char'
batch_size: 32 spm_model_prefix: ''
raw_wav: True # use raw_wav or kaldi feature preprocess_config: conf/preprocess.yaml
spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80
feat_dim: 80 stride_ms: 10.0
delta_delta: False window_ms: 25.0
dither: 1.0 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
target_sample_rate: 8000 batch_size: 64
max_freq: None maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
n_fft: None maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
stride_ms: 10.0 minibatches: 0 # for debug
window_ms: 25.0 batch_count: auto
use_dB_normalization: True batch_bins: 0
target_dB: -20 batch_frames_in: 0
random_seed: 0 batch_frames_out: 0
keep_transcription_text: False batch_frames_inout: 0
sortagrad: True num_workers: 0
shuffle_method: batch_shuffle subsampling_factor: 1
num_workers: 2 num_encs: 1
# network architecture ############################################
model: # Network Architecture #
cmvn_file: ############################################
cmvn_file_type: "json" cmvn_file:
# encoder related cmvn_file_type: "json"
encoder: conformer # encoder related
encoder_conf: encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention output_size: 256 # dimension of attention
attention_heads: 4 attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward linear_units: 2048 # the number of units of position-wise feed forward
@ -57,9 +54,9 @@ model:
pos_enc_layer_type: 'rel_pos' pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn' selfattention_layer_type: 'rel_selfattn'
# decoder related # decoder related
decoder: transformer decoder: transformer
decoder_conf: decoder_conf:
attention_heads: 4 attention_heads: 4
linear_units: 2048 linear_units: 2048
num_blocks: 6 num_blocks: 6
@ -68,50 +65,28 @@ model:
self_attention_dropout_rate: 0.0 self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0
# hybrid CTC/attention # hybrid CTC/attention
model_conf: model_conf:
ctc_weight: 0.3 ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false length_normalized_loss: false
training: ###########################################
n_epoch: 100 # 50 will be lowest # Training #
accum_grad: 4 ###########################################
global_grad_clip: 5.0 n_epoch: 100 # 50 will be lowest
optim: adam accum_grad: 4
optim_conf: global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.002 lr: 0.002
weight_decay: 1e-6 weight_decay: 1.0e-6
scheduler: warmuplr scheduler: warmuplr
scheduler_conf: scheduler_conf:
warmup_steps: 25000 warmup_steps: 25000
lr_decay: 1.0 lr_decay: 1.0
log_interval: 100 log_interval: 100
checkpoint: checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -1,11 +1,11 @@
process: process:
# extract kaldi fbank from PCM # extract kaldi fbank from PCM
- type: fbank_kaldi - type: fbank_kaldi
fs: 16000 fs: 8000
n_mels: 80 n_mels: 80
n_shift: 160 n_shift: 160
win_length: 400 win_length: 400
dither: true dither: 0.1
- type: cmvn_json - type: cmvn_json
cmvn_path: data/mean_std.json cmvn_path: data/mean_std.json
# these three processes are a.k.a. SpecAugment # these three processes are a.k.a. SpecAugment
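The preprocess config is an ordered list of transforms under a `process` key. The sketch below only shows how such a list can be walked; the dispatcher is hypothetical and not the repo's actual implementation.

```python
# Illustrative only: iterate over the "process" pipeline from conf/preprocess.yaml
# and separate each step's type from its parameters.
import yaml

with open("conf/preprocess.yaml") as f:
    pipeline = yaml.safe_load(f)["process"]

for step in pipeline:
    kind = step["type"]                                     # e.g. fbank_kaldi, cmvn_json
    params = {k: v for k, v in step.items() if k != "type"}
    print(kind, params)  # a real loader would construct the corresponding transform here
```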

@ -0,0 +1,11 @@
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.
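Decode options now live in a separate tuning file like the one above and can be overridden per run, mirroring the `--opts decode.decoding_method ...` style used by the local test scripts. A minimal sketch with plain PyYAML follows; the keys are those shown in the config, but this is not the repo's actual override mechanism.

```python
# Illustrative only: load the tuning config and apply a command-line-style override.
import yaml

with open("conf/tuning/decode.yaml") as f:
    decode_cfg = yaml.safe_load(f)

decode_cfg["decoding_method"] = "ctc_prefix_beam_search"  # switch decoding strategy
decode_cfg["decode_batch_size"] = 64                      # smaller batch for decoding
print(decode_cfg)
```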

@ -0,0 +1,13 @@
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.

@ -1,7 +1,7 @@
#! /usr/bin/env bash #! /usr/bin/env bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
ckpt_name=$(basename ${ckpt_prefix}) ckpt_name=$(basename ${ckpt_prefix})
@ -25,9 +26,10 @@ mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/alignment.py \ python3 -u ${BIN_DIR}/alignment.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.align \ --result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!" echo "Failed in ctc alignment!"

@ -1,7 +1,7 @@
#! /usr/bin/env bash #! /usr/bin/env bash
if [ $# != 2 ];then if [ $# != 3 ];then
echo "usage: ${0} config_path ckpt_path_prefix" echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1 exit -1
fi fi
@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..." echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 decode_config_path=$2
ckpt_prefix=$3
ckpt_name=$(basename ${ckpt_prefix}) ckpt_name=$(basename ${ckpt_prefix})
@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"
@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do
python3 -u ${BIN_DIR}/test.py \ python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${output_dir}/${type}.rsl \ --result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \ --checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} \ --opts decode.decoding_method ${type} \
--opts decoding.batch_size ${batch_size} --opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "Failed in evaluation!" echo "Failed in evaluation!"

@ -4,8 +4,9 @@ source path.sh
gpus=0,1,2,3 gpus=0,1,2,3
stage=0 stage=0
stop_stage=100 stop_stage=50
conf_path=conf/conformer.yaml conf_path=conf/conformer.yaml
decode_conf_path=conf/tuning/decode.yaml
avg_num=20 avg_num=20
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
@ -31,15 +32,15 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n # test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data # ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then
# export ckpt avg_n # export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi fi

@ -60,8 +60,7 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--ngpu NGPU] [--use-relative-path USE_RELATIVE_PATH]
[--use-relative-path USE_RELATIVE_PATH]
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
Train a Speedyspeech model with a single speaker dataset. Train a Speedyspeech model with a single speaker dataset.
@ -76,7 +75,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
--use-relative-path USE_RELATIVE_PATH --use-relative-path USE_RELATIVE_PATH
whether use relative path in metadata whether use relative path in metadata
--phones-dict PHONES_DICT --phones-dict PHONES_DICT
@ -109,7 +107,7 @@ pwg_baker_ckpt_0.4
```bash ```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
``` ```
``text ```text
usage: synthesize.py [-h] usage: synthesize.py [-h]
[--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}]
[--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]

@ -45,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--stats=dump/train/feats_stats.npy \ --stats=dump/train/feats_stats.npy \
--phones-dict=dump/phone_id_map.txt \ --phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt \
--use-relative-path=True --use-relative-path=True
python3 ${BIN_DIR}/normalize.py \ python3 ${BIN_DIR}/normalize.py \
@ -53,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--stats=dump/train/feats_stats.npy \ --stats=dump/train/feats_stats.npy \
--phones-dict=dump/phone_id_map.txt \ --phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt \
--use-relative-path=True --use-relative-path=True
python3 ${BIN_DIR}/normalize.py \ python3 ${BIN_DIR}/normalize.py \
@ -61,6 +63,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--stats=dump/train/feats_stats.npy \ --stats=dump/train/feats_stats.npy \
--phones-dict=dump/phone_id_map.txt \ --phones-dict=dump/phone_id_map.txt \
--tones-dict=dump/tone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt \
--use-relative-path=True --use-relative-path=True
fi fi

@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \ --am_stat=dump/train/feats_stats.npy \
--voc=mb_melgan_csmsc \ --voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
@ -61,9 +61,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \ --am_stat=dump/train/feats_stats.npy \
--voc=style_melgan_csmsc \ --voc=style_melgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
@ -82,9 +82,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/feats_stats.npy \ --am_stat=dump/train/feats_stats.npy \
--voc=hifigan_csmsc \ --voc=hifigan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \

@ -63,8 +63,8 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] [--ngpu NGPU] [--phones-dict PHONES_DICT]
[--speaker-dict SPEAKER_DICT] [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING]
Train a FastSpeech2 model. Train a FastSpeech2 model.
@ -78,11 +78,12 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu=0, use cpu. --ngpu NGPU if ngpu=0, use cpu.
--verbose VERBOSE verbose.
--phones-dict PHONES_DICT --phones-dict PHONES_DICT
phone vocabulary file. phone vocabulary file.
--speaker-dict SPEAKER_DICT --speaker-dict SPEAKER_DICT
speaker id map file for multiple speaker model. speaker id map file for multiple speaker model.
--voice-cloning VOICE_CLONING
whether training voice cloning model.
``` ```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
@ -259,5 +260,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=exp/default/test_e2e \ --output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \ --inference_dir=exp/default/inference \
--phones_dict=dump/phone_id_map.txt --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
``` ```

@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \ --am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \ --voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
@ -59,9 +59,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \ --am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \ --voc=style_melgan_csmsc \
--voc_config=style_melgan_test/default.yaml \ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_test/snapshot_iter_935000.pdz \ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_test/feats_stats.npy \ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \
@ -80,9 +80,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \ --am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \ --voc=hifigan_csmsc \
--voc_config=hifigan_test/default.yaml \ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_test/snapshot_iter_1600000.pdz \ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_test/feats_stats.npy \ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \ --lang=zh \
--text=${BIN_DIR}/../sentences.txt \ --text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \ --output_dir=${train_output_path}/test_e2e \

@ -57,8 +57,8 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
[--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] [--run-benchmark RUN_BENCHMARK]
[--profiler_options PROFILER_OPTIONS] [--profiler_options PROFILER_OPTIONS]
Train a ParallelWaveGAN model. Train a ParallelWaveGAN model.
@ -73,7 +73,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
benchmark: benchmark:
arguments related to benchmark. arguments related to benchmark.
@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
[--output-dir OUTPUT_DIR] [--ngpu NGPU] [--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with GANVocoder. Synthesize with GANVocoder.
@ -120,7 +118,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained.
@ -134,7 +131,7 @@ The pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://pad
The static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). The static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip).
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------: :-------------:| :------------:| :-----: | :-----: | :--------:
default| 1(gpu) x 400000|1.948763|0.670098|0.248882 default| 1(gpu) x 400000|1.948763|0.670098|0.248882

@ -79,10 +79,7 @@ lambda_adv: 4.0 # Loss balancing coefficient.
########################################################### ###########################################################
batch_size: 8 # Batch size. batch_size: 8 # Batch size.
batch_max_steps: 25500 # Length of each audio in batch. Make sure it is divisible by n_shift. num_workers: 2 # Number of workers in DataLoader.
pin_memory: true # Whether to pin memory in Pytorch DataLoader. num_workers: 2 # Number of workers in DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader.
remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
########################################################### ###########################################################
# OPTIMIZER & SCHEDULER SETTING # # OPTIMIZER & SCHEDULER SETTING #

@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edge of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edge of audio.
You can download it from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) in our repo. You can download it from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -57,7 +57,7 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--ngpu NGPU]
Train a Multi-Band MelGAN model. Train a Multi-Band MelGAN model.
@ -71,7 +71,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
[--output-dir OUTPUT_DIR] [--ngpu NGPU] [--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with GANVocoder. Synthesize with GANVocoder.
@ -105,7 +103,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` multi band melgan config file. You should use the same config with which the model is trained. 1. `--config` multi band melgan config file. You should use the same config with which the model is trained.
@ -155,22 +152,22 @@ TODO:
The hyperparameters in `finetune.yaml` are not good enough; a smaller `learning_rate` should be used (and more `milestones` should be set). The hyperparameters in `finetune.yaml` are not good enough; a smaller `learning_rate` should be used (and more `milestones` should be set).
## Pretrained Models ## Pretrained Models
The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip).
The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip).
The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)
Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss
:-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------:
default| 1(gpu) x 1000000| ——|—— |—— |—— | ——| default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777|
finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 | finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 |
Multi Band MelGAN checkpoint contains files listed below. Multi Band MelGAN checkpoint contains files listed below.
```text ```text
mb_melgan_baker_ckpt_0.5 mb_melgan_csmsc_ckpt_0.1.1
├── default.yaml # default config used to train multi band melgan ├── default.yaml # default config used to train multi band melgan
├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan ├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan
└── snapshot_iter_1000000.pdz # generator parameters of multi band melgan └── snapshot_iter_1000000.pdz # generator parameters of multi band melgan

@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--dur-file=durations.txt \ --dur-file=durations.txt \
--output-dir=dump_finetune \ --output-dir=dump_finetune \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/
fi fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 local/link_wav.py \ python3 ${MAIN_ROOT}/utils/link_wav.py \
--old-dump-dir=dump \ --old-dump-dir=dump \
--dump-dir=dump_finetune --dump-dir=dump_finetune
fi fi

@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edge of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edge of audio.
You can download it from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) in our repo. You can download it from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -57,9 +57,9 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--ngpu NGPU]
Train a Multi-Band MelGAN model. Train a Style MelGAN model.
optional arguments: optional arguments:
-h, --help show this help message and exit -h, --help show this help message and exit
@ -71,7 +71,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
[--output-dir OUTPUT_DIR] [--ngpu NGPU] [--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with GANVocoder. Synthesize with GANVocoder.
@ -105,7 +103,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` style melgan config file. You should use the same config with which the model is trained. 1. `--config` style melgan config file. You should use the same config with which the model is trained.
@ -113,3 +110,20 @@ optional arguments:
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files. 4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip).
The static model of Style MelGAN is not available yet.
Style MelGAN checkpoint contains files listed below.
```text
style_melgan_csmsc_ckpt_0.1.1
├── default.yaml # default config used to train style melgan
├── feats_stats.npy # statistics used to normalize spectrogram when training style melgan
└── snapshot_iter_1500000.pdz # generator parameters of style melgan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -88,7 +88,7 @@ discriminator_adv_loss_params:
batch_size: 32 # Batch size. batch_size: 32 # Batch size.
# batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300, n_shift) # batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300, n_shift)
batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by n_shift. batch_max_steps: 24000 # Length of each audio in batch. Make sure it is divisible by n_shift.
num_workers: 2 # Number of workers in Pytorch DataLoader. num_workers: 2 # Number of workers in DataLoader.
########################################################### ###########################################################
# OPTIMIZER & SCHEDULER SETTING # # OPTIMIZER & SCHEDULER SETTING #

@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index
### Get MFA Result and Extract ### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edge of audio. We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence at the edge of audio.
You can download it from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) in our repo. You can download it from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started ## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`. Assume the path to the dataset is `~/datasets/BZNSYP`.
@ -57,7 +57,7 @@ Here's the complete help message.
```text ```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--verbose VERBOSE] [--ngpu NGPU]
Train a HiFiGAN model. Train a HiFiGAN model.
@ -71,7 +71,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
[--output-dir OUTPUT_DIR] [--ngpu NGPU] [--output-dir OUTPUT_DIR] [--ngpu NGPU]
[--verbose VERBOSE]
Synthesize with GANVocoder. Synthesize with GANVocoder.
@ -105,7 +103,6 @@ optional arguments:
--output-dir OUTPUT_DIR --output-dir OUTPUT_DIR
output dir. output dir.
--ngpu NGPU if ngpu == 0, use cpu. --ngpu NGPU if ngpu == 0, use cpu.
--verbose VERBOSE verbose.
``` ```
1. `--config` config file. You should use the same config with which the model is trained. 1. `--config` config file. You should use the same config with which the model is trained.
@ -114,4 +111,23 @@ optional arguments:
4. `--output-dir` is the directory to save the synthesized audio files. 4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Fine-tuning ## Pretrained Models
The pretrained model can be downloaded here [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip).
The static model can be downloaded here [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip).
Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
:-------------:| :------------:| :-----: | :-----: | :--------:
default| 1(gpu) x 2500000|24.927|0.1262|7.554
HiFiGAN checkpoint contains files listed below.
```text
hifigan_csmsc_ckpt_0.1.1
├── default.yaml # default config used to train hifigan
├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
└── snapshot_iter_2500000.pdz # generator parameters of hifigan
```
## Acknowledgement
We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.

@ -119,7 +119,7 @@ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss..
########################################################### ###########################################################
batch_size: 16 # Batch size. batch_size: 16 # Batch size.
batch_max_steps: 8400 # Length of each audio in batch. Make sure it is divisible by hop_size. num_workers: 2 # Number of workers in DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader. num_workers: 2 # Number of workers in DataLoader.
########################################################### ###########################################################
# OPTIMIZER & SCHEDULER SETTING # # OPTIMIZER & SCHEDULER SETTING #

@ -119,7 +119,7 @@ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss..
########################################################### ###########################################################
batch_size: 16 # Batch size. batch_size: 16 # Batch size.
batch_max_steps: 8400 # Length of each audio in batch. Make sure it is divisible by hop_size. num_workers: 2 # Number of workers in DataLoader.
num_workers: 2 # Number of workers in Pytorch DataLoader. num_workers: 2 # Number of workers in DataLoader.
########################################################### ###########################################################
# OPTIMIZER & SCHEDULER SETTING # # OPTIMIZER & SCHEDULER SETTING #

@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
--dur-file=durations.txt \ --dur-file=durations.txt \
--output-dir=dump_finetune \ --output-dir=dump_finetune \
--phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/
fi fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 local/link_wav.py \ python3 ${MAIN_ROOT}/utils/link_wav.py \
--old-dump-dir=dump \ --old-dump-dir=dump \
--dump-dir=dump_finetune --dump-dir=dump_finetune
fi fi

@ -1,85 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path
import jsonlines
import numpy as np
def main():
# parse config and args
parser = argparse.ArgumentParser(
description="Preprocess audio and then extract features .")
parser.add_argument(
"--old-dump-dir",
default=None,
type=str,
help="directory to dump feature files.")
parser.add_argument(
"--dump-dir",
type=str,
required=True,
help="directory to finetune dump feature files.")
args = parser.parse_args()
old_dump_dir = Path(args.old_dump_dir).expanduser()
old_dump_dir = old_dump_dir.resolve()
dump_dir = Path(args.dump_dir).expanduser()
# use absolute path
dump_dir = dump_dir.resolve()
dump_dir.mkdir(parents=True, exist_ok=True)
assert old_dump_dir.is_dir()
assert dump_dir.is_dir()
for sub in ["train", "dev", "test"]:
# symlink the *-wave.npy files from old_dump_dir to the corresponding locations under dump_dir
output_dir = dump_dir / sub
output_dir.mkdir(parents=True, exist_ok=True)
results = []
for name in os.listdir(output_dir / "raw"):
# 003918_feats.npy
utt_id = name.split("_")[0]
mel_path = output_dir / ("raw/" + name)
gen_mel = np.load(mel_path)
wave_name = utt_id + "_wave.npy"
wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
output_dir / ("raw/" + wave_name))
num_sample = wav.shape[0]
num_frames = gen_mel.shape[0]
wav_path = output_dir / ("raw/" + wave_name)
record = {
"utt_id": utt_id,
"num_samples": num_sample,
"num_frames": num_frames,
"feats": str(mel_path),
"wave": str(wav_path),
}
results.append(record)
results.sort(key=itemgetter("utt_id"))
with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
for item in results:
writer.write(item)
if __name__ == "__main__":
main()

@ -17,21 +17,32 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型可供用
- CNN6: This model mainly consists of 4 convolutional layers and 2 fully connected layers, with 4.5M parameters and an embedding dimension of 512. - CNN6: This model mainly consists of 4 convolutional layers and 2 fully connected layers, with 4.5M parameters and an embedding dimension of 512.
## Dataset
[ESC-50: Dataset for Environmental Sound Classification](https://github.com/karolpiczak/ESC-50) contains 2000 labeled environmental sound samples; each sample is a single-channel audio file sampled at 44,100 Hz, and all samples are divided by label into 50 classes with 40 samples per class.
## Model Metrics
The model is fine-tuned and evaluated with 5-fold cross validation according to the fold information provided by `ESC-50`; the average accuracy is as follows:
|Model|Acc|
|--|--|
|CNN14| 0.9500
|CNN10| 0.8975
|CNN6| 0.8825
## Quick Start ## Quick Start
### Model Training ### Model Training
Taking the environmental sound classification dataset `ESC50` as an example, run the command below to finetune the model on the training set; single-GPU and multi-GPU training on a single machine are supported. Run the command below to finetune the model on the training set; single-GPU and multi-GPU training on a single machine are supported.
Start training: Start training:
```shell ```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns.yaml
``` ```
Configurable parameters of the `paddlespeech/cls/exps/panns/train.py` script: The training parameters can be configured under `training` in `conf/panns.yaml`, where:
- `device`: the device used for model prediction.
- `feat_backend`: the feature extraction backend, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
- `epochs`: the number of training epochs; defaults to 50. - `epochs`: the number of training epochs; defaults to 50.
- `learning_rate`: the learning rate for fine-tuning; defaults to 5e-5. - `learning_rate`: the learning rate for fine-tuning; defaults to 5e-5.
- `batch_size`: the batch size; adjust it according to GPU memory and lower it if you run out of memory; defaults to 16. - `batch_size`: the batch size; adjust it according to GPU memory and lower it if you run out of memory; defaults to 16.
@ -40,36 +51,31 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1
- `save_freq`: how often the model is saved during training; defaults to 10. - `save_freq`: how often the model is saved during training; defaults to 10.
- `log_freq`: how often training information is logged; defaults to 10. - `log_freq`: how often training information is logged; defaults to 10.
The pretrained model used in the example code is `CNN14`; if you want to switch to another pretrained model, you can do it as follows: The pretrained model used in the example code is `CNN14`; if you want to switch to another pretrained model, modify the `model` section of `conf/panns.yaml`:
```python ```yaml
from paddleaudio.datasets import ESC50
from paddlespeech.cls.models import SoundClassifier
from paddlespeech.cls.models import cnn14, cnn10, cnn6
# CNN14 # CNN14
backbone = cnn14(pretrained=True, extract_embedding=True) model:
model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) backbone: 'paddlespeech.cls.models:cnn14'
```
```yaml
# CNN10 # CNN10
backbone = cnn10(pretrained=True, extract_embedding=True) model:
model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) backbone: 'paddlespeech.cls.models:cnn10'
```
```yaml
# CNN6 # CNN6
backbone = cnn6(pretrained=True, extract_embedding=True) model:
model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) backbone: 'paddlespeech.cls.models:cnn6'
``` ```
### Model Prediction ### Model Prediction
```shell ```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 conf/panns.yaml
``` ```
Configurable parameters of the `paddlespeech/cls/exps/panns/predict.py` script: The prediction parameters can be configured under `predicting` in `conf/panns.yaml`, where:
- `audio_file`: the audio file to run prediction on.
- `device`: the device used for model prediction.
- `wav`: the audio file to run prediction on.
- `feat_backend`: the feature extraction backend, either `'numpy'` or `'paddle'`; defaults to `'numpy'`.
- `top_k`: show the scores of the top k predicted labels; defaults to 1. - `top_k`: show the scores of the top k predicted labels; defaults to 1.
- `checkpoint`: the checkpoint file of model parameters. - `checkpoint`: the checkpoint file of model parameters.
@ -88,7 +94,7 @@ Cat: 6.579841738130199e-06
After training finishes, the saved dynamic-graph parameters can be exported as a static-graph model and parameters, which can then be deployed in static-graph mode. After training finishes, the saved dynamic-graph parameters can be exported as a static-graph model and parameters, which can then be deployed in static-graph mode.
```shell ```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ./checkpoint/epoch_50/model.pdparams ./export
``` ```
Configurable parameters of the `paddlespeech/cls/exps/panns/export_model.py` script: Configurable parameters of the `paddlespeech/cls/exps/panns/export_model.py` script:
@ -109,7 +115,7 @@ export
The `paddlespeech/cls/exps/panns/deploy/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment. The `paddlespeech/cls/exps/panns/deploy/predict.py` script uses the APIs of the `paddle.inference` module and provides an example of Python-side deployment.
```shell ```shell
$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 $ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 cpu ./export /audio/dog.wav
``` ```
The main configurable parameters of the `paddlespeech/cls/exps/panns/deploy/predict.py` script: The main configurable parameters of the `paddlespeech/cls/exps/panns/deploy/predict.py` script:
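For reference, here is a minimal static-graph inference sketch with `paddle.inference`; the file paths and the dummy input shape below are placeholders, not the exported model's actual layout.

```python
# Illustrative only: run an exported static-graph model on CPU with paddle.inference.
import numpy as np
from paddle.inference import Config, create_predictor

config = Config("./export/inference.pdmodel", "./export/inference.pdiparams")  # placeholder paths
config.disable_gpu()                      # matches the `cpu` infer_device used in run.sh
predictor = create_predictor(config)

input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
feats = np.random.randn(1, 64, 64).astype("float32")  # dummy feature batch; shape is illustrative
input_handle.copy_from_cpu(feats)
predictor.run()

output = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print(output.shape)
```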

@ -0,0 +1,9 @@
## Metrics
5-fold cross validation accuracy on [ESC-50](https://github.com/karolpiczak/ESC-50) dataset:
|Model|Acc|
|--|--|
|CNN14| 0.9500
|CNN10| 0.8975
|CNN6| 0.8825

@ -0,0 +1,36 @@
data:
dataset: 'paddleaudio.datasets:ESC50'
num_classes: 50
train:
mode: 'train'
split: 1
dev:
mode: 'dev'
split: 1
model:
backbone: 'paddlespeech.cls.models:cnn14'
feature:
sr: 32000
n_fft: 1024
hop_length: 320
window: 'hann'
win_length: 1024
f_min: 50.0
f_max: 14000.0
n_mels: 64
training:
epochs: 50
learning_rate: 0.00005
num_workers: 2
batch_size: 16
checkpoint_dir: './checkpoint'
save_freq: 10
log_freq: 10
predicting:
audio_file: '/audio/dog.wav'
top_k: 10
checkpoint: './checkpoint/epoch_50/model.pdparams'
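The `'module:attr'` strings in this config (e.g. `paddleaudio.datasets:ESC50`, `paddlespeech.cls.models:cnn14`) point at importable Python objects. Below is a hypothetical resolver sketch (the repo's own loader may differ); the `pretrained=True, extract_embedding=True` call matches the usage shown in the original README snippet.

```python
# Illustrative only: resolve a "module:attr" spec from the YAML config.
import importlib

def resolve(spec: str):
    module_name, attr = spec.split(":")
    return getattr(importlib.import_module(module_name), attr)

cnn14 = resolve("paddlespeech.cls.models:cnn14")
backbone = cnn14(pretrained=True, extract_embedding=True)  # as in the original README example
```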

@ -1,8 +1,8 @@
#!/bin/bash #!/bin/bash
ckpt_dir=$1 ckpt=$1
output_dir=$2 output_dir=$2
python3 ${BIN_DIR}/export_model.py \ python3 ${BIN_DIR}/export_model.py \
--checkpoint ${ckpt_dir}/model.pdparams \ --checkpoint ${ckpt} \
--output_dir ${output_dir} --output_dir ${output_dir}

@ -1,11 +1,4 @@
#!/bin/bash #!/bin/bash
audio_file=$1
ckpt_dir=$2
feat_backend=$3
python3 ${BIN_DIR}/predict.py \ python3 ${BIN_DIR}/predict.py \
--wav ${audio_file} \ --cfg_path=$1
--feat_backend ${feat_backend} \
--top_k 10 \
--checkpoint ${ckpt_dir}/model.pdparams

@ -1,25 +1,12 @@
#!/bin/bash #!/bin/bash
ngpu=$1 ngpu=$1
feat_backend=$2 cfg_path=$2
num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10
if [ ${ngpu} -gt 0 ]; then if [ ${ngpu} -gt 0 ]; then
python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
--epochs ${num_epochs} \ --cfg_path ${cfg_path}
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
else else
python3 ${BIN_DIR}/train.py \ python3 ${BIN_DIR}/train.py \
--epochs ${num_epochs} \ --cfg_path ${cfg_path}
--feat_backend ${feat_backend} \
--batch_size ${batch_size} \
--checkpoint_dir ${ckpt_dir} \
--save_freq ${save_freq}
fi fi

@ -6,28 +6,30 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
stage=$1 stage=$1
stop_stage=100 stop_stage=100
feat_backend=numpy
audio_file=~/cat.wav
ckpt_dir=./checkpoint/epoch_50
output_dir=./export
infer_device=cpu
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
./local/train.sh ${ngpu} ${feat_backend} || exit -1 cfg_path=$2
./local/train.sh ${ngpu} ${cfg_path} || exit -1
exit 0 exit 0
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 cfg_path=$2
./local/infer.sh ${cfg_path} || exit -1
exit 0 exit 0
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 ckpt=$2
output_dir=$3
./local/export.sh ${ckpt} ${output_dir} || exit -1
exit 0 exit 0
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1 infer_device=$2
graph_dir=$3
audio_file=$4
./local/static_model_infer.sh ${infer_device} ${graph_dir} ${audio_file} || exit -1
exit 0 exit 0
fi fi

@ -1,35 +1,29 @@
# Chinese Experiment Example # Punctuation Restoration with IWSLT2012-Zh
## Test Data:
- IWSLT2012 Chinese test2012
## Run the Code ## Get Started
- Run `run.sh 0 0 conf/train_conf/bertBLSTM_zh.yaml 1 conf/data_conf/chinese.yaml` ### Data Preprocessing
```bash
./run.sh --stage 0 --stop-stage 0
```
### Model Training
```bash
./run.sh --stage 1 --stop-stage 1
```
### Testing
```bash
./run.sh --stage 2 --stop-stage 2
```
### Punctuation Restoration
```bash
./run.sh --stage 3 --stop-stage 3
```
## Pretrained Model
The pretrained model can be downloaded here [ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip).
Removed:

## Experiment Results:
- BertLinear
  - Config: conf/train_conf/bertLinear_zh.yaml
  - Test results:

    |           | COMMA    | PERIOD   | QUESTION | OVERALL  |
    |-----------|----------|----------|----------|----------|
    | Precision | 0.425665 | 0.335190 | 0.698113 | 0.486323 |
    | Recall    | 0.511278 | 0.572108 | 0.787234 | 0.623540 |
    | F1        | 0.464560 | 0.422717 | 0.740000 | 0.542426 |
- BertBLSTM
  - Config: conf/train_conf/bertBLSTM_zh.yaml
  - Test results: avg_1

    |           | COMMA    | PERIOD   | QUESTION | OVERALL  |
    |-----------|----------|----------|----------|----------|
    | Precision | 0.469484 | 0.550604 | 0.801887 | 0.607325 |
    | Recall    | 0.580271 | 0.592408 | 0.817308 | 0.663329 |
    | F1        | 0.519031 | 0.570741 | 0.809524 | 0.633099 |
- BertBLSTM/avg_1, tested on Biaobei synthesized data

    |           | COMMA    | PERIOD   | QUESTION | OVERALL  |
    |-----------|----------|----------|----------|----------|
    | Precision | 0.217192 | 0.196339 | 0.820717 | 0.411416 |
    | Recall    | 0.205922 | 0.892531 | 0.416162 | 0.504872 |
    | F1        | 0.211407 | 0.321873 | 0.552279 | 0.361853 |

Added:

### Test Result
- Ernie Linear

| |COMMA |PERIOD |QUESTION |OVERALL|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|Precision |0.510955 |0.526462 |0.820755 |0.619391|
|Recall |0.517433 |0.564179 |0.861386 |0.647666|
|F1 |0.514173 |0.544669 |0.840580 |0.633141|

@ -0,0 +1,44 @@
###########################################################
# DATA SETTING #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
pretrained_token: ernie-1.0
punc_path: data/iwslt2012_zh/punc_vocab
seq_len: 100
###########################################################
# MODEL SETTING #
###########################################################
model_type: ErnieLinear
model:
pretrained_token: ernie-1.0
num_classes: 4
###########################################################
# OPTIMIZER SETTING #
###########################################################
optimizer_params:
weight_decay: 1.0e-6 # weight decay coefficient.
scheduler_params:
learning_rate: 1.0e-5 # learning rate.
gamma: 1.0 # scheduler gamma.
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 20
num_snapshots: 5
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
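For reference, a hedged sketch of how the `optimizer_params` and `scheduler_params` above could map onto Paddle's optimizer API; the real training script may wire this differently, and the `Linear` layer is only a stand-in for the ErnieLinear network:

```python
import paddle

model = paddle.nn.Linear(768, 4)  # stand-in for ErnieLinear (num_classes: 4)

# scheduler_params: learning_rate 1.0e-5, gamma 1.0 (effectively a constant lr)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=1.0e-5, gamma=1.0)

# optimizer_params: weight_decay 1.0e-6
optimizer = paddle.optimizer.Adam(
    learning_rate=lr_scheduler,
    weight_decay=1.0e-6,
    parameters=model.parameters())
```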

@ -1,36 +0,0 @@
data:
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
data_params:
pretrained_token: ernie-1.0
punc_path: data/iwslt2012_zh/punc_vocab
seq_len: 100
batch_size: 64
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
checkpoint:
kbest_n: 5
latest_n: 10
metric_type: F1
model_type: ErnieLinear
model_params:
pretrained_token: ernie-1.0
num_classes: 4
training:
n_epoch: 100
lr: !!float 1e-5
lr_decay: 1.0
weight_decay: !!float 1e-06
global_grad_clip: 5.0
log_interval: 10
log_path: log/train_ernie_linear.log
testing:
log_path: log/test_ernie_linear.log

@ -1,23 +0,0 @@
#! /usr/bin/env bash
if [ $# != 2 ]; then
echo "usage: ${0} ckpt_dir avg_num"
exit -1
fi
ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
python3 -u ${BIN_DIR}/avg_model.py \
--dst_model ${decode_checkpoint} \
--ckpt_dir ${ckpt_dir} \
--num ${average_num} \
--val_best
if [ $? -ne 0 ]; then
echo "Failed in avg ckpt!"
exit 1
fi
exit 0

@ -0,0 +1,12 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
text=$4
ckpt_prefix=${ckpt_name%.*}
python3 ${BIN_DIR}/punc_restore.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
--text=${text}
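Conceptually, restoration is token classification over `num_classes: 4` (no punctuation plus the marks listed in `punc_vocab`). A toy, self-contained sketch of the post-processing step; the label set and helper below are assumptions for illustration, not the project's code:

```python
# Hypothetical label ids: 0 = no punctuation, 1 = ",", 2 = "。", 3 = "?"
ID2PUNC = {0: '', 1: ',', 2: '。', 3: '?'}

def restore_punctuation(tokens, label_ids):
    """Append the predicted punctuation mark after each token."""
    return ''.join(tok + ID2PUNC[label] for tok, label in zip(tokens, label_ids))

# With per-character predictions for the demo sentence used in run.sh:
tokens = list('今天的天气真不错啊你下午有空吗我想约你一起去吃饭')
labels = [0] * len(tokens)
labels[8], labels[14], labels[-1] = 1, 3, 2  # after "啊", "吗", "饭"
print(restore_punctuation(tokens, labels))
# -> 今天的天气真不错啊,你下午有空吗?我想约你一起去吃饭。
```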

@ -1,26 +1,11 @@
#!/bin/bash #!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_prefix=$2 train_output_path=$2
ckpt_name=$3
python3 -u ${BIN_DIR}/test.py \
--ngpu 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix}
if [ $? -ne 0 ]; then ckpt_prefix=${ckpt_name%.*}
echo "Failed in evaluation!"
exit 1
fi
exit 0 python3 ${BIN_DIR}/test.py \
--config=${config_path} \
--checkpoint=${train_output_path}/checkpoints/${ckpt_name}

@ -1,28 +1,9 @@
#!/bin/bash #!/bin/bash
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name log_dir"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1 config_path=$1
ckpt_name=$2 train_output_path=$2
log_dir=$3
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output_dir exp/${ckpt_name} \
--log_dir ${log_dir}
if [ $? -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0 python3 ${BIN_DIR}/train.py \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1

@ -10,5 +10,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=$1 MODEL=ernie_linear
export BIN_DIR=${MAIN_ROOT}/paddlespeech/text/exps/${MODEL} export BIN_DIR=${MAIN_ROOT}/paddlespeech/text/exps/${MODEL}

@ -1,40 +1,35 @@
#!/bin/bash #!/bin/bash
set -e set -e
source path.sh
if [ $# -ne 4 ]; then gpus=0,1
echo "usage: bash ./run.sh stage gpu train_config avg_num" stage=0
echo "eg: bash ./run.sh 1 0 train_config 1"
exit -1
fi
stage=$1
stop_stage=100 stop_stage=100
gpus=$2
conf_path=$3
avg_num=$4
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
log_dir=log
source path.sh ${ckpt} conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_12840.pdz
text=今天的天气真不错啊你下午有空吗我想约你一起去吃饭
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this can not be mixed use with `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data # prepare data
bash ./local/data.sh ./local/data.sh
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${conf_path} ${ckpt} ${log_dir} CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
bash ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n CUDA_VISIBLE_DEVICES=${gpus} ./local/punc_restore.sh ${conf_path} ${train_output_path} ${ckpt_name} ${text}|| exit -1
CUDA_VISIBLE_DEVICES=${gpus} bash ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi fi

@ -1,68 +1,65 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev-clean ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev-clean
max_input_len: 30.0 # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 30.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 20 # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 20
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
target_sample_rate: 16000 spm_model_prefix:
max_freq: None spectrum_type: linear
n_fft: None feat_dim:
stride_ms: 10.0 target_sample_rate: 16000
window_ms: 20.0 max_freq: None
delta_delta: False n_fft: None
dither: 1.0 stride_ms: 10.0
use_dB_normalization: True window_ms: 20.0
target_dB: -20 delta_delta: False
random_seed: 0 dither: 1.0
keep_transcription_text: False use_dB_normalization: True
sortagrad: True target_dB: -20
shuffle_method: batch_shuffle random_seed: 0
num_workers: 2 keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
use_gru: False num_rnn_layers: 3
share_rnn_weights: True rnn_layer_size: 2048
blank_id: 0 use_gru: False
share_rnn_weights: True
blank_id: 0
training: ###########################################
n_epoch: 50 # Training #
accum_grad: 1 ###########################################
lr: 1e-3 n_epoch: 50
lr_decay: 0.83 accum_grad: 1
weight_decay: 1e-06 lr: 1.0e-3
global_grad_clip: 5.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1.0e-6
checkpoint: global_grad_clip: 5.0
log_interval: 100
checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8

@ -1,70 +1,67 @@
# https://yaml.org/type/float.html # https://yaml.org/type/float.html
data: ###########################################
train_manifest: data/manifest.train # Data #
dev_manifest: data/manifest.dev-clean ###########################################
test_manifest: data/manifest.test-clean train_manifest: data/manifest.train
min_input_len: 0.0 dev_manifest: data/manifest.dev-clean
max_input_len: 30.0 # second test_manifest: data/manifest.test-clean
min_output_len: 0.0 min_input_len: 0.0
max_output_len: .inf max_input_len: 30.0 # second
min_output_input_ratio: 0.00 min_output_len: 0.0
max_output_input_ratio: .inf max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf
collator: ###########################################
batch_size: 15 # Dataloader #
mean_std_filepath: data/mean_std.json ###########################################
unit_type: char batch_size: 15
vocab_filepath: data/lang_char/vocab.txt mean_std_filepath: data/mean_std.json
augmentation_config: conf/augmentation.json unit_type: char
random_seed: 0 vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: augmentation_config: conf/augmentation.json
spectrum_type: linear random_seed: 0
target_sample_rate: 16000 spm_model_prefix:
max_freq: None spectrum_type: linear
n_fft: None feat_dim:
stride_ms: 10.0 target_sample_rate: 16000
window_ms: 20.0 max_freq: None
delta_delta: False n_fft: None
dither: 1.0 stride_ms: 10.0
use_dB_normalization: True window_ms: 20.0
target_dB: -20 delta_delta: False
random_seed: 0 dither: 1.0
keep_transcription_text: False use_dB_normalization: True
sortagrad: True target_dB: -20
shuffle_method: batch_shuffle random_seed: 0
num_workers: 0 keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0
model: ############################################
num_conv_layers: 2 # Network Architecture #
num_rnn_layers: 3 ############################################
rnn_layer_size: 2048 num_conv_layers: 2
rnn_direction: forward num_rnn_layers: 3
num_fc_layers: 2 rnn_layer_size: 2048
fc_layers_size_list: 512, 256 rnn_direction: forward
use_gru: False num_fc_layers: 2
blank_id: 0 fc_layers_size_list: 512, 256
use_gru: False
blank_id: 0
training: ###########################################
n_epoch: 50 # Training #
accum_grad: 4 ###########################################
lr: 1e-3 n_epoch: 50
lr_decay: 0.83 accum_grad: 4
weight_decay: 1e-06 lr: 1.0e-3
global_grad_clip: 5.0 lr_decay: 0.83
log_interval: 100 weight_decay: 1.0e-6
checkpoint: global_grad_clip: 5.0
log_interval: 100
checkpoint:
kbest_n: 50 kbest_n: 50
latest_n: 5 latest_n: 5
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8

@ -0,0 +1,10 @@
decode_batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8

Some files were not shown because too many files have changed in this diff
