diff --git a/.gitignore b/.gitignore index 8cbb734df..cc8fff877 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ build docs/build/ +docs/topic/ctc/warp-ctc/ tools/venv tools/kenlm diff --git a/.mergify.yml b/.mergify.yml index 2c30721f7..3347c6dc3 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -32,6 +32,12 @@ pull_request_rules: actions: label: remove: ["conflicts"] + - name: "auto add label=Dataset" + conditions: + - files~=^dataset/ + actions: + label: + add: ["Dataset"] - name: "auto add label=S2T" conditions: - files~=^paddlespeech/s2t/ @@ -50,18 +56,30 @@ pull_request_rules: actions: label: add: ["Audio"] - - name: "auto add label=TextProcess" + - name: "auto add label=Vector" + conditions: + - files~=^paddlespeech/vector/ + actions: + label: + add: ["Vector"] + - name: "auto add label=Text" conditions: - files~=^paddlespeech/text/ actions: label: - add: ["TextProcess"] + add: ["Text"] - name: "auto add label=Example" conditions: - files~=^examples/ actions: label: add: ["Example"] + - name: "auto add label=CLI" + conditions: + - files~=^paddlespeech/cli + actions: + label: + add: ["CLI"] - name: "auto add label=Demo" conditions: - files~=^demos/ @@ -70,13 +88,13 @@ pull_request_rules: add: ["Demo"] - name: "auto add label=README" conditions: - - files~=README.md + - files~=(README.md|READEME_cn.md) actions: label: add: ["README"] - name: "auto add label=Documentation" conditions: - - files~=^docs/ + - files~=^(docs/|CHANGELOG.md|paddleaudio/CHANGELOG.md) actions: label: add: ["Documentation"] @@ -88,10 +106,16 @@ pull_request_rules: add: ["CI"] - name: "auto add label=Installation" conditions: - - files~=^(tools/|setup.py|setup.sh) + - files~=^(tools/|setup.py|setup.cfg|setup_audio.py) actions: label: add: ["Installation"] + - name: "auto add label=Test" + conditions: + - files~=^(tests/) + actions: + label: + add: ["Test"] - name: "auto add label=mergify" conditions: - files~=^.mergify.yml diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..5ffe80984 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,11 @@ +# Changelog + + +Date: 2022-1-10, Author: Jackwaterveg. +Add features to: CLI: + - Support English (librispeech/asr1/transformer). + - Support choosing `decode_method` for conformer and transformer models. + - Refactor the config, using the unified config. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297 + +*** diff --git a/README.md b/README.md index 328508f1f..cca1cb539 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@

Quick Start - | Tutorials + | Documents | Models List @@ -25,14 +25,6 @@

- - **PaddleSpeech** is an open-source toolkit on [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform for a variety of critical tasks in speech and audio, with the state-of-art and influential models. @@ -61,7 +53,6 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 我认为跑步最重要的就是给我带来了身体健康。 - @@ -95,7 +86,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - + @@ -114,6 +105,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+ + + +
Input Text Input Text Synthetic Audio
季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 + +
+
@@ -121,7 +119,39 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html). -### Features: +##### Punctuation Restoration +
+ + + + + + + + + + + + + +
Input Text Output Text
今天的天气真不错啊你下午有空吗我想约你一起去吃饭今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
+ +
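The table above shows the newly added punctuation restoration capability end to end. A minimal Python sketch of driving it through the CLI executors follows; the `TextExecutor` name, its import path and its keyword arguments are assumptions patterned after the `ASRExecutor` usage in `demos/speech_recognition` later in this diff, not an interface confirmed by this patch.

```python
# Hedged sketch only: TextExecutor and its arguments are assumptions modelled on the
# ASRExecutor pattern; check paddlespeech/cli for the actual interface.
import paddle
from paddlespeech.cli import TextExecutor  # assumed import path

text_executor = TextExecutor()
restored = text_executor(
    text='今天的天气真不错啊你下午有空吗我想约你一起去吃饭',
    task='punc',  # punctuation restoration task name (assumed)
    lang='zh',
    device=paddle.get_device())
print(restored)  # expected: 今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
```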
+ +### ⭐ Examples +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): Use PaddleSpeech TTS to generate virtual human voice.** + +
+ +### 🔥 Hot Activities + +- 2021.12.21~12.24 + + 4 Days Live Courses: Depth interpretation of PaddleSpeech! + + **Courses videos and related materials: https://aistudio.baidu.com/aistudio/education/group/info/25130** + +### Features Via the easy-to-use, efficient, flexible and scalable implementation, our vision is to empower both industrial application and academic research, including training, inference & testing modules, and deployment process. To be more specific, this toolkit features at: - 📦 **Ease of Use**: low barriers to install, and [CLI](#quick-start) is available to quick-start your journey. @@ -132,8 +162,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🔬 *Integration of mainstream models and datasets*: the toolkit implements modules that participate in the whole pipeline of the speech tasks, and uses mainstream datasets like LibriSpeech, LJSpeech, AIShell, CSMSC, etc. See also [model list](#model-list) for more details. - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). - -### Recent Update: +### Recent Update - @@ -313,7 +344,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle - + - + + + + + + + + + + + @@ -383,11 +428,37 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
VocoderVocoder WaveFlow LJSpeech @@ -333,7 +364,21 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle Multi Band MelGAN-csmsc
Style MelGANCSMSC + Style MelGAN-csmsc +
HiFiGANCSMSC + HiFiGAN-csmsc +
Voice Cloning GE2E
+**Punctuation Restoration** + + + + + + + + + + + + + + + + + + + +
Task Dataset Model Type Link
Punctuation RestorationIWLST2012_zhErnie Linear + iwslt2012-punc0 +
+ ## Documents Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](https://paperswithcode.com/area/audio) and [Music SoTA](https://paperswithcode.com/area/music) give you an overview of the hot academic topics in the related area. To focus on the tasks in PaddleSpeech, you will find the following guidelines are helpful to grasp the core ideas. - [Installation](./docs/source/install.md) +- [Quick Start](#quickstart) +- [Some Demos](./demos/README.md) - Tutorials - [Automatic Speech Recognition](./docs/source/asr/quick_start.md) - [Introduction](./docs/source/asr/models_introduction.md) @@ -399,9 +470,12 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht - [Advanced Usage](./docs/source/tts/advanced_usage.md) - [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md) - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - - Audio Classification - - Speech Translation + - [Audio Classification](./demos/audio_tagging/README.md) + - [Speech Translation](./demos/speech_translation/README.md) - [Released Models](./docs/source/released_model.md) +- [Community](#Community) +- [Welcome to contribute](#contribution) +- [License](#License) The Text-to-Speech module is originally called [Parakeet](https://github.com/PaddlePaddle/Parakeet), and now merged with this repository. If you are interested in academic research about this task, please see [TTS research overview](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview). Also, [this document](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) is a good guideline for the pipeline components. @@ -416,7 +490,7 @@ howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}}, year={2021} } ``` - + ## Contribute to PaddleSpeech You are warmly welcome to submit questions in [discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) and bug reports in [issues](https://github.com/PaddlePaddle/PaddleSpeech/issues)! Also, we highly appreciate if you are willing to contribute to this project! @@ -460,13 +534,16 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P ## Acknowledgement -- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling) for years of attention, constructive advice and great help. + +- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. - Many thanks to [AK391](https://github.com/AK391) for TTS web demo on Huggingface Spaces using Gradio. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - +- Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. 
+- Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. + ## License PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE). diff --git a/README_cn.md b/README_cn.md index 551c9395b..ddf189c31 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,4 +1,4 @@ - (简体中文|[English](./README.md)) +(简体中文|[English](./README.md))

@@ -6,7 +6,7 @@

快速开始 - | 教程 + | 教程文档 | 模型列表 @@ -30,7 +30,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 4.What is the goal of this project? --> -**PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 深度学习开源框架平台上的一个开源模型库,用于语音和音频中的各种关键任务的开发,包含大量前沿和有影响力的模型,一些典型的应用示例如下: +**PaddleSpeech** 是基于飞桨 [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) 的语音方向的开源模型库,用于语音和音频中的各种关键任务的开发,包含大量基于深度学习前沿和有影响力的模型,一些典型的应用示例如下: ##### 语音识别
@@ -38,7 +38,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 输入音频 - 识别结果 + 识别结果 @@ -68,8 +68,8 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - - + + @@ -90,7 +90,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
输入音频 翻译结果 输入音频 翻译结果
- + @@ -109,6 +109,13 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
+ + + +
输入文本 输入文本 合成音频
季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 + +
+
@@ -116,7 +123,39 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme 更多合成音频,可以参考 [PaddleSpeech 语音合成音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)。 -### 特性: +##### 标点恢复 +
+ + + + + + + + + + + + + +
输入文本 输出文本
今天的天气真不错啊你下午有空吗我想约你一起去吃饭今天的天气真不错啊!你下午有空吗?我想约你一起去吃饭。
+ +
+ +### ⭐ 应用案例 +- **[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo): 使用 PaddleSpeech 的语音合成模块生成虚拟人的声音。** + +
+ + +### 🔥 热门活动 + +- 2021.12.21~12.24 + + 4 日直播课: 深度解读 PaddleSpeech 语音技术! + + **直播回放与课件资料: https://aistudio.baidu.com/aistudio/education/group/info/25130** +### 特性 本项目采用了易用、高效、灵活以及可扩展的实现,旨在为工业应用、学术研究提供更好的支持,实现的功能包含训练、推断以及测试模块,以及部署过程,主要包括 - 📦 **易用性**: 安装门槛低,可使用 [CLI](#quick-start) 快速开始。 @@ -127,7 +166,7 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。 - 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。 -### 近期更新: +### 近期更新 - - + @@ -254,6 +293,7 @@ The current hyperlinks redirect to [Previous Parakeet](https://github.com/Paddle
语音识别模块种类语音转文本模块类型 数据集 模型种类 链接
+ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声学模型和声码器。声学模型和声码器模型如下: @@ -261,8 +301,8 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - - + + @@ -302,7 +342,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - + - + + + + + + + + + + + @@ -348,6 +402,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
语音合成模块类型 模型种类 数据集 链接 数据集 链接
声码器声码器 WaveFlow LJSpeech @@ -322,7 +362,21 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 Multi Band MelGAN-csmsc
Style MelGANCSMSC + Style MelGAN-csmsc +
HiFiGANCSMSC + HiFiGAN-csmsc +
声音克隆 GE2E
+ **声音分类** @@ -373,25 +428,62 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-## 文档 +**标点恢复** + + + + + + + + + + + + + + + + + + + +
任务 数据集 模型种类 链接
标点恢复IWLST2012_zhErnie Linear + iwslt2012-punc0 +
+ +## 教程文档 -[语音 SoTA](https://paperswithcode.com/area/speech)、[声音 SoTA](https://paperswithcode.com/area/audio)、[音乐 SoTA](https://paperswithcode.com/area/music) 概述了相关领域的热门学术话题。对于 PaddleSpeech 的所关注的任务,以下指南有助于掌握核心思想。 +对于 PaddleSpeech 的所关注的任务,以下指南有助于帮助开发者快速入门,了解语音相关核心思想。 -- [安装](./docs/source/install.md) -- 教程 - - [语音识别](./docs/source/asr/quick_start.md) +- [下载安装](./docs/source/install_cn.md) +- [快速开始](#快速开始) +- Notebook基础教程 + - [声音分类](./docs/tutorial/cls/cls_tutorial.ipynb) + - [语音识别](./docs/tutorial/asr/tutorial_transformer.ipynb) + - [语音翻译](./docs/tutorial/st/st_tutorial.ipynb) + - [声音合成](./docs/tutorial/tts/tts_tutorial.ipynb) + - [示例Demo](./demos/README.md) +- 进阶文档 + - [语音识别自定义训练](./docs/source/asr/quick_start.md) - [简介](./docs/source/asr/models_introduction.md) - [数据准备](./docs/source/asr/data_preparation.md) - [数据增强](./docs/source/asr/augmentation.md) - [Ngram 语言模型](./docs/source/asr/ngram_lm.md) - - [语音合成](./docs/source/tts/quick_start.md) + - [语音合成自定义训练](./docs/source/tts/quick_start.md) - [简介](./docs/source/tts/models_introduction.md) - [进阶用法](./docs/source/tts/advanced_usage.md) - [中文文本前端](./docs/source/tts/zh_text_frontend.md) - - [音频示例](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - - 声音分类 - - 语音翻译 -- [模型](./docs/source/released_model.md) + - [测试语音样本](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) + - [声音分类](./demos/audio_tagging/README_cn.md) + - [语音翻译](./demos/speech_translation/README_cn.md) +- [模型列表](#模型列表) + - [语音识别](#语音识别模型) + - [语音合成](#语音合成模型) + - [声音分类](#声音分类模型) +- [技术交流群](#技术交流群) +- [欢迎贡献](#欢迎贡献) +- [License](#License) 语音合成模块最初被称为 [Parakeet](https://github.com/PaddlePaddle/Parakeet),现在与此仓库合并。如果您对该任务的学术研究感兴趣,请参阅 [TTS 研究概述](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/docs/source/tts#overview)。此外,[模型介绍](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/models_introduction.md) 是了解语音合成流程的一个很好的指南。 @@ -408,9 +500,9 @@ year={2021} } ``` + ## 参与 PaddleSpeech 的开发 - 热烈欢迎您在[Discussions](https://github.com/PaddlePaddle/PaddleSpeech/discussions) 中提交问题,并在[Issues](https://github.com/PaddlePaddle/PaddleSpeech/issues) 中指出发现的 bug。此外,我们非常希望您参与到 PaddleSpeech 的开发中! 
### 贡献者 @@ -452,10 +544,12 @@ year={2021} ## 致谢 -- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling) 多年来的关注和建议,以及在诸多问题上的帮助。 +- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [AK391](https://github.com/AK391) 在 Huggingface Spaces 上使用 Gradio 对我们的语音合成功能进行网页版演示。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 - +- 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。 +- 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 ## License diff --git a/demos/README.md b/demos/README.md index 28bab8bb3..4482aa191 100644 --- a/demos/README.md +++ b/demos/README.md @@ -1,10 +1,15 @@ # Speech Application based on PaddleSpeech +([简体中文](./README_cn.md)|English) + The directory containes many speech applications in multi scenarios. -* audio tagging - tag audio label in vedio -* metaverse - 2D AR with TTS -* speech recogintion - vidio understanding +* audio tagging - multi-label tagging of an audio file +* automatic_video_subtitiles - generate subtitles from a video +* metaverse - 2D AR with TTS +* punctuation_restoration - restore punctuation from raw text +* speech recogintion - recognize text of an audio file * speech translation - end to end speech translation * story talker - book reader based on OCR and TTS * style_fs2 - multi style control for FastSpeech2 model +* text_to_speech - convert text into speech diff --git a/demos/README_cn.md b/demos/README_cn.md new file mode 100644 index 000000000..242b4f070 --- /dev/null +++ b/demos/README_cn.md @@ -0,0 +1,15 @@ +# PaddleSpeech 语音应用 Demo + +(简体中文|[English](./README.md)) + +该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo: + +* 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。 +* 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。 +* 元宇宙 - 基于语音合成的 2D 增强现实。 +* 标点恢复 - 通常作为语音识别的文本后处理任务,为一段无标点的纯文本添加相应的标点符号。 +* 语音识别 - 识别一段音频中包含的语音文字。 +* 语音翻译 - 实时识别音频中的语言,并同时翻译成目标语言。 +* 会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书。 +* 个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成。 +* 语音合成 - 基于给定的文本生成语音音频。 diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index 88ff36375..9d4af0be6 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -9,9 +9,9 @@ This demo is an implementation to tag an audio file with 527 [AudioSet](https:// ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`). 
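The audio tagging demo above labels a WAV file with the 527 AudioSet classes. Below is a minimal sketch of doing the same from Python; the `CLSExecutor` name, its import path and its keyword arguments are assumptions patterned after the ASR executor in `demos/speech_recognition`, and `./dog.wav` is only a placeholder input, so treat this as illustrative rather than the demo's documented API.

```python
# Hedged sketch: CLSExecutor and its keyword arguments are assumptions based on the
# common executor pattern of the PaddleSpeech CLI; see paddlespeech/cli for the
# actual interface and defaults.
import paddle
from paddlespeech.cli import CLSExecutor  # assumed import path

cls_executor = CLSExecutor()
result = cls_executor(
    audio_file='./dog.wav',  # placeholder path to any WAV recording
    topk=10,                 # number of AudioSet labels to report (assumed parameter)
    device=paddle.get_device())
print(result)
```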
diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md index 3331a83a6..79f87bf8c 100644 --- a/demos/audio_tagging/README_cn.md +++ b/demos/audio_tagging/README_cn.md @@ -9,9 +9,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入应该是一个 WAV 文件(`.wav`), diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md index df4b0e264..db6da40db 100644 --- a/demos/automatic_video_subtitiles/README.md +++ b/demos/automatic_video_subtitiles/README.md @@ -8,9 +8,9 @@ This demo is an implementation to automatic video subtitles from a video file. I ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. ### 2. Prepare Input Get a video file with the speech of the specific language: diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md index b217f921d..fc7b2cf6a 100644 --- a/demos/automatic_video_subtitiles/README_cn.md +++ b/demos/automatic_video_subtitiles/README_cn.md @@ -6,9 +6,10 @@ 这个 demo 是一个为视频自动生成字幕的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + ### 2. 准备输入 获取包含特定语言语音的视频文件: ```bash diff --git a/demos/metaverse/run.sh b/demos/metaverse/run.sh index e653dbb70..551f0b4e5 100755 --- a/demos/metaverse/run.sh +++ b/demos/metaverse/run.sh @@ -37,17 +37,20 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # run tts CUDA_VISIBLE_DEVICES=${gpus} \ - python3 ${BIN_DIR}/synthesize_e2e.py \ - --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --am_ckpt=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --am_stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ --text=sentences.txt \ - --output-dir=output/wavs \ - --inference-dir=output/inference \ - --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --output_dir=output/wavs \ + --inference_dir=output/inference \ + --phones_dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt # output/inference is not needed here, which save the static models rm -rf output/inference fi diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md index 966e387c6..518d437dc 100644 --- a/demos/punctuation_restoration/README.md +++ 
b/demos/punctuation_restoration/README.md @@ -7,9 +7,10 @@ This demo is an implementation to restore punctuation from raw text. It can be d ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. + ### 2. Prepare Input The input of this demo should be a text of the specific language that can be passed via argument. diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md index 4f1e01239..9d4be8bf0 100644 --- a/demos/punctuation_restoration/README_cn.md +++ b/demos/punctuation_restoration/README_cn.md @@ -9,9 +9,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入是通过参数传递的特定语言的文本。 diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 738acdc5c..c49afa35c 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -8,9 +8,9 @@ This demo is an implementation to recognize text from a specific audio file. It ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. @@ -23,8 +23,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 3. Usage - Command Line(Recommended) ```bash + # Chinese paddlespeech asr --input ./zh.wav + # English + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav ``` + (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) + Usage: ```bash paddlespeech asr --help @@ -36,11 +41,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `sample_rate`: Sample rate of the model. Default: `16000`. - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. + - `yes`: No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate. Default: `False`. - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment. Output: ```bash + # Chinese [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + # English + [2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building ``` - Python API @@ -56,6 +65,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee config=None, # Set `config` and `ckpt_path` to None to use pretrained model. 
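        # `force_yes=True` lets the executor convert the input sample rate
        # automatically, mirroring the new `yes` command-line option described above.
        # For the newly supported English model, pass model='transformer_librispeech',
        # lang='en' and an English recording such as ./en.wav instead of ./zh.wav.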
ckpt_path=None, audio_file='./zh.wav', + force_yes=False, device=paddle.get_device()) print('ASR Result: \n{}'.format(text)) ``` @@ -73,4 +83,4 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | Model | Language | Sample Rate | :--- | :---: | :---: | | conformer_wenetspeech| zh| 16000 -| transformer_aishell| zh| 16000 +| transformer_librispeech| en| 16000 diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 29e2343d3..c2e38c91b 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -2,14 +2,15 @@ # 语音识别 ## 介绍 -语音识别解决让计算机程序自动转录语音的问题。 +语音识别是一项用计算机程序自动转录语音的技术。 这个 demo 是一个从给定音频文件识别文本的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -20,8 +21,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 3. 使用方法 - 命令行 (推荐使用) ```bash + # 中文 paddlespeech asr --input ./zh.wav + # 英文 + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav ``` + (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) + 使用方法: ```bash paddlespeech asr --help @@ -33,11 +39,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 + - `yes`;不需要设置额外的参数,一旦设置了该参数,说明你默认同意程序的所有请求,其中包括自动转换输入音频的采样率。默认值:`False`。 - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。 输出: ```bash + # 中文 [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + # 英文 + [2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building ``` - Python API @@ -53,6 +63,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee config=None, # Set `config` and `ckpt_path` to None to use pretrained model. ckpt_path=None, audio_file='./zh.wav', + force_yes=False, device=paddle.get_device()) print('ASR Result: \n{}'.format(text)) ``` @@ -69,4 +80,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee | 模型 | 语言 | 采样率 | :--- | :---: | :---: | | conformer_wenetspeech| zh| 16000 -| transformer_aishell| zh| 16000 +| transformer_librispeech| en| 16000 diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index a2041b0c9..f675a4eda 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -7,9 +7,10 @@ This demo is an implementation to recognize text from a specific audio file and ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. + ### 2. Prepare Input File The input of this demo should be a WAV file(`.wav`). diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md index affa82282..bad9b392f 100644 --- a/demos/speech_translation/README_cn.md +++ b/demos/speech_translation/README_cn.md @@ -8,9 +8,10 @@ ## 使用方法 ### 1. 
安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + ### 2. 准备输入 这个 Demo 的输入是 WAV(`.wav`) 语音文件 diff --git a/demos/story_talker/run.sh b/demos/story_talker/run.sh index 142959b6b..50335e73b 100755 --- a/demos/story_talker/run.sh +++ b/demos/story_talker/run.sh @@ -37,17 +37,20 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # run tts CUDA_VISIBLE_DEVICES=${gpus} \ - python3 ${BIN_DIR}/synthesize_e2e.py \ - --fastspeech2-config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --pwg-config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ - --pwg-checkpoint=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --pwg-stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_csmsc \ + --am_config=download/fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ + --am_ckpt=download/fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ + --am_stat=download/fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=download/pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=download/pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=download/pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ --text=output/sentences.txt \ - --output-dir=output/wavs \ - --inference-dir=output/inference \ - --phones-dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --output_dir=output/wavs \ + --inference_dir=output/inference \ + --phones_dict=download/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt # output/inference is not needed here, which save the static models rm -rf output/inference fi diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 785c2a623..9d3c4ac53 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -8,9 +8,9 @@ This demo is an implementation to generate audio from the given text. It can be ## Usage ### 1. Installation -```bash -pip install paddlespeech -``` +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. ### 2. Prepare Input The input of this demo should be a text of the specific language that can be passed via argument. diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 0b2cd0b5d..f075efdaf 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -9,9 +9,10 @@ ## 使用方法 ### 1. 安装 -```bash -pip install paddlespeech -``` +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + ### 2. 准备输入 这个 demo 的输入是通过参数传递的特定语言的文本。 diff --git a/docs/source/cls/custom_dataset.md b/docs/source/cls/custom_dataset.md new file mode 100644 index 000000000..aaf5943c5 --- /dev/null +++ b/docs/source/cls/custom_dataset.md @@ -0,0 +1,128 @@ +# Customize Dataset for Audio Classification + +Following this tutorial you can customize your dataset for audio classification task by using `paddlespeech` and `paddleaudio`. + +A base class of classification dataset is `paddleaudio.dataset.AudioClassificationDataset`. 
To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`. + +Assuming you have some wave files that stored in your own directory. You should prepare a meta file with the information of filepaths and labels. For example the absolute path of it is `/PATH/TO/META_FILE.txt`: +``` +/PATH/TO/WAVE_FILE/1.wav cat +/PATH/TO/WAVE_FILE/2.wav cat +/PATH/TO/WAVE_FILE/3.wav dog +/PATH/TO/WAVE_FILE/4.wav dog +``` +Here is an example to build your custom dataset in `custom_dataset.py`: + +```python +from paddleaudio.datasets.dataset import AudioClassificationDataset + +class CustomDataset(AudioClassificationDataset): + meta_file = '/PATH/TO/META_FILE.txt' + # List all the class labels + label_list = [ + 'cat', + 'dog', + ] + + def __init__(self, **kwargs): + files, labels = self._get_data() + super(CustomDataset, self).__init__( + files=files, labels=labels, feat_type='raw', **kwargs) + + def _get_data(self): + ''' + This method offer information of wave files and labels. + ''' + files = [] + labels = [] + + with open(self.meta_file) as f: + for line in f: + file, label_str = line.strip().split(' ') + files.append(file) + labels.append(self.label_list.index(label_str)) + + return files, labels +``` + +Then you can build dataset and data loader from `CustomDataset`: +```python +import paddle +from paddleaudio.features import LogMelSpectrogram + +from custom_dataset import CustomDataset + +# Feature config should be align with pretrained model +sample_rate = 32000 +feat_conf = { + 'sr': sample_rate, + 'n_fft': 1024, + 'hop_length': 320, + 'window': 'hann', + 'win_length': 1024, + 'f_min': 50.0, + 'f_max': 14000.0, + 'n_mels': 64, +} + +train_ds = CustomDataset(sample_rate=sample_rate) +feature_extractor = LogMelSpectrogram(**feat_conf) + +train_sampler = paddle.io.DistributedBatchSampler( + train_ds, batch_size=4, shuffle=True, drop_last=False) +train_loader = paddle.io.DataLoader( + train_ds, + batch_sampler=train_sampler, + return_list=True, + use_buffer_reader=True) +``` + +Train model with `CustomDataset`: +```python +from paddlespeech.cls.models import cnn14 +from paddlespeech.cls.models import SoundClassifier + +backbone = cnn14(pretrained=True, extract_embedding=True) +model = SoundClassifier(backbone, num_class=len(train_ds.label_list)) +optimizer = paddle.optimizer.Adam( + learning_rate=1e-6, parameters=model.parameters()) +criterion = paddle.nn.loss.CrossEntropyLoss() + +steps_per_epoch = len(train_sampler) +epochs = 10 +for epoch in range(1, epochs + 1): + model.train() + + for batch_idx, batch in enumerate(train_loader): + waveforms, labels = batch + # Need a padding when lengths of waveforms differ in a batch. 
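        # One illustrative way to do that (not part of the original snippet): pad each
        # clip with zeros to the longest length in the batch, e.g. in the dataset's
        # __getitem__ or a custom collate_fn, before stacking:
        #     max_len = max(w.shape[-1] for w in waveforms)
        #     w = paddle.concat([w, paddle.zeros([max_len - w.shape[-1]], dtype=w.dtype)])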
+        feats = feature_extractor(waveforms)
+        feats = paddle.transpose(feats, [0, 2, 1])
+        logits = model(feats)
+        loss = criterion(logits, labels)
+        loss.backward()
+        optimizer.step()
+        if isinstance(optimizer._learning_rate,
+                      paddle.optimizer.lr.LRScheduler):
+            optimizer._learning_rate.step()
+        optimizer.clear_grad()
+
+        # Calculate loss
+        avg_loss = loss.numpy()[0]
+
+        # Calculate metrics
+        preds = paddle.argmax(logits, axis=1)
+        num_corrects = (preds == labels).numpy().sum()
+        num_samples = feats.shape[0]
+
+        avg_acc = num_corrects / num_samples
+
+        print_msg = 'Epoch={}/{}, Step={}/{}'.format(
+            epoch, epochs, batch_idx + 1, steps_per_epoch)
+        print_msg += ' loss={:.4f}'.format(avg_loss)
+        print_msg += ' acc={:.4f}'.format(avg_acc)
+        print_msg += ' lr={:.6f}'.format(optimizer.get_lr())
+        print(print_msg)
+```
+
+If you want to save the model checkpoint and evaluate on a specific dataset, please see `paddlespeech/cli/exp/panns/train.py` for more details.
diff --git a/docs/source/cls/quick_start.md b/docs/source/cls/quick_start.md
new file mode 100644
index 000000000..e173255cf
--- /dev/null
+++ b/docs/source/cls/quick_start.md
@@ -0,0 +1,51 @@
+# Quick Start of Audio Classification
+Several shell scripts provided in `./examples/esc50/cls0` will help you quickly try the major modules, including data preparation, model training and model evaluation, with the [ESC50](https://github.com/karolpiczak/ESC-50) dataset.
+
+Some of the scripts in `./examples` are not configured with GPUs. If you want to train with 8 GPUs, please modify `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead.
+
+Let's start an audio classification task with the following steps:
+
+- Go to the directory
+
+  ```bash
+  cd examples/esc50/cls0
+  ```
+
+- Source env
+  ```bash
+  source path.sh
+  ```
+
+- Main entry point
+  ```bash
+  CUDA_VISIBLE_DEVICES=0 ./run.sh 1
+  ```
+
+This demo includes fine-tuning, evaluating and deploying an audio classification model. More detailed information is provided in the following sections.
+
+## Fine-tuning a model
+PANNs([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are models pretrained with [Audioset](https://research.google.com/audioset/). They can be easily used to extract audio embeddings for the audio classification task.
+
+To start fine-tuning a model, please run:
+```bash
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+feat_backend=numpy
+./local/train.sh ${ngpu} ${feat_backend}
+```
+
+## Deploy a model
+Once you have saved a model checkpoint, you can export it to a static graph and deploy it with Python scripts:
+
+- Export to a static graph
+  ```bash
+  ./local/export.sh ${ckpt_dir} ./export
+  ```
+  The argument `ckpt_dir` should be a directory in which a model checkpoint is stored, for example `checkpoint/epoch_50`.
+
+  The static graph will be exported to `./export`.
+
+- Inference
+  ```bash
+  ./local/static_model_infer.sh ${infer_device} ./export ${audio_file}
+  ```
+  The argument `infer_device` can be `cpu` or `gpu`, and it specifies which device to use for inference. `audio_file` should be a WAV file named `*.wav`.
diff --git a/docs/source/install.md b/docs/source/install.md
index 850847772..bdeb37cec 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -6,16 +6,17 @@ There are 3 ways to use `PaddleSpeech`.
According to the degree of difficulty, t |:---- |:----------------------------------------------------------- |:----| | Easy | (1) Use command-line functions of PaddleSpeech.
(2) Experience PaddleSpeech on Ai Studio. | Linux, Mac(not support M1 chip),Windows | | Medium | Support major functions ,such as using the` ready-made `examples and using PaddleSpeech to train your model. | Linux | -| Hard | Support full function of Paddlespeech,including training n-gram language model, Montreal-Forced-Aligner, and so on. And you are more able to be a developer! | Ubuntu | +| Hard | Support full function of Paddlespeech, including using join ctc decoder with kaldi, training n-gram language model, Montreal-Forced-Aligner, and so on. And you are more able to be a developer! | Ubuntu | ## Prerequisites - Python >= 3.7 - PaddlePaddle latest version (please refer to the [Installation Guide] (https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - C++ compilation environment - Hip: For Linux and Mac, do not use command `sh` instead of command `bash` in installation document. +- Hip: We recommand you to install `paddlepaddle` from https://mirror.baidu.com/pypi/simple and install `paddlespeech` from https://pypi.tuna.tsinghua.edu.cn/simple. ## Easy: Get the Basic Function (Support Linux, Mac, and Windows) -- If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step tutorial for `PaddleSpeech` and you can use the basic function of `PaddleSpeech` with a free machine. +- If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use the basic function of `PaddleSpeech` with a free machine. - If you want to use the command line function of Paddlespeech, you need to complete the following steps to install `PaddleSpeech`. For more information about how to use the command line function, you can see the [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli). ### Install Conda Conda is a management system of the environment. You can go to [minicoda](https://docs.conda.io/en/latest/miniconda.html) (select a version py>=3.7) to download and install the conda. @@ -29,6 +30,10 @@ conda install -y -c conda-forge sox libsndfile bzip2 #### Windows You need to install `Visual Studio` to make the C++ compilation environment. +https://visualstudio.microsoft.com/visual-cpp-build-tools/ + +You can also see [#1195](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1195) for more help. 
+ #### Mac ```bash brew install gcc @@ -47,10 +52,19 @@ sudo apt install build-essential conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` ### Install PaddleSpeech -You can use the following command: +Some users may fail to install `kaldiio` due to the default download source, you can install `pytest-runner` at first; +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +Then you can use the following commands: ```bash -pip install paddlepaddle paddlespeech +pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` +> If you encounter problem with downloading **nltk_data** while using paddlespeech, it maybe due to your poor network, we suggest you download the [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) provided by us, and extract it to your `${HOME}`. + +> If you fail to install paddlespeech-ctcdecoders, it doesn't matter. + ## Medium: Get the Major Functions (Support Linux) If you want to get the major function of `paddlespeech`, you need to do following steps: ### Git clone PaddleSpeech @@ -105,13 +119,15 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ### Install PaddlePaddle You can choose the `PaddlePaddle` version based on your system. For example, for CUDA 10.2, CuDNN7.5 install paddlepaddle-gpu 2.2.0: ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### Install PaddleSpeech You can install `paddlespeech` by the following command,then you can use the `ready-made` examples in `paddlespeech` : ```bash +# Some users may fail to install `kaldiio` due to the default download source, you can install `pytest-runner` at first; +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple # Make sure you are in the root directory of PaddleSpeech -pip install . +pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ## Hard: Get the Full Function (Support Ubuntu) @@ -175,14 +191,17 @@ conda activate tools/venv conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc ``` ### Install PaddlePaddle +Some users may fail to install `kaldiio` due to the default download source, you can install `pytest-runner` at first; +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.5 install paddle 2.2.0: - ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### Install PaddleSpeech in Developing Mode ```bash -pip install -e .[develop] +pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ### Install the Kaldi (Optional) ```bash diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md index 3ffe371d9..55fef93d5 100644 --- a/docs/source/install_cn.md +++ b/docs/source/install_cn.md @@ -5,16 +5,18 @@ | :--- | :----------------------------------------------------------- | :------------------ | | 简单 | (1) 使用 PaddleSpeech 的命令行功能.
(2) 在 Aistudio上体验 PaddleSpeech. | Linux, Mac(不支持M1芯片),Windows | | 中等 | 支持 PaddleSpeech 主要功能,比如使用已有 examples 中的模型和使用 PaddleSpeech 来训练自己的模型. | Linux | -| 困难 | 支持 PaddleSpeech 的各项功能,包含训练语言模型,使用强制对齐等。并且你更能成为一名开发者! | Ubuntu | +| 困难 | 支持 PaddleSpeech 的各项功能,包含结合kaldi使用 join ctc decoder 方式解码,训练语言模型,使用强制对齐等。并且你更能成为一名开发者! | Ubuntu | ## 先决条件 - Python >= 3.7 - 最新版本的 PaddlePaddle (请看 [安装向导](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html)) - C++ 编译环境 - 提示: 对于 Linux 和 Mac,请不要使用 `sh` 代替安装文档中的 `bash` +- 提示: 我们建议在安装 `paddlepaddle` 的时候使用百度源 https://mirror.baidu.com/pypi/simple ,而在安装 `paddlespeech` 的时候使用清华源 https://pypi.tuna.tsinghua.edu.cn/simple 。 + ## 简单: 获取基本功能(支持 Linux,Mac 和 Windows) -- 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目。我们建议你 体验一下[AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在AI Studio上面建立了一个让你一步一步运行体验来使用`PaddleSpeech`的教程。 +- 如果你是一个刚刚接触 `PaddleSpeech` 的新人并且想要很方便地体验一下该项目。我们建议你体验一下 [AI Studio](https://aistudio.baidu.com/aistudio/index)。我们在 AI Studio上面建立了一个让你一步一步运行体验来使用 `PaddleSpeech` 的[教程](https://aistudio.baidu.com/aistudio/education/group/info/25130)。 - 如果你想使用 `PaddleSpeech` 的命令行功能,你需要跟随下面的步骤来安装 `PaddleSpeech`。如果你想了解更多关于使用 `PaddleSpeech` 命令行功能的信息,你可以参考 [cli](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/cli)。 -### 安装Conda +### 安装 Conda Conda是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。 然后你需要安装 `paddlespeech` 的 conda 依赖: ```bash @@ -24,6 +26,11 @@ conda install -y -c conda-forge sox libsndfile bzip2 (如果你系统上已经安装了 C++ 编译环境,请忽略这一步。) #### Windows 对于 Windows 系统,需要安装 `Visual Studio` 来完成 C++ 编译环境的安装。 + +https://visualstudio.microsoft.com/visual-cpp-build-tools/ + +你可以前往讨论区[#1195](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1195)获取更多帮助。 + #### Mac ```bash brew install gcc @@ -42,19 +49,27 @@ sudo apt install build-essential conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` ### 安装 PaddleSpeech -你可以使用如下命令: +部分用户系统由于默认源的问题,安装中会出现kaldiio安转出错的问题,建议首先安装pytest-runner: +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +然后你可以使用如下命令: ```bash -pip install paddlepaddle paddlespeech +pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple +pip install paddlespeech -i https://pypi.tuna.tsinghua.edu.cn/simple ``` +> 如果您在使用 paddlespeech 的过程中遇到关于下载 **nltk_data** 的问题,可能是您的网络不佳,我们建议您下载我们提供的 [nltk_data](https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz) 并解压缩到您的 `${HOME}` 目录下。 + +> 如果出现 paddlespeech-ctcdecoders 无法安装的问题,无须担心,这不影响使用。 + ## 中等: 获取主要功能(支持 Linux) -如果你想要使用` paddlespeech` 的主要功能。你需要完成以下几个步骤 +如果你想要使用 `paddlespeech` 的主要功能。你需要完成以下几个步骤 ### Git clone PaddleSpeech -你需要先git clone本仓库 +你需要先 git clone 本仓库 ```bash git clone https://github.com/PaddlePaddle/PaddleSpeech.git cd PaddleSpeech ``` - ### 安装 Conda Conda 是一个包管理的环境。你可以前往 [minicoda](https://docs.conda.io/en/latest/miniconda.html) 去下载并安装 conda(请下载 py>=3.7 的版本)。你可以尝试自己安装,或者使用以下的命令: ```bash @@ -98,12 +113,15 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ### 安装 PaddlePaddle 你可以根据系统配置选择 PaddlePaddle 版本,例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0: ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### 安装 PaddleSpeech 最后安装 `paddlespeech`,这样你就可以使用 `paddlespeech`中已有的 examples: ```bash -pip install . 
+# 部分用户系统由于默认源的问题,安装中会出现kaldiio安转出错的问题,建议首先安装pytest-runner: +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +# 请确保目前处于PaddleSpeech项目的根目录 +pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ## 困难: 获取所有功能(支持 Ubuntu) ### 先决条件 @@ -164,11 +182,16 @@ conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc ### 安装 PaddlePaddle 请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.5 ,你可以安装 paddlepaddle-gpu 2.2.0: ```bash -python3 -m pip install paddlepaddle-gpu==2.2.0 +python3 -m pip install paddlepaddle-gpu==2.2.0 -i https://mirror.baidu.com/pypi/simple ``` ### 用开发者模式安装 PaddleSpeech +部分用户系统由于默认源的问题,安装中会出现kaldiio安转出错的问题,建议首先安装pytest-runner: +```bash +pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple +``` +然后安装 PaddleSpeech: ```bash -pip install -e .[develop] +pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ### 安装 Kaldi(可选) ```bash diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 91ef6d166..3310bfb23 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -4,34 +4,32 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link -:-------------:| :------------:| :-----: | -----: | :----------------- |:--------- | :---------- | :--------- | :----------- -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) -[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) -[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) -[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) -[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0538 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) -[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/conformer.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1) -[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech 
ASR1](../../example/librispeech/asr1) -[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2) +:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) +[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) +[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) +[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) +[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1) +[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1) +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2) ### Language Model based on NGram Language Model | Training Data | Token-based | Size | Descriptions -:-------------:| :------------:| :-----: | -----: | :----------------- +:------------:| :------------:|:------------: | :------------: | :------------: [English LM](https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm) | [CommonCrawl(en.00)](http://web-language-models.s3-website-us-east-1.amazonaws.com/ngrams/en/deduped/en.00.deduped.xz) | Word-based | 8.3 GB | Pruned with 0 1 1 1 1;
About 1.85 billion n-grams;
'trie' binary with '-a 22 -q 8 -b 8' [Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4;
About 0.13 billion n-grams;
'probing' binary with default settings [Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning;
About 3.7 billion n-grams;
'probing' binary with default settings ### Speech Translation Models -| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | -| ------------------------------------------------------------ | ------------- | ----------- | ---- | ------------------------------------------------------------ | ----- | ------------------------------------------------------------ | -| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | Ted-En-Zh | Spm | | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | - +| Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | +| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| (only for CLI)[Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz) | Ted-En-Zh| Spm| | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | ## Text-to-Speech Models ### Acoustic Models -Model Type | Dataset| Example Link | Pretrained Models|Static Models|Siize(static) +Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (static) :-------------:| :------------:| :-----: | :-----:| :-----:| :-----: Tacotron2|LJSpeech|[tacotron2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.3.zip)||| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| @@ -43,14 +41,16 @@ FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/Pa FastSpeech2| VCTK |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_nosil_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip)||| ### Vocoders -Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size(static) -:-------------:| :------------:| :-----: | :-----:| :-----:| :-----: +Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (static) +:-----:| :-----:| :-----: | :-----:| :-----:| :-----: WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| -|Multi Band MelGAN |CSMSC|[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip)
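An acoustic model from the table above is only half of the synthesis pipeline: as the updated `synthesize_e2e.py` invocations earlier in this diff show (`--am=fastspeech2_csmsc`, `--voc=pwgan_csmsc`), it is paired with one of the vocoders listed next. A rough Python sketch of such a pairing through the CLI follows; the `TTSExecutor` name and its keyword arguments are assumptions, not an interface confirmed by this patch.

```python
# Hedged sketch: TTSExecutor and its arguments are assumptions patterned on the other
# CLI executors; the am/voc tags mirror the flags passed to synthesize_e2e.py above.
import paddle
from paddlespeech.cli import TTSExecutor  # assumed import path

tts_executor = TTSExecutor()
wav_file = tts_executor(
    text='你好,欢迎使用语音合成。',  # placeholder input text
    am='fastspeech2_csmsc',          # acoustic model tag (assumed naming)
    voc='pwgan_csmsc',               # vocoder tag (assumed naming)
    lang='zh',
    output='output.wav',
    device=paddle.get_device())
print(wav_file)
```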
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) |8.2MB| +|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| +Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | +HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models @@ -64,14 +64,18 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/ Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: -PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams),[panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams),[panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) -PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[panns_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz), [panns_cnn10](https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz), [panns_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz) +PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) +PANN | ESC-50 |[pann-esc50]("./examples/esc50/cls0")|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz) +## Punctuation Restoration Models +Model Type | Dataset| Example Link | Pretrained Models +:-------------:| :------------:| :-----: | :-----: +Ernie Linear | IWLST2012_zh |[iwslt2012_punc0](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/iwslt2012/punc0)|[ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip) ## Speech Recognition Model from paddle 1.8 -| Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | -| :----------------------------------------------------------: | :----------------------------: | :---------: | -----: | :------------------------------------------------- | :----- | :----- | :-------------- | -| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | - | 151 h | -| [Ds2 Offline Librispeech 
model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | - | 0.0685 | 960 h | -| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers | - | 0.0541 | 8628 h | +| Acoustic Model |Training Data| Token-based | Size | Descriptions | CER | WER | Hours of speech | +| :-----:| :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | +| [Ds2 Offline Aishell model](https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz) | Aishell Dataset | Char-based | 234 MB | 2 Conv + 3 bidirectional GRU layers | 0.0804 | — | 151 h | +| [Ds2 Offline Librispeech model](https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz) | Librispeech Dataset | Word-based | 307 MB | 2 Conv + 3 bidirectional sharing weight RNN layers | — | 0.0685 | 960 h | +| [Ds2 Offline Baidu en8k model](https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz) | Baidu Internal English Dataset | Word-based | 273 MB | 2 Conv + 3 bidirectional GRU layers |— | 0.0541 | 8628 h| diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md new file mode 100644 index 000000000..2b35b8852 --- /dev/null +++ b/docs/source/tts/tts_papers.md @@ -0,0 +1,42 @@ +# TTS Papers +## Text Frontend +### Polyphone +- [【g2pM】g2pM: A Neural Grapheme-to-Phoneme Conversion Package for Mandarin Chinese Based on a New Open Benchmark Dataset](https://arxiv.org/abs/2004.03136) +- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf) +### Text Normalization +#### English +- [applenob/text_normalization](https://github.com/applenob/text_normalization) +### G2P +#### English +- [cmusphinx/g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq) + +## Acoustic Models +- [【AdaSpeech3】AdaSpeech 3: Adaptive Text to Speech for Spontaneous Style](https://arxiv.org/abs/2107.02530) +- [【AdaSpeech2】AdaSpeech 2: Adaptive Text to Speech with Untranscribed Data](https://arxiv.org/abs/2104.09715) +- [【AdaSpeech】AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/abs/2103.00993) +- [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558) +- [【FastPitch】FastPitch: Parallel Text-to-speech with Pitch Prediction](https://arxiv.org/abs/2006.06873) +- [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802) +- [【FastSpeech】FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263) +- [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895) +- [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884) + +## Vocoders +- [【RefineGAN】RefineGAN: Universally Generating Waveform Better than Ground Truth with Highly Accurate Pitch and Intensity Responses](https://arxiv.org/abs/2111.00962) +- [【Fre-GAN】Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297) +- [【StyleMelGAN】StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization](https://arxiv.org/abs/2011.01557) +- 
[【Multi-band MelGAN】Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106) +- [【HiFi-GAN】HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646) +- [【VocGAN】VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network](https://arxiv.org/abs/2007.15256) +- [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480) +- [【MelGAN】MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis](https://arxiv.org/abs/1910.06711) +- [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219) +- [【LPCNet】LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://arxiv.org/abs/1810.11846) +- [【WaveRNN】Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435) +## GAN TTS + +- [【GAN TTS】High Fidelity Speech Synthesis with Adversarial Networks](https://arxiv.org/abs/1909.11646) + +## Voice Cloning +- [【SV2TTS】Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/abs/1806.04558) +- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467) diff --git a/docs/topic/ctc/ctc_loss_compare.ipynb b/docs/topic/ctc/ctc_loss_compare.ipynb new file mode 100644 index 000000000..95b2af508 --- /dev/null +++ b/docs/topic/ctc/ctc_loss_compare.ipynb @@ -0,0 +1,520 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ff6ff1e0", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "33af5f76", + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9b566b73", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'warp-ctc'...\n", + "remote: Enumerating objects: 829, done.\u001b[K\n", + "remote: Total 829 (delta 0), reused 0 (delta 0), pack-reused 829\u001b[K\n", + "Receiving objects: 100% (829/829), 388.85 KiB | 140.00 KiB/s, done.\n", + "Resolving deltas: 100% (419/419), done.\n", + "Checking connectivity... 
done.\n" + ] + } + ], + "source": [ + "!git clone https://github.com/SeanNaren/warp-ctc.git" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4a087a09", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n" + ] + } + ], + "source": [ + "%cd warp-ctc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f55dc29a", + "metadata": {}, + "outputs": [], + "source": [ + "mkdir -p build" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fe79f4cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n" + ] + } + ], + "source": [ + "cd build" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3d25c718", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- The C compiler identification is GNU 5.4.0\n", + "-- The CXX compiler identification is GNU 5.4.0\n", + "-- Check for working C compiler: /usr/bin/cc\n", + "-- Check for working C compiler: /usr/bin/cc -- works\n", + "-- Detecting C compiler ABI info\n", + "-- Detecting C compiler ABI info - done\n", + "-- Detecting C compile features\n", + "-- Detecting C compile features - done\n", + "-- Check for working CXX compiler: /usr/bin/c++\n", + "-- Check for working CXX compiler: /usr/bin/c++ -- works\n", + "-- Detecting CXX compiler ABI info\n", + "-- Detecting CXX compiler ABI info - done\n", + "-- Detecting CXX compile features\n", + "-- Detecting CXX compile features - done\n", + "-- Looking for pthread.h\n", + "-- Looking for pthread.h - found\n", + "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n", + "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed\n", + "-- Looking for pthread_create in pthreads\n", + "-- Looking for pthread_create in pthreads - not found\n", + "-- Looking for pthread_create in pthread\n", + "-- Looking for pthread_create in pthread - found\n", + "-- Found Threads: TRUE \n", + "-- Found CUDA: /usr/local/cuda (found suitable version \"10.2\", minimum required is \"6.5\") \n", + "-- cuda found TRUE\n", + "-- Building shared library with GPU support\n", + "-- Configuring done\n", + "-- Generating done\n", + "-- Build files have been written to: /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n" + ] + } + ], + "source": [ + "!cmake .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7a4238f1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 11%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_reduce.cu.o\u001b[0m\n", + "[ 22%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/warpctc.dir/src/warpctc_generated_ctc_entrypoint.cu.o\u001b[0m\n", + "\u001b[35m\u001b[1mScanning dependencies of target warpctc\u001b[0m\n", + "[ 33%] \u001b[32m\u001b[1mLinking CXX shared library libwarpctc.so\u001b[0m\n", + "[ 33%] Built target warpctc\n", + "[ 44%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/test_gpu.dir/tests/test_gpu_generated_test_gpu.cu.o\u001b[0m\n", + "\u001b[35m\u001b[1mScanning dependencies of target test_cpu\u001b[0m\n", + "[ 55%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/test_cpu.cpp.o\u001b[0m\n", + "[ 66%] \u001b[32mBuilding CXX object CMakeFiles/test_cpu.dir/tests/random.cpp.o\u001b[0m\n", + "[ 77%] \u001b[32m\u001b[1mLinking CXX executable test_cpu\u001b[0m\n", + "[ 77%] Built target test_cpu\n", + "\u001b[35m\u001b[1mScanning dependencies of target test_gpu\u001b[0m\n", + "[ 88%] \u001b[32mBuilding CXX object CMakeFiles/test_gpu.dir/tests/random.cpp.o\u001b[0m\n", + "[100%] \u001b[32m\u001b[1mLinking CXX executable test_gpu\u001b[0m\n", + "[100%] Built target test_gpu\n" + ] + } + ], + "source": [ + "!make -j" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "31761a31", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n" + ] + } + ], + "source": [ + "cd .." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f53316f6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding\n" + ] + } + ], + "source": [ + "cd pytorch_binding" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "084f1e49", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "running install\n", + "running bdist_egg\n", + "running egg_info\n", + "creating warpctc_pytorch.egg-info\n", + "writing warpctc_pytorch.egg-info/PKG-INFO\n", + "writing dependency_links to warpctc_pytorch.egg-info/dependency_links.txt\n", + "writing top-level names to warpctc_pytorch.egg-info/top_level.txt\n", + "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n", + "writing manifest file 'warpctc_pytorch.egg-info/SOURCES.txt'\n", + "installing library code to build/bdist.linux-x86_64/egg\n", + "running install_lib\n", + "running build_py\n", + "creating build\n", + "creating build/lib.linux-x86_64-3.9\n", + "creating build/lib.linux-x86_64-3.9/warpctc_pytorch\n", + "copying warpctc_pytorch/__init__.py -> build/lib.linux-x86_64-3.9/warpctc_pytorch\n", + "running build_ext\n", + "building 'warpctc_pytorch._warp_ctc' extension\n", + "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9\n", + "creating /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src\n", + "Emitting ninja build file /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/build.ninja...\n", + "Compiling objects...\n", + "Allowing ninja to set a default 
number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", + "[1/1] c++ -MMD -MF /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o.d -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -O2 -isystem /workspace/zhanghui/DeepSpeech-2.x/tools/venv/include -fPIC -I/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/TH -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/include/THC -I/usr/local/cuda/include -I/workspace/zhanghui/DeepSpeech-2.x/tools/venv/include/python3.9 -c -c /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/src/binding.cpp -o /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -std=c++14 -fPIC -DWARPCTC_ENABLE_GPU -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE=\"_gcc\"' '-DPYBIND11_STDLIB=\"_libstdcpp\"' '-DPYBIND11_BUILD_ABI=\"_cxxabi1011\"' -DTORCH_EXTENSION_NAME=_warp_ctc -D_GLIBCXX_USE_CXX11_ABI=0\n", + "g++ -pthread -B /workspace/zhanghui/DeepSpeech-2.x/tools/venv/compiler_compat -Wl,--sysroot=/ -shared -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -Wl,-rpath-link,/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib /workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/pytorch_binding/build/temp.linux-x86_64-3.9/src/binding.o -L/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build -L/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/torch/lib -L/usr/local/cuda/lib64 -lwarpctc -lc10 -ltorch -ltorch_cpu -ltorch_python -lcudart -lc10_cuda -ltorch_cuda -o build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -Wl,-rpath,/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc/build\n", + "creating build/bdist.linux-x86_64\n", + "creating build/bdist.linux-x86_64/egg\n", + "creating build/bdist.linux-x86_64/egg/warpctc_pytorch\n", + "copying build/lib.linux-x86_64-3.9/warpctc_pytorch/__init__.py -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n", + "copying build/lib.linux-x86_64-3.9/warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg/warpctc_pytorch\n", + "byte-compiling build/bdist.linux-x86_64/egg/warpctc_pytorch/__init__.py to __init__.cpython-39.pyc\n", + "creating stub loader for warpctc_pytorch/_warp_ctc.cpython-39-x86_64-linux-gnu.so\n", + "byte-compiling build/bdist.linux-x86_64/egg/warpctc_pytorch/_warp_ctc.py to _warp_ctc.cpython-39.pyc\n", + "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/dependency_links.txt -> 
build/bdist.linux-x86_64/egg/EGG-INFO\n", + "copying warpctc_pytorch.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", + "writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt\n", + "zip_safe flag not set; analyzing archive contents...\n", + "warpctc_pytorch.__pycache__._warp_ctc.cpython-39: module references __file__\n", + "creating dist\n", + "creating 'dist/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", + "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", + "Processing warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", + "removing '/workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg' (and everything under it)\n", + "creating /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", + "Extracting warpctc_pytorch-0.1-py3.9-linux-x86_64.egg to /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages\n", + "warpctc-pytorch 0.1 is already the active version in easy-install.pth\n", + "\n", + "Installed /workspace/zhanghui/DeepSpeech-2.x/tools/venv/lib/python3.9/site-packages/warpctc_pytorch-0.1-py3.9-linux-x86_64.egg\n", + "Processing dependencies for warpctc-pytorch==0.1\n", + "Finished processing dependencies for warpctc-pytorch==0.1\n" + ] + } + ], + "source": [ + "!python setup.py install" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ee4ca9e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python 3.9.5\r\n" + ] + } + ], + "source": [ + "!python -V" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "59255ed8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/zhanghui/DeepSpeech-2.x/docs/topic/ctc/warp-ctc\n" + ] + } + ], + "source": [ + "cd .." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1dae09b9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "grep: warning: GREP_OPTIONS is deprecated; please use an alias or script\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import warpctc_pytorch as wp\n", + "import paddle.nn as pn\n", + "import paddle" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "83d0762e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.10.0+cu102'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "62501e2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.2.0'" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "paddle.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "9e8e0f40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 1, 5])\n", + "2.4628584384918213\n", + "[[[ 0.17703122 -0.70812464 0.17703122 0.17703122 0.17703122]]\n", + "\n", + " [[ 0.17703122 0.17703122 -0.70812464 0.17703122 0.17703122]]]\n" + ] + } + ], + "source": [ + "probs = torch.FloatTensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", + " ]]).transpose(0, 1).contiguous()\n", + "print(probs.size())\n", + "labels = torch.IntTensor([1, 2])\n", + "label_sizes = torch.IntTensor([2])\n", + "probs_sizes = torch.IntTensor([2])\n", + "probs.requires_grad_(True)\n", + "bs = probs.size(1)\n", + "\n", + "ctc_loss = wp.CTCLoss(size_average=False, length_average=False)\n", + "cost = ctc_loss(probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2cd46569", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.4628584384918213\n", + "[[[ 0.1770312 -0.7081248 0.1770312 0.1770312 0.1770312]]\n", + "\n", + " [[ 0.1770312 0.1770312 -0.7081248 0.1770312 0.1770312]]]\n" + ] + } + ], + "source": [ + "probs = torch.FloatTensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]\n", + " ]]).transpose(0, 1).contiguous()\n", + "labels = torch.IntTensor([1, 2])\n", + "label_sizes = torch.IntTensor([2])\n", + "probs_sizes = torch.IntTensor([2])\n", + "probs.requires_grad_(True)\n", + "bs = probs.size(1)\n", + "\n", + "log_probs = torch.log_softmax(probs, axis=-1)\n", + "\n", + "ctc_loss1 = nn.CTCLoss(reduction='none')\n", + "cost = ctc_loss1(log_probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "85c3461a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2, 1, 5]\n", + "[1, 2]\n", + "2.4628584384918213\n", + "[[[ 0.17703122 -0.70812464 0.17703122 0.17703122 0.17703122]]\n", + "\n", + " [[ 0.17703122 0.17703122 -0.70812464 0.17703122 0.17703122]]]\n" + ] + } + ], + "source": [ + "paddle.set_device('cpu')\n", + "probs = paddle.to_tensor([[\n", + " [0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1],\n", + " 
]]).transpose([1,0,2])\n", + "print(probs.shape) # (T, B, D)\n", + "labels = paddle.to_tensor([[1, 2]], dtype='int32') #(B,L)\n", + "print(labels.shape)\n", + "label_sizes = paddle.to_tensor([2], dtype='int64')\n", + "probs_sizes = paddle.to_tensor([2], dtype='int64')\n", + "bs = paddle.shape(probs)[1]\n", + "probs.stop_gradient=False\n", + "\n", + "ctc_loss = pn.CTCLoss(reduction='none')\n", + "cost = ctc_loss(probs, labels, probs_sizes, label_sizes)\n", + "cost = cost.sum() / bs\n", + "print(cost.item())\n", + "cost.backward()\n", + "print(probs.grad.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d390cd91", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/topic/frontend/g2p.md b/docs/topic/frontend/g2p.md new file mode 100644 index 000000000..7713420a1 --- /dev/null +++ b/docs/topic/frontend/g2p.md @@ -0,0 +1,174 @@ +# g2p 字典设计 + +本文主要讲语音合成的 g2p (grapheme to phoneme) 部分。 + +代码: [generate_lexicon.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/generate_lexicon.py) (代码可能与此处的描述有些许出入,以代码为准,生成的带 tone 带儿化的 pinyin 字典参考 [simple.lexicon](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/csmsc/tts3/local/simple.lexicon)) + +## ARPAbet +对于英文 TTS,常用的 g2p 是通过查询 CMUDict 来实现,而 CMUDict 注音使用的系统是 ARPAbet,具体含义参见 [CMU 发音词典](http://www.speech.cs.cmu.edu/cgi-bin/cmudict/)。 + +它包含 39 个 phoneme, 不包含音词汇重音的变体: + +| Phoneme | Example | Translation | +|:-------------:|:-------:|:-----------:| +| AA | odd | AA D | +| AE | at | AE T | +| AH | hut | HH AH T | +| AO | ought | AO T | +| AW | cow | K AW | +| AY | hide | HH AY D | +| B | be | B IY | +| CH | cheese | CH IY Z | +| D | dee | D IY | +| DH | thee | DH IY | +| EH | Ed | EH D | +| ER | hurt | HH ER T | +| EY | ate | EY T | +| F | fee | F IY | +| G | green | G R IY N | +| HH | he | HH IY | +| IH | it | IH T | +| IY | eat | IY T | +| JH | gee | JH IY | +| K | key | K IY | +| L | lee | L IY | +| M | me | M IY | +| N | knee | N IY | +| NG | ping | P IH NG | +| OW | oat | OW T | +| OY | toy | T OY | +| P | pee | P IY | +| R | read | R IY D | +| S | sea | S IY | +| SH | she | SH IY | +| T | tea | T IY | +| TH | theta | TH EY T AH| +| UH | hood | HH UH D | +| UW | two | T UW | +| V | vee | V IY | +| W | we | W IY | +| Y | yield | Y IY L D | +| Z | zee | Z IY | +| ZH | seizure| S IY ZH ER| + +另外还包含三个重音标记, + +0 — No stress +1 — Primary stress +2 — Secondary stress + +其中重音标记附在元音后面。当只需要音标而不需要重音标记的时候也可以直接省略。 + +CMUDict 只是一个词典,当出现了不在词典中的词时(OOV),可以求助其他工具可以根据拼写得到对应的发音,如: + - [Lexicon Tool](http://www.speech.cs.cmu.edu/tools) + - [g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq) + +## 中文注音系统 + +中文普通话的注音系统存在许多套,比如汉语拼音 (pinyin), 注音符号 (bopomofo), 国语注音符第二式, 威妥玛拼音等。而且有一些并非注音方案,是拉丁化方案,因此为了符号系统的经济性,会做一些互补符号的简并,比如汉语拼音中的 `i` 的代表了三个音位, `e` 代表了两个音位(单用的情况很少, 单用时写作 `ê`);也有一些简写,比如 `bpmf` 后的 `o` 是 `uo` 的简写, `ui` 是 `uei` 的简写,` iu` 是 `iou` 的简写, `un` 是 `uen` 的简写, `ao` 是为了书写避免形近而改掉的 `au`, `y` 和 `w` 是为了连续书写时作为分隔而产生的零声母, `ü` 在 `j`、 `q`、 `x` 后面省略两点(中国大陆使用美式键盘打字的时候,一般只有在“女”、 “律”、“略”和“虐”这一类的字里面用 `v` 代替 `ü`,而在 `j`、 `q`、 `x` 后面的时候则仍用 `u` ),有鼻韵母 `uang` 而没有 
`ueng`,但是又有 `weng` 这个音节之类的问题, 有 `ong` 韵母但是又没有单用的情形。其实这些都是汉语拼音作为拉丁化方案而做的一系列的修改。 + +另外,汉语的声调是用了特殊符号来标调型,用字母记录的时候常用 `12345` 或者 `1234`、轻音不标等手段。 + +另外还有两个比较突出的问题是**儿化**和**变调**(参考 [zh_text_frontend](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/tts/zh_text_frontend.md))。对于具体的数据集,也可能有不同的标注方案。一般我们为汉字标音是标字调而不标变调,但是**标贝数据集是标变调的**(但是也没有把所有的变调都正确标出来)。儿化在汉语书写和拼音中也是一个比较麻烦的事情,虽然正字法中说到可以用小字号的儿表示儿化,但是这种发音由字号这种排版要素来表达的手法未免过于崎岖,所以鲜见有人真的这么排版,只有在某些书籍中,强调此事的时候见过。另外,在儿化的标音方式上,鼻韵母需要去掉韵尾然后换成 r,这么一来,如果直接抽取拼音的字符串表示,那么可能出现的音节就会超过 1400, 甚至进入一种含糊的状态,不清楚一共有多少个有效音节,即使是韵母,也会因此扩展近一倍。 + +因为存在这样的情形,再考虑到不同的数据集自带的拼音 transcription 的风格可能不同,所以需要考虑进行转换,在内部转成统一的表示。既然这个过程是必要的,那么我们可以大胆设计一个内部方案。 + +这里设计的原则是: + +1. 有效符号集仅切分为声母和韵母,不作声母,介音,韵腹,韵尾的切分; + +2. 尽可能把不同的音用不同的符号表示,比如 `i` 的 `e` 会被拆分为 3 和 2 个符号, `u` 和 `ü` 开头的韵母分开,这是为了 TTS 系统的建议性考虑的,我们选择尽量反映语音的现实情况,而不把注音系统里面的奇怪规则留给模型去学习; + +3. 不包含零声母 `y`, `w`之类的形式上的符号,因为如果这些符号不发声或者发声极短,那么可以不加入音符序列中,以期待 attention 更对角; + +4. 声调和韵母不结合为一个符号,而是分开,这样可以**减少词汇量**,使得符号的 embedding 得到更充分的训练,也更能反映声调语言的特点(数据集少时推荐这么做); + +5. 儿化的标音方式采用拆分的方式处理, 但是增设一个特殊符号 `&r` 来表示儿化的 `r`,它和一般的 `er` 不同,以区分实际读音的区别。 + +6. 更加贴近注音符号,把 `in` 写作 `ien`,`ing` 写作 `ieng`, `un` 写作 `uen`, `ong` 写作 `ueng`, `iong` 写作 `üeng`。其中 `in` 和 `ing` 的转写纯属偏好,无论用什么符号写,都可以被转为一个 index, 只要它们的使用情况不发声变化就可以。而 `ong` 写作 `ueng` 则是有实际差别的,如果 `ong` 作为一个韵母,那么 `weng` 经过修改之后会变成 `ueng`, 就会同时有 `ueng` 和 `ong`。而如果不细究音值上的微妙差异,`ong` 就是 `ung` 的一种奇怪表示, 在注意符号中, 它就记作 `ㄨㄥ`。而 `iong` 则是 `ㄩㄥ`。 + +7. `ui`, `iu` 都展开为 `uei` 和 `iou` , 纯属偏好,对实际结果没有影响。`bpmf `后的 `o` 展开为 `uo`,这个则是为了和单独的 `o` 区分开(哦, 和波里面的韵母的发音其实不同)。 + +8. 所有的 `ü `都有 `v` 代替,无论是单独作韵母, 还是复韵母和鼻韵母。 + +9. 把停顿以 `#1` 等方式纳入其中, 把 `` `` `` `` 这些为了处理符号系列的特殊符号也加入其中,多一些特殊词汇并不会对 Embedding 产生什么影响。 + +于是我们可以的通过一套规则系统,把标贝的**拼音标注**转换成我们需要的形式。(当然,如果是别的数据集的实际标注不同,那么转换规则也要作一些修改) + +在实际使用中文数据集时,我们仅使用其提供的**拼音标注**,而不使用**音素标注**(PhoneLabel),因为不同的数据集有不同的标注规则,而且有的数据集是没有**音素标注**的(如,aishell3) + +我们的做法和维基百科上的汉语拼音音节列表更接近 [汉语拼音音节列表](https://zh.wikipedia.org/zh-hans/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E9%9F%B3%E8%8A%82%E5%88%97%E8%A1%A8) + +转换之后,符号列表是: + +声母基本没有什么争议,共 21 个: +|声母| +|:--:| +|b| +|p| +|m| +|f| +|d| +|t| +|n| +|l| +|g| +|k| +|h| +|j| +|q| +|x| +|zh| +|ch| +|sh| +|r| +|z| +|c| +|s| + +韵母和儿化韵尾(共 41个) +|韵母|解释| +|:----:|:-----------: | +|ii |`zi`,`ci`, `si` 里面的韵母 `i`| +|iii |`zhi`, `chi`, `shi`, `ri` 里面的韵母 `i`| +|a |啊,卡| +|o |哦| +|e |恶,个| +|ea |ê| +|ai |爱,在| +|ei |诶,薇| +|ao |奥,脑| +|ou |欧,勾| +|an |安,单| +|en |恩,痕| +|ang |盎,刚| +|eng |嗯,更| +|er |儿| +|i |一| +|ia |鸦,家| +|io |哟| +|ie |叶,界| +|iai |崖(台语发音)| +|iao |要,教| +|iou |有,久| +|ian |言,眠| +|ien |因,新| +|iang |样,降| +|ieng |英,晶 +|u |无,卢| +|ua |哇,瓜| +|uo |我,波| +|uai |外,怪| +|uei |位,贵| +|uan |万,乱| +|uen |问,论| +|uang |网,光| +|ueng |翁,共| +|v |玉,曲,`ü`| +|ve |月,却| +|van |源,倦| +|ven |韵,君| +|veng |永,炯| +|&r |儿化韵尾| diff --git a/docs/topic/gan_vocoder/gan_vocoder.ipynb b/docs/topic/gan_vocoder/gan_vocoder.ipynb index d214a81e2..edb4eeb1d 100644 --- a/docs/topic/gan_vocoder/gan_vocoder.ipynb +++ b/docs/topic/gan_vocoder/gan_vocoder.ipynb @@ -21,7 +21,11 @@ "|FB-RAWs|Filter Bank Random Window Discriminators|\n", "\n", "

\n", - "csmsc 数据集上 GAN Vocoder 整体对比\n", + "csmsc 数据集上 GAN Vocoder 整体对比如下, \n ", + "\n", + "测试机器:1 x Tesla V100-32G 40 core Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz\n ", + "\n", + "测试环境:Python 3.7.0, paddlepaddle 2.2.0\n", "\n", "Model|Date|Input|Generator
<br>Loss|Discriminator<br>Loss|Need<br>Finetune|Training<br>Steps|Finetune<br>Steps|Batch<br>Size|ips<br>(gen only)<br>(gen + dis)|Static Model<br>Size (gen)|RTF<br>
(GPU)|\n", ":-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|:-----:|\n", diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md new file mode 100644 index 000000000..96d6f5f4e --- /dev/null +++ b/docs/topic/package_release/python_package_release.md @@ -0,0 +1,173 @@ +# 简化安装与发包 + +## 问题: + +1. [如何去除 ubuntu 的 apt 安装依赖?](#conda-代替系统依赖) +2. [如何支持普通用户和开发者两种安装的需求,尽量减少普通用户所需的依赖?](#区分install模式和develop模式) +3. [如何进行 python 包的动态安装?](#python-包的动态安装) +4. [如何进行 python 项目编包?](#python-编包方法) +5. [发包前要有什么准备?](#关于发包前的准备工作) +6. [发 C++ 包需要注意的东西?](#manylinux) + + +## conda 代替系统依赖 + +conda 可以用来代替一些 apt-get 安装的系统依赖,这样可以让项目适用于除了 ubuntu 以外的系统。 + +使用 conda 可以安装 sox、 libsndfile、swig 等 paddlespeech 需要的依赖: + +```bash +conda install -y -c conda-forge sox libsndfile +``` + +部分系统会缺少 libbzip2 库,这个 paddlespeech 也是需要的,这也可以用 conda 安装: + +```bash +conda install -y -c bzip2 +``` + +conda 也可以安装 linux 的 C++ 的依赖: + +```bash +conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 +``` + +#### 剩余问题:使用 conda 环境编译 kenlm 失败。目前在 conda 环境下编译 kenlm 会出现链接失败的问题 + +目前知道需要的依赖: + +```bash +conda install -c conda-forge eigen boost cmake +``` + +## 区分install模式和develop模式 + +可以在 setup.py 中划分 install 的依赖(基本依赖)和 develop 的依赖 (开发者额外依赖)。 setup_info 中 `install_requires` 设置 install 的依赖,而在 `extras_require` 中设置 `develop` key 为 develop 的依赖。 +普通安装可以使用: + +```bash +pip install . +``` + +另外使用 pip 安装已发的包也是使用普通安装的: + +``` +pip install paddlespeech +``` + +而开发者可以使用如下方式安装,这样不仅会安装 install 的依赖,也会安装 develop 的依赖, 即:最后安装的依赖 = install 依赖 + develop 依赖: + +```bash +pip install -e .[develop] +``` + +## python 包的动态安装 + +可以使用 pip 包来实现动态安装: + +```python +import pip +if int(pip.__version__.split('.')[0]) > 9: + from pip._internal import main + else: + from pip import main + main(['install', package_name]) +``` + +## python 编包方法 + +#### 创建 pypi的账号 + +创建 pypi 账号 + +#### 下载 twine + +``` +pip install twine +``` + +#### python 编包 + +编写好 python 包的 setup.py, 然后使用如下命令编 wheel 包: + +```bash +python setup.py bdist_wheel +``` + +如果要编源码包,用如下命令: + +```bash +python setup.py sdist +``` + +#### 上传包 + +```bash +twine upload dist/wheel包 +``` + +输入账号和密码后就可以上传 wheel 包了 + +#### 关于python 包的发包信息 + +主要可以参考这个[文档](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/?highlight=find_packages) + + +## 关于发包前的准备工作 + +#### 拉分支 +在发包之前需要拉分支。例如需要发 0.1.0 版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。总体步骤可以总结为: + +- 用 develop 分支发 rc 包 +- rc 包通过后拉分支 +- 打 tag +- 发包 +- 编写 release note + + + +## ManyLinux + +为了让有 C++ 依赖的 pip wheel 包可以适用于更多的 linux 系统,需要降低其本身的 glibc 的依赖。这就需要让 pip wheel 包在 manylinux 的 docker 下编包。关于查看系统的 glibc 版本,可以使用命令:`ldd --version`。 + +### Manylinux + +关于 Manylinux,主要可以参考 Github 项目的说明[ github many linux](https://github.com/pypa/manylinux)。 +manylinux1 支持 Centos5以上, manylinux2010 支持 Centos 6 以上,manylinux2014 支持Centos 7 以上。 +目前使用 manylinux2010 基本可以满足所有的 linux 生产环境需求。(不建议使用manylinux1,系统较老,难度较大) + +### 拉取 manylinux2010 + +```bash +docker pull quay.io/pypa/manylinux1_x86_64 +``` + +### 使用 manylinux2010 + +启动 manylinux2010 docker。 + +```bash +docker run -it xxxxxx +``` + +在 manylinux2010 的docker环境自带 swig 和各种类型的 python 版本。这里注意不要自己下载 conda 来安装环境来编译 pip 包,要用 docker 本身的环境来编包。 +设置python: + +```bash +export PATH="/opt/python/cp37-cp37m/bin/:$PATH" +#export PATH="/opt/python/cp38-cp38/bin/:$PATH" +#export PATH="/opt/python/cp39-cp39/bin/:$PATH" +``` + +随后正常编包,编包后需要使用 
[auditwheel](https://github.com/pypa/auditwheel) 来降低编好的wheel包的版本。 +显示 wheel 包的 glibc 依赖版本 + +```bash +auditwheel show wheel包 +``` + +降低 wheel包的版本 + +```bash +auditwheel repair wheel包 +``` diff --git a/docs/tutorial/cls/cls_tutorial.ipynb b/docs/tutorial/cls/cls_tutorial.ipynb index 9b8bfc119..56b488adc 100644 --- a/docs/tutorial/cls/cls_tutorial.ipynb +++ b/docs/tutorial/cls/cls_tutorial.ipynb @@ -2,9 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "\"Fork\n", "\n", @@ -32,9 +30,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "%%HTML\n", @@ -45,9 +41,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 2. 音频和特征提取" ] @@ -55,9 +49,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# 环境准备:安装paddlespeech和paddleaudio\n", @@ -67,9 +59,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import warnings\n", @@ -82,9 +72,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "\n", "\n", @@ -98,9 +86,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "# 获取示例音频\n", @@ -111,9 +97,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio import load\n", @@ -130,9 +114,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "!paddlespeech cls --input ./dog.wav" @@ -140,9 +122,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 2.2 音频特征提取\n", "\n", @@ -162,21 +142,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import paddle\n", "import numpy as np\n", "\n", + "data, sr = load(file='./dog.wav', sr=32000, mono=True, dtype='float32')\n", "x = paddle.to_tensor(data)\n", "n_fft = 1024\n", "win_length = 1024\n", - "hop_length = 512\n", + "hop_length = 320\n", "\n", "# [D, T]\n", - "spectrogram = paddle.signal.stft(x, n_fft=1024, win_length=1024, hop_length=512, onesided=True) \n", + "spectrogram = paddle.signal.stft(x, n_fft=n_fft, win_length=win_length, hop_length=hop_length, onesided=True) \n", "print('spectrogram.shape: {}'.format(spectrogram.shape))\n", "print('spectrogram.dtype: {}'.format(spectrogram.dtype))\n", "\n", @@ -190,9 +169,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.2.2 LogFBank\n", "\n", @@ -220,13 +197,15 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.features import LogMelSpectrogram\n", "\n", + "f_min=50.0\n", + "f_max=14000.0\n", + "n_mels=64\n", + "\n", "# - sr: 音频文件的采样率。\n", "# - n_fft: FFT样本点个数。\n", "# - hop_length: 音频帧之间的间隔。\n", @@ -239,7 +218,9 @@ " hop_length=hop_length, \n", " win_length=win_length, \n", " window='hann', \n", - " n_mels=64)\n", + " f_min=f_min,\n", + " f_max=f_max,\n", + " n_mels=n_mels)\n", "\n", "x = 
paddle.to_tensor(data).unsqueeze(0) # [B, L]\n", "log_fbank = feature_extractor2(x) # [B, D, T]\n", @@ -253,9 +234,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 2.3 声音分类方法\n", "\n", @@ -272,9 +251,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.3.2 深度学习方法\n", "传统机器学习方法可以捕捉声音特征的差异(例如男声和女声的声音在音高上往往差异较大)并实现分类任务。\n", @@ -288,9 +265,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 2.3.3 Pretrain + Finetune\n", "\n", @@ -315,9 +290,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 3. 实践:环境声音分类\n", "\n", @@ -361,22 +334,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.datasets import ESC50\n", "\n", - "train_ds = ESC50(mode='train')\n", - "dev_ds = ESC50(mode='dev')" + "train_ds = ESC50(mode='train', sample_rate=sr)\n", + "dev_ds = ESC50(mode='dev', sample_rate=sr)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 3.1.2 特征提取\n", "通过下列代码,用 `paddleaudio.features.LogMelSpectrogram` 初始化一个音频特征提取器,在训练过程中实时提取音频的 LogFBank 特征,其中主要的参数如下: " @@ -385,19 +354,23 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ - "feature_extractor = LogMelSpectrogram(sr=44100, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)" + "feature_extractor = LogMelSpectrogram(\n", + " sr=sr, \n", + " n_fft=n_fft, \n", + " hop_length=hop_length, \n", + " win_length=win_length, \n", + " window='hann', \n", + " f_min=f_min,\n", + " f_max=f_max,\n", + " n_mels=n_mels)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.2 模型\n", "\n", @@ -409,9 +382,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "from paddlespeech.cls.models import cnn14\n", @@ -420,9 +391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "### 3.2.2 构建分类模型\n", "\n", @@ -432,9 +401,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "import paddle.nn as nn\n", @@ -461,18 +428,14 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.3 Finetune" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "1. 创建 DataLoader " ] @@ -480,9 +443,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "batch_size = 16\n", @@ -492,9 +453,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "2. 定义优化器和 Loss" ] @@ -502,9 +461,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "optimizer = paddle.optimizer.Adam(learning_rate=1e-4, parameters=model.parameters())\n", @@ -513,19 +470,15 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "3. 
启动模型训练 " ] }, { "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "from paddleaudio.utils import logger\n", @@ -603,9 +556,7 @@ }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "## 3.4 音频预测\n", "\n", @@ -615,16 +566,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": false - }, + "metadata": {}, "outputs": [], "source": [ "top_k = 10\n", "wav_file = './dog.wav'\n", "\n", - "waveform, sr = load(wav_file)\n", - "feature_extractor = LogMelSpectrogram(sr=sr, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window='hann', n_mels=64)\n", + "waveform, _ = load(wav_file, sr)\n", "feats = feature_extractor(paddle.to_tensor(paddle.to_tensor(waveform).unsqueeze(0)))\n", "feats = paddle.transpose(feats, [0, 2, 1]) # [B, N, T] -> [B, T, N]\n", "print(feats.shape)\n", @@ -635,16 +583,14 @@ "sorted_indices = probs[0].argsort()\n", "\n", "msg = f'[{wav_file}]\\n'\n", - "for idx in sorted_indices[-top_k:]:\n", + "for idx in sorted_indices[-1:-top_k-1:-1]:\n", " msg += f'{ESC50.label_list[idx]}: {probs[0][idx]:.5f}\\n'\n", "print(msg)" ] }, { "cell_type": "markdown", - "metadata": { - "collapsed": false - }, + "metadata": {}, "source": [ "# 4. 作业\n", "1. 使用开发模式安装 [PaddleSpeech](https://github.com/PaddlePaddle/PaddleSpeech) \n", @@ -653,6 +599,7 @@ "1. 在 [MusicSpeech](http://marsyas.info/downloads/datasets.html) 数据集上完成 music/speech 二分类。 \n", "2. 在 [GTZAN Genre Collection](http://marsyas.info/downloads/datasets.html) 音乐分类数据集上利用 PANNs 预训练模型实现音乐类别十分类。\n", "\n", + "关于如何自定义分类数据集,请参考文档 [PaddleSpeech/docs/source/cls/custom_dataset.md](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/cls/custom_dataset.md)\n", "\n", "# 5. 
关注 PaddleSpeech\n", "\n", @@ -681,9 +628,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "py37", "language": "python", - "name": "py35-paddle1.2.0" + "name": "py37" }, "language_info": { "codemirror_mode": { @@ -695,7 +642,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.7.7" } }, "nbformat": 4, diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md index 8e85d1d58..5841a8522 100644 --- a/examples/aishell/asr0/RESULTS.md +++ b/examples/aishell/asr0/RESULTS.md @@ -1,12 +1,18 @@ # Aishell-1 +## Deepspeech2 Streaming + +| Model | Number of Params | Release | Config | Test set | Valid Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 45.18M | 2.2.0 | conf/deepspeech2_online.yaml + spec aug | test | 7.994938373565674 | 0.080 | + ## Deepspeech2 Non-Streaming -| Model | Params | Release | Config | Test set | Loss | CER | +| Model | Number of Params | Release | Config | Test set | Valid Loss | CER | | --- | --- | --- | --- | --- | --- | --- | | DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.738585948944092 | 0.064000 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | | DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | | --- | --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 | +| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 | diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index bdfa42199..fb6998647 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -1,68 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None 
+target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 0 - ctc_grad_norm_type: instance +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 0 +ctc_grad_norm_type: instance -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 1.9 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2.0e-3 +lr_decay: 0.83 +weight_decay: 1.0e-6 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 2f63f4de0..ef01ac595 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear #linear, mfcc, fbank - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear #linear, mfcc, fbank +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 5 - rnn_layer_size: 1024 - rnn_direction: forward # [forward, bidirect] 
- num_fc_layers: 0 - fc_layers_size_list: -1, - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 5 +rnn_layer_size: 1024 +rnn_direction: forward # [forward, bidirect] +num_fc_layers: 0 +fc_layers_size_list: -1, +use_gru: False +blank_id: 0 -training: - n_epoch: 65 - accum_grad: 1 - lr: 5e-4 - lr_decay: 0.93 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 65 +accum_grad: 1 +lr: 5.0e-4 +lr_decay: 0.93 +weight_decay: 1.0e-6 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.2 #1.9 - beta: 4.3 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..9de06711c --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +chunk_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.2 #1.9 +beta: 4.3 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml new file mode 100644 index 000000000..5778e6565 --- /dev/null +++ b/examples/aishell/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 1.9 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 10 diff --git a/examples/aishell/asr0/local/test.sh b/examples/aishell/asr0/local/test.sh index 8cbff2352..463593ef3 100755 --- a/examples/aishell/asr0/local/test.sh +++ b/examples/aishell/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_export.sh b/examples/aishell/asr0/local/test_export.sh index 4f5e5c8b6..7a4b87f8c 100755 --- a/examples/aishell/asr0/local/test_export.sh +++ b/examples/aishell/asr0/local/test_export.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -jit_model_export_path=$2 -model_type=$3 +decode_config_path=$2 +jit_model_export_path=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh > /dev/null 2>&1 @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test_export.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${jit_model_export_path}.rsl \ --export_path ${jit_model_export_path} \ --model_type ${model_type} diff --git a/examples/aishell/asr0/local/test_wav.sh b/examples/aishell/asr0/local/test_wav.sh index 4a6d92fbe..62b005a6a 100755 --- a/examples/aishell/asr0/local/test_wav.sh +++ b/examples/aishell/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/aishell/asr0/run.sh b/examples/aishell/asr0/run.sh index 270b88fc0..15685f21f 100755 --- a/examples/aishell/asr0/run.sh +++ b/examples/aishell/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeeech2_online.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline # offline or online audio_file=data/demo_01_03.wav @@ -34,7 +35,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type}|| exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -44,11 +45,11 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test export ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_export.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt}.jit ${model_type}|| exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index 783e179e0..b68d69924 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -25,7 +25,7 @@ Need set `decoding.decoding_chunk_size=16` when decoding. 
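
The AISHELL DeepSpeech2 test scripts above now expect the decode tuning config as an extra positional argument (wired through `--decode_cfg`). A minimal usage sketch mirroring the updated `run.sh` stage 3 follows; the checkpoint prefix is a placeholder:

```bash
# Decode options (beam size, LM path, alpha/beta, ...) now live in
# conf/tuning/decode.yaml rather than a `decoding:` section of the main config.
conf_path=conf/deepspeech2.yaml
decode_conf_path=conf/tuning/decode.yaml
ckpt_prefix=exp/deepspeech2/checkpoints/avg_1   # placeholder checkpoint prefix
model_type=offline                              # offline or online

CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} ${ckpt_prefix} ${model_type}
```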
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.8103787302970886 | 0.056588 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.8103787302970886 | 0.059932 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.8103787302970886 | 0.059989 | +| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.8103787302970886 | 0.052273 | diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 80b455878..68e852ba7 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -1,122 +1,95 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Data # 
+########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +########################################### +# Dataloader # +########################################### - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_dropoutrate: 0.0 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. 
+vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 154f44a25..775a4527d 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -1,117 +1,89 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - 
dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_dropoutrate: 0.0 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 240 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr # pytorch v1.1.0+ required - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index dd4cfd273..f7f4c58d5 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 60ec01801..9d2946537 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -1,112 +1,85 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder 
related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - ctc_dropoutrate: 0.0 - ctc_grad_norm_type: null - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### +# https://yaml.org/type/float.html +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Dataloader # +########################################### +unit_type: 'char' +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..7e8afb7a8 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: 16 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: True # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/conf/tuning/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml new file mode 100644 index 000000000..72ede9272 --- /dev/null +++ b/examples/aishell/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +beam_size: 10 +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/aishell/asr1/local/align.sh b/examples/aishell/asr1/local/align.sh index c65d611c4..14d91d687 100755 --- a/examples/aishell/asr1/local/align.sh +++ b/examples/aishell/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index da159de73..65b884e51 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/aishell/asr1/local/test_wav.sh b/examples/aishell/asr1/local/test_wav.sh index f85c1a47e..d029f2fde 100755 --- a/examples/aishell/asr1/local/test_wav.sh +++ b/examples/aishell/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -42,10 +43,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? 
-ne 0 ]; then diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh index d07a4ed5c..c54dae9cf 100644 --- a/examples/aishell/asr1/run.sh +++ b/examples/aishell/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 audio_file=data/demo_01_03.wav @@ -32,18 +33,18 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi # Optionally, you can add LM and test it with runtime. if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi # Not supported at now!!! diff --git a/examples/aishell3/README.md b/examples/aishell3/README.md index b52950c47..273f488e4 100644 --- a/examples/aishell3/README.md +++ b/examples/aishell3/README.md @@ -8,4 +8,5 @@ * voc1 - Parallel WaveGAN * voc2 - MelGAN * voc3 - MultiBand MelGAN -* vc0 - Tactron2 Voice Clone with GE2E +* vc0 - Tactron2 Voice Cloning with GE2E +* vc1 - FastSpeech2 Voice Cloning with GE2E diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 8d1c2aa9c..2538e8f96 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -72,8 +72,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -87,11 +87,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index 7da3946e3..dad464092 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -67,8 +67,8 @@ Here's the complete help message. 
```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -83,7 +83,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. benchmark: arguments related to benchmark. @@ -113,7 +112,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -130,7 +128,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml index 88968d6fc..7fbffbdde 100644 --- a/examples/aishell3/voc1/conf/default.yaml +++ b/examples/aishell3/voc1/conf/default.yaml @@ -72,10 +72,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### batch_size: 8 # Batch size. batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. 
########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml index 69959c68e..19e783a62 100644 --- a/examples/callcenter/asr1/conf/chunk_conformer.yaml +++ b/examples/callcenter/asr1/conf/chunk_conformer.yaml @@ -1,120 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - - -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +batch_size: 32 +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 8000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -training: - n_epoch: 240 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 
+############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: true + use_dynamic_chunk: true + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml index 80c15abb1..f6fcb9498 100644 --- a/examples/callcenter/asr1/conf/conformer.yaml +++ b/examples/callcenter/asr1/conf/conformer.yaml @@ -1,117 +1,92 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.5 - max_input_len: 20.0 # second - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.0 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 8000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - 
use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +spm_model_prefix: '' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -training: - n_epoch: 100 # 50 will be lowest - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - 
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Training # +########################################### +n_epoch: 100 # 50 will be lowest +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml index dd4cfd273..877e7d5a7 100644 --- a/examples/callcenter/asr1/conf/preprocess.yaml +++ b/examples/callcenter/asr1/conf/preprocess.yaml @@ -1,11 +1,11 @@ process: # extract kaldi fbank from PCM - type: fbank_kaldi - fs: 16000 + fs: 8000 n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..49a6a114c --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml new file mode 100644 index 000000000..d2e0b72dd --- /dev/null +++ b/examples/callcenter/asr1/conf/tuning/decode.yaml @@ -0,0 +1,13 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. + + diff --git a/examples/callcenter/asr1/local/align.sh b/examples/callcenter/asr1/local/align.sh index 681c77ede..1397ae57d 100755 --- a/examples/callcenter/asr1/local/align.sh +++ b/examples/callcenter/asr1/local/align.sh @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 ckpt_name=$(basename ${ckpt_prefxi}) @@ -25,9 +26,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/callcenter/asr1/local/test.sh b/examples/callcenter/asr1/local/test.sh index fc43c5a20..b7ff722a7 100755 --- a/examples/callcenter/asr1/local/test.sh +++ b/examples/callcenter/asr1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 + ckpt_name=$(basename ${ckpt_prefxi}) @@ -30,10 +32,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -49,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh index e9be3d03c..0c7ffc1e7 100644 --- a/examples/callcenter/asr1/run.sh +++ b/examples/callcenter/asr1/run.sh @@ -4,8 +4,9 @@ source path.sh gpus=0,1,2,3 stage=0 -stop_stage=100 +stop_stage=50 conf_path=conf/conformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=20 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,15 +32,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then # export ckpt avg_n CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 2c7a917e9..5f31f7b36 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -60,8 +60,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] - [--use-relative-path USE_RELATIVE_PATH] + [--ngpu NGPU] [--use-relative-path USE_RELATIVE_PATH] [--phones-dict PHONES_DICT] [--tones-dict TONES_DICT] Train a Speedyspeech model with a single speaker dataset. @@ -76,7 +75,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. 
--use-relative-path USE_RELATIVE_PATH whether use relative path in metadata --phones-dict PHONES_DICT @@ -109,7 +107,7 @@ pwg_baker_ckpt_0.4 ```bash CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ``` -``text +```text usage: synthesize.py [-h] [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] diff --git a/examples/csmsc/tts2/local/preprocess.sh b/examples/csmsc/tts2/local/preprocess.sh index f7f5ea74c..c44f075db 100755 --- a/examples/csmsc/tts2/local/preprocess.sh +++ b/examples/csmsc/tts2/local/preprocess.sh @@ -45,6 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt \ --use-relative-path=True python3 ${BIN_DIR}/normalize.py \ @@ -53,6 +54,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt \ --use-relative-path=True python3 ${BIN_DIR}/normalize.py \ @@ -61,6 +63,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --stats=dump/train/feats_stats.npy \ --phones-dict=dump/phone_id_map.txt \ --tones-dict=dump/tone_id_map.txt \ + --speaker-dict=dump/speaker_id_map.txt \ --use-relative-path=True fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index a458bd5ff..0a4cf69bb 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -38,9 +38,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -61,9 +61,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=style_melgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -82,9 +82,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/feats_stats.npy \ --voc=hifigan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + 
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 570dd28b8..13d291b5c 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -7,7 +7,7 @@ Download CSMSC from it's [Official Website](https://test.data-baker.com/data/ind ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for fastspeech2. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -63,8 +63,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -78,11 +78,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. 
@@ -259,5 +260,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=exp/default/test_e2e \ --inference_dir=exp/default/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt ``` diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index 891ed041b..d4744486c 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -37,9 +37,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -59,9 +59,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=style_melgan_csmsc \ - --voc_config=style_melgan_test/default.yaml \ - --voc_ckpt=style_melgan_test/snapshot_iter_935000.pdz \ - --voc_stat=style_melgan_test/feats_stats.npy \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ @@ -80,9 +80,9 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=hifigan_csmsc \ - --voc_config=hifigan_test/default.yaml \ - --voc_ckpt=hifigan_test/snapshot_iter_1600000.pdz \ - --voc_stat=hifigan_test/feats_stats.npy \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 19a9c722a..5527e8088 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -57,8 +57,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -73,7 +73,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. benchmark: arguments related to benchmark. 
@@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -120,7 +118,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. @@ -134,7 +131,7 @@ The pretrained model can be downloaded here [pwg_baker_ckpt_0.4.zip](https://pad The static model can be downloaded here [pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip). -Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss:| eval/spectral_convergence_loss +Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------: default| 1(gpu) x 400000|1.948763|0.670098|0.248882 diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml index 9ea81b8d3..28d218ff3 100644 --- a/examples/csmsc/voc1/conf/default.yaml +++ b/examples/csmsc/voc1/conf/default.yaml @@ -79,10 +79,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### batch_size: 8 # Batch size. batch_max_steps: 25500 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 2 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index e4f6be4e8..22104a8f2 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -57,7 +57,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] Train a Multi-Band MelGAN model. @@ -71,7 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. 
``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. @@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -105,7 +103,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` multi band melgan config file. You should use the same config with which the model is trained. @@ -155,22 +152,22 @@ TODO: The hyperparameter of `finetune.yaml` is not good enough, a smaller `learning_rate` should be used (more `milestones` should be set). ## Pretrained Models -The pretrained model can be downloaded here [mb_melgan_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_ckpt_0.5.zip). +The pretrained model can be downloaded here [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip). The finetuned model can be downloaded here [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip). -The static model can be downloaded here [mb_melgan_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_static_0.5.zip) +The static model can be downloaded here [mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: -default| 1(gpu) x 1000000| ——|—— |—— |—— | ——| +default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777| finetune| 1(gpu) x 1000000|3.196967|0.977804| 0.778484| 0.889576 |0.776756 | Multi Band MelGAN checkpoint contains files listed below. 
```text -mb_melgan_baker_ckpt_0.5 +mb_melgan_csmsc_ckpt_0.1.1 ├── default.yaml # default config used to train multi band melgan ├── feats_stats.npy # statistics used to normalize spectrogram when training multi band melgan └── snapshot_iter_1000000.pdz # generator parameters of multi band melgan diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh index 4ab10e5b3..6719bd0be 100755 --- a/examples/csmsc/voc3/finetune.sh +++ b/examples/csmsc/voc3/finetune.sh @@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --dur-file=durations.txt \ --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 local/link_wav.py \ + python3 ${MAIN_ROOT}/utils/link_wav.py \ --old-dump-dir=dump \ --dump-dir=dump_finetune fi diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index 57d88e0fc..b5c687391 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -57,9 +57,9 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] -Train a Multi-Band MelGAN model. +Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit @@ -71,7 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. @@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -105,7 +103,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` style melgan config file. You should use the same config with which the model is trained. @@ -113,3 +110,20 @@ optional arguments: 3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. 4. 
`--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Models +The pretrained model can be downloaded here [style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip). + +The static model of Style MelGAN is not available now. + +Style MelGAN checkpoint contains files listed below. + +```text +hifigan_csmsc_ckpt_0.1.1 +├── default.yaml # default config used to train style melgan +├── feats_stats.npy # statistics used to normalize spectrogram when training style melgan +└── snapshot_iter_1500000.pdz # generator parameters of style melgan +``` + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml index 6f7d0f2b3..c9abf78dc 100644 --- a/examples/csmsc/voc4/conf/default.yaml +++ b/examples/csmsc/voc4/conf/default.yaml @@ -88,7 +88,7 @@ discriminator_adv_loss_params: batch_size: 32 # Batch size. # batch_max_steps(24000) == prod(noise_upsample_scales)(80) * prod(upsample_scales)(300, n_shift) batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 2ced9f779..21afe6eef 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -6,7 +6,7 @@ Download CSMSC from the [official website](https://www.data-baker.com/data/index ### Get MFA Result and Extract We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio. -You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/mfa) of our repo. +You can download from here [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. ## Get Started Assume the path to the dataset is `~/datasets/BZNSYP`. @@ -57,7 +57,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] Train a HiFiGAN model. @@ -71,7 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. @@ -88,7 +87,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -105,7 +103,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. 
- --verbose VERBOSE verbose. ``` 1. `--config` config file. You should use the same config with which the model is trained. @@ -114,4 +111,23 @@ optional arguments: 4. `--output-dir` is the directory to save the synthesized audio files. 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. -## Fine-tuning +## Pretrained Models +The pretrained model can be downloaded here [hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip). + +The static model can be downloaded here [hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip). + +Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss +:-------------:| :------------:| :-----: | :-----: | :--------: +default| 1(gpu) x 2500000|24.927|0.1262|7.554 + +HiFiGAN checkpoint contains files listed below. + +```text +hifigan_csmsc_ckpt_0.1.1 +├── default.yaml # default config used to train hifigan +├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan +└── snapshot_iter_2500000.pdz # generator parameters of hifigan +``` + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml index 5192d3897..f42fc385a 100644 --- a/examples/csmsc/voc5/conf/default.yaml +++ b/examples/csmsc/voc5/conf/default.yaml @@ -119,7 +119,7 @@ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. ########################################################### batch_size: 16 # Batch size. batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml index 9876e93d0..734206251 100644 --- a/examples/csmsc/voc5/conf/finetune.yaml +++ b/examples/csmsc/voc5/conf/finetune.yaml @@ -119,7 +119,7 @@ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. ########################################################### batch_size: 16 # Batch size. batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. -num_workers: 2 # Number of workers in Pytorch DataLoader. +num_workers: 2 # Number of workers in DataLoader. 
########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 4ab10e5b3..6719bd0be 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -15,11 +15,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ --dur-file=durations.txt \ --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt + --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ + --dataset=baker \ + --rootdir=~/datasets/BZNSYP/ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 local/link_wav.py \ + python3 ${MAIN_ROOT}/utils/link_wav.py \ --old-dump-dir=dump \ --dump-dir=dump_finetune fi diff --git a/examples/csmsc/voc5/local/link_wav.py b/examples/csmsc/voc5/local/link_wav.py deleted file mode 100644 index c81e0d4b8..000000000 --- a/examples/csmsc/voc5/local/link_wav.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse -import os -from operator import itemgetter -from pathlib import Path - -import jsonlines -import numpy as np - - -def main(): - # parse config and args - parser = argparse.ArgumentParser( - description="Preprocess audio and then extract features .") - - parser.add_argument( - "--old-dump-dir", - default=None, - type=str, - help="directory to dump feature files.") - parser.add_argument( - "--dump-dir", - type=str, - required=True, - help="directory to finetune dump feature files.") - args = parser.parse_args() - - old_dump_dir = Path(args.old_dump_dir).expanduser() - old_dump_dir = old_dump_dir.resolve() - dump_dir = Path(args.dump_dir).expanduser() - # use absolute path - dump_dir = dump_dir.resolve() - dump_dir.mkdir(parents=True, exist_ok=True) - - assert old_dump_dir.is_dir() - assert dump_dir.is_dir() - - for sub in ["train", "dev", "test"]: - # 把 old_dump_dir 里面的 *-wave.npy 软连接到 dump_dir 的对应位置 - output_dir = dump_dir / sub - output_dir.mkdir(parents=True, exist_ok=True) - results = [] - for name in os.listdir(output_dir / "raw"): - # 003918_feats.npy - utt_id = name.split("_")[0] - mel_path = output_dir / ("raw/" + name) - gen_mel = np.load(mel_path) - wave_name = utt_id + "_wave.npy" - wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) - os.symlink(old_dump_dir / sub / ("raw/" + wave_name), - output_dir / ("raw/" + wave_name)) - num_sample = wav.shape[0] - num_frames = gen_mel.shape[0] - wav_path = output_dir / ("raw/" + wave_name) - - record = { - "utt_id": utt_id, - "num_samples": num_sample, - "num_frames": num_frames, - "feats": str(mel_path), - "wave": str(wav_path), - } - results.append(record) - - results.sort(key=itemgetter("utt_id")) - - with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer: - for item in results: - writer.write(item) - - -if __name__ == "__main__": - main() diff 
--git a/examples/esc50/README.md b/examples/esc50/README.md index 66409754d..2ce57ae06 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -17,21 +17,32 @@ PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用 - CNN6: 该模型主要包含4个卷积层和2个全连接层,模型参数的数量为4.5M,embbedding维度是512。 +## 数据集 + +[ESC-50: Dataset for Environmental Sound Classification](https://github.com/karolpiczak/ESC-50) 是一个包含有 2000 个带标签的环境声音样本,音频样本采样率为 44,100Hz 的单通道音频文件,所有样本根据标签被划分为 50 个类别,每个类别有 40 个样本。 + +## 模型指标 + +根据 `ESC-50` 提供的fold信息,对数据集进行 5-fold 的 fine-tune 训练和评估,平均准确率如下: + +|Model|Acc| +|--|--| +|CNN14| 0.9500 +|CNN10| 0.8975 +|CNN6| 0.8825 + ## 快速开始 ### 模型训练 -以环境声音分类数据集`ESC50`为示例,运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 +运行下面的命令,可在训练集上进行模型的finetune,支持单机的单卡训练和多卡训练。 启动训练: ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns.yaml ``` -`paddlespeech/cls/exps/panns/train.py` 脚本中可支持配置的参数: - -- `device`: 指定模型预测时使用的设备。 -- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 +训练的参数可在 `conf/panns.yaml` 的 `training` 中配置,其中: - `epochs`: 训练轮次,默认为50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 - `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 @@ -40,36 +51,31 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 - `save_freq`: 训练过程中的模型保存频率,默认为10。 - `log_freq`: 训练过程中的信息打印频率,默认为10。 -示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过以下方式执行: -```python -from paddleaudio.datasets import ESC50 -from paddlespeech.cls.models import SoundClassifier -from paddlespeech.cls.models import cnn14, cnn10, cnn6 - +示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过修改 `conf/panns.yaml` 的 `model` 中配置: +```yaml # CNN14 -backbone = cnn14(pretrained=True, extract_embedding=True) -model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) - +model: + backbone: 'paddlespeech.cls.models:cnn14' +``` +```yaml # CNN10 -backbone = cnn10(pretrained=True, extract_embedding=True) -model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) - +model: + backbone: 'paddlespeech.cls.models:cnn10' +``` +```yaml # CNN6 -backbone = cnn6(pretrained=True, extract_embedding=True) -model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) +model: + backbone: 'paddlespeech.cls.models:cnn6' ``` ### 模型预测 ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 conf/panns.yaml ``` -`paddlespeech/cls/exps/panns/predict.py` 脚本中可支持配置的参数: - -- `device`: 指定模型预测时使用的设备。 -- `wav`: 指定预测的音频文件。 -- `feat_backend`: 选择提取特征的后端,可选`'numpy'`或`'paddle'`,默认为`'numpy'`。 +训练的参数可在 `conf/panns.yaml` 的 `predicting` 中配置,其中: +- `audio_file`: 指定预测的音频文件。 - `top_k`: 预测显示的top k标签的得分,默认为1。 - `checkpoint`: 模型参数checkpoint文件。 @@ -88,7 +94,7 @@ Cat: 6.579841738130199e-06 模型训练结束后,可以将已保存的动态图参数导出成静态图的模型和参数,然后实施静态图的部署。 ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 3 ./checkpoint/epoch_50/model.pdparams ./export ``` `paddlespeech/cls/exps/panns/export_model.py` 脚本中可支持配置的参数: @@ -109,7 +115,7 @@ export `paddlespeech/cls/exps/panns/deploy/predict.py` 脚本使用了`paddle.inference`模块下的api,提供了python端部署的示例: ```shell -$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 +$ CUDA_VISIBLE_DEVICES=0 ./run.sh 4 cpu ./export /audio/dog.wav ``` `paddlespeech/cls/exps/panns/deploy/predict.py` 脚本中可支持配置的主要参数: diff --git a/examples/esc50/RESULTS.md b/examples/esc50/RESULTS.md new file mode 100644 index 000000000..edbf07a3f --- /dev/null +++ b/examples/esc50/RESULTS.md @@ -0,0 +1,9 @@ +## Metrics + +5-fold cross validation accuracy on [ESC-50](https://github.com/karolpiczak/ESC-50) dataset: + +|Model|Acc| +|--|--| +|CNN14| 0.9500 +|CNN10| 0.8975 
+|CNN6| 0.8825 diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml new file mode 100644 index 000000000..3a9d42aa5 --- /dev/null +++ b/examples/esc50/cls0/conf/panns.yaml @@ -0,0 +1,36 @@ +data: + dataset: 'paddleaudio.datasets:ESC50' + num_classes: 50 + train: + mode: 'train' + split: 1 + dev: + mode: 'dev' + split: 1 + +model: + backbone: 'paddlespeech.cls.models:cnn14' + +feature: + sr: 32000 + n_fft: 1024 + hop_length: 320 + window: 'hann' + win_length: 1024 + f_min: 50.0 + f_max: 14000.0 + n_mels: 64 + +training: + epochs: 50 + learning_rate: 0.00005 + num_workers: 2 + batch_size: 16 + checkpoint_dir: './checkpoint' + save_freq: 10 + log_freq: 10 + +predicting: + audio_file: '/audio/dog.wav' + top_k: 10 + checkpoint: './checkpoint/epoch_50/model.pdparams' \ No newline at end of file diff --git a/examples/esc50/cls0/local/export.sh b/examples/esc50/cls0/local/export.sh index 160dc7432..9c854a194 100755 --- a/examples/esc50/cls0/local/export.sh +++ b/examples/esc50/cls0/local/export.sh @@ -1,8 +1,8 @@ #!/bin/bash -ckpt_dir=$1 +ckpt=$1 output_dir=$2 python3 ${BIN_DIR}/export_model.py \ ---checkpoint ${ckpt_dir}/model.pdparams \ +--checkpoint ${ckpt} \ --output_dir ${output_dir} diff --git a/examples/esc50/cls0/local/infer.sh b/examples/esc50/cls0/local/infer.sh index bc03d6810..25d595be2 100755 --- a/examples/esc50/cls0/local/infer.sh +++ b/examples/esc50/cls0/local/infer.sh @@ -1,11 +1,4 @@ #!/bin/bash -audio_file=$1 -ckpt_dir=$2 -feat_backend=$3 - python3 ${BIN_DIR}/predict.py \ ---wav ${audio_file} \ ---feat_backend ${feat_backend} \ ---top_k 10 \ ---checkpoint ${ckpt_dir}/model.pdparams +--cfg_path=$1 diff --git a/examples/esc50/cls0/local/train.sh b/examples/esc50/cls0/local/train.sh index 0f0f3d091..cab547b84 100755 --- a/examples/esc50/cls0/local/train.sh +++ b/examples/esc50/cls0/local/train.sh @@ -1,25 +1,12 @@ #!/bin/bash ngpu=$1 -feat_backend=$2 - -num_epochs=50 -batch_size=16 -ckpt_dir=./checkpoint -save_freq=10 +cfg_path=$2 if [ ${ngpu} -gt 0 ]; then python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} + --cfg_path ${cfg_path} else python3 ${BIN_DIR}/train.py \ - --epochs ${num_epochs} \ - --feat_backend ${feat_backend} \ - --batch_size ${batch_size} \ - --checkpoint_dir ${ckpt_dir} \ - --save_freq ${save_freq} + --cfg_path ${cfg_path} fi diff --git a/examples/esc50/cls0/run.sh b/examples/esc50/cls0/run.sh index 7283aa8d7..0e407b40e 100755 --- a/examples/esc50/cls0/run.sh +++ b/examples/esc50/cls0/run.sh @@ -6,28 +6,30 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') stage=$1 stop_stage=100 -feat_backend=numpy -audio_file=~/cat.wav -ckpt_dir=./checkpoint/epoch_50 -output_dir=./export -infer_device=cpu if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - ./local/train.sh ${ngpu} ${feat_backend} || exit -1 + cfg_path=$2 + ./local/train.sh ${ngpu} ${cfg_path} || exit -1 exit 0 fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - ./local/infer.sh ${audio_file} ${ckpt_dir} ${feat_backend} || exit -1 + cfg_path=$2 + ./local/infer.sh ${cfg_path} || exit -1 exit 0 fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - ./local/export.sh ${ckpt_dir} ${output_dir} || exit -1 + ckpt=$2 + output_dir=$3 + ./local/export.sh ${ckpt} ${output_dir} || exit -1 exit 0 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - 
./local/static_model_infer.sh ${infer_device} ${output_dir} ${audio_file} || exit -1 + infer_device=$2 + graph_dir=$3 + audio_file=$4 + ./local/static_model_infer.sh ${infer_device} ${graph_dir} ${audio_file} || exit -1 exit 0 fi diff --git a/examples/iwslt2012/punc0/README.md b/examples/iwslt2012/punc0/README.md index 1fcd954ca..74d599a21 100644 --- a/examples/iwslt2012/punc0/README.md +++ b/examples/iwslt2012/punc0/README.md @@ -1,35 +1,29 @@ -# 中文实验例程 -## 测试数据: -- IWLST2012中文:test2012 +# Punctuation Restoration with IWLST2012-Zh -## 运行代码 -- 运行 `run.sh 0 0 conf/train_conf/bertBLSTM_zh.yaml 1 conf/data_conf/chinese.yaml ` +## Get Started +### Data Preprocessing +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Model Training +```bash +./run.sh --stage 1 --stop-stage 1 +``` +### Testing +```bash +./run.sh --stage 2 --stop-stage 2 +``` +### Punctuation Restoration +```bash +./run.sh --stage 3 --stop-stage 3 +``` +## Pretrained Model +The pretrained model can be downloaded here [ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip). -## 实验结果: -- BertLinear - - 实验配置:conf/train_conf/bertLinear_zh.yaml - - 测试结果 - - | | COMMA | PERIOD | QUESTION | OVERALL | - |-----------|-----------|-----------|-----------|--------- | - |Precision | 0.425665 | 0.335190 | 0.698113 | 0.486323 | - |Recall | 0.511278 | 0.572108 | 0.787234 | 0.623540 | - |F1 | 0.464560 | 0.422717 | 0.740000 | 0.542426 | - -- BertBLSTM - - 实验配置:conf/train_conf/bertBLSTM_zh.yaml - - 测试结果 avg_1 - - | | COMMA | PERIOD | QUESTION | OVERALL | - |-----------|-----------|-----------|-----------|--------- | - |Precision | 0.469484 | 0.550604 | 0.801887 | 0.607325 | - |Recall | 0.580271 | 0.592408 | 0.817308 | 0.663329 | - |F1 | 0.519031 | 0.570741 | 0.809524 | 0.633099 | - - - BertBLSTM/avg_1测试标贝合成数据 - - | | COMMA | PERIOD | QUESTION | OVERALL | - |-----------|-----------|-----------|-----------|--------- | - |Precision | 0.217192 | 0.196339 | 0.820717 | 0.411416 | - |Recall | 0.205922 | 0.892531 | 0.416162 | 0.504872 | - |F1 | 0.211407 | 0.321873 | 0.552279 | 0.361853 | +### Test Result +- Ernie Linear + | |COMMA | PERIOD | QUESTION | OVERALL| + |:-----:|:-----:|:-----:|:-----:|:-----:| + |Precision |0.510955 |0.526462 |0.820755 |0.619391| + |Recall |0.517433 |0.564179 |0.861386 |0.647666| + |F1 |0.514173 |0.544669 |0.840580 |0.633141| diff --git a/examples/iwslt2012/punc0/conf/default.yaml b/examples/iwslt2012/punc0/conf/default.yaml new file mode 100644 index 000000000..74ced9932 --- /dev/null +++ b/examples/iwslt2012/punc0/conf/default.yaml @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/iwslt2012_zh/train.txt +dev_path: data/iwslt2012_zh/dev.txt +test_path: data/iwslt2012_zh/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-1.0 + punc_path: data/iwslt2012_zh/punc_vocab + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-1.0 + num_classes: 4 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. 
+ gamma: 1.0 # scheduler gamma. + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/iwslt2012/punc0/conf/ernie_linear.yaml b/examples/iwslt2012/punc0/conf/ernie_linear.yaml deleted file mode 100644 index e00b793c1..000000000 --- a/examples/iwslt2012/punc0/conf/ernie_linear.yaml +++ /dev/null @@ -1,36 +0,0 @@ -data: - dataset_type: Ernie - train_path: data/iwslt2012_zh/train.txt - dev_path: data/iwslt2012_zh/dev.txt - test_path: data/iwslt2012_zh/test.txt - data_params: - pretrained_token: ernie-1.0 - punc_path: data/iwslt2012_zh/punc_vocab - seq_len: 100 - batch_size: 64 - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - -checkpoint: - kbest_n: 5 - latest_n: 10 - metric_type: F1 - -model_type: ErnieLinear - -model_params: - pretrained_token: ernie-1.0 - num_classes: 4 - -training: - n_epoch: 100 - lr: !!float 1e-5 - lr_decay: 1.0 - weight_decay: !!float 1e-06 - global_grad_clip: 5.0 - log_interval: 10 - log_path: log/train_ernie_linear.log - -testing: - log_path: log/test_ernie_linear.log diff --git a/examples/iwslt2012/punc0/local/avg.sh b/examples/iwslt2012/punc0/local/avg.sh deleted file mode 100644 index b8c14c662..000000000 --- a/examples/iwslt2012/punc0/local/avg.sh +++ /dev/null @@ -1,23 +0,0 @@ -#! /usr/bin/env bash - -if [ $# != 2 ]; then - echo "usage: ${0} ckpt_dir avg_num" - exit -1 -fi - -ckpt_dir=${1} -average_num=${2} -decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams - -python3 -u ${BIN_DIR}/avg_model.py \ ---dst_model ${decode_checkpoint} \ ---ckpt_dir ${ckpt_dir} \ ---num ${average_num} \ ---val_best - -if [ $? -ne 0 ]; then - echo "Failed in avg ckpt!" - exit 1 -fi - -exit 0 \ No newline at end of file diff --git a/examples/iwslt2012/punc0/local/data.sh b/examples/iwslt2012/punc0/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/local/punc_restore.sh b/examples/iwslt2012/punc0/local/punc_restore.sh new file mode 100755 index 000000000..30a4f12f8 --- /dev/null +++ b/examples/iwslt2012/punc0/local/punc_restore.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +text=$4 +ckpt_prefix=${ckpt_name%.*} + +python3 ${BIN_DIR}/punc_restore.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --text=${text} diff --git a/examples/iwslt2012/punc0/local/test.sh b/examples/iwslt2012/punc0/local/test.sh old mode 100644 new mode 100755 index ee0224622..94e508b5b --- a/examples/iwslt2012/punc0/local/test.sh +++ b/examples/iwslt2012/punc0/local/test.sh @@ -1,26 +1,11 @@ - #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - config_path=$1 -ckpt_prefix=$2 - -python3 -u ${BIN_DIR}/test.py \ ---ngpu 1 \ ---config ${config_path} \ ---result_file ${ckpt_prefix}.rsl \ ---checkpoint_path ${ckpt_prefix} +train_output_path=$2 +ckpt_name=$3 -if [ $? -ne 0 ]; then - echo "Failed in evaluation!" 
- exit 1 -fi +ckpt_prefix=${ckpt_name%.*} -exit 0 +python3 ${BIN_DIR}/test.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} diff --git a/examples/iwslt2012/punc0/local/train.sh b/examples/iwslt2012/punc0/local/train.sh old mode 100644 new mode 100755 index 9fabb8f75..85227eacb --- a/examples/iwslt2012/punc0/local/train.sh +++ b/examples/iwslt2012/punc0/local/train.sh @@ -1,28 +1,9 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name log_dir" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - config_path=$1 -ckpt_name=$2 -log_dir=$3 - -mkdir -p exp - -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ ---config ${config_path} \ ---output_dir exp/${ckpt_name} \ ---log_dir ${log_dir} - -if [ $? -ne 0 ]; then - echo "Failed in training!" - exit 1 -fi +train_output_path=$2 -exit 0 +python3 ${BIN_DIR}/train.py \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/iwslt2012/punc0/path.sh b/examples/iwslt2012/punc0/path.sh old mode 100644 new mode 100755 index 8f67f9c93..da790261f --- a/examples/iwslt2012/punc0/path.sh +++ b/examples/iwslt2012/punc0/path.sh @@ -10,5 +10,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ -MODEL=$1 +MODEL=ernie_linear export BIN_DIR=${MAIN_ROOT}/paddlespeech/text/exps/${MODEL} diff --git a/examples/iwslt2012/punc0/run.sh b/examples/iwslt2012/punc0/run.sh index 8d786a198..0c14eb7e2 100755 --- a/examples/iwslt2012/punc0/run.sh +++ b/examples/iwslt2012/punc0/run.sh @@ -1,40 +1,35 @@ #!/bin/bash set -e +source path.sh -if [ $# -ne 4 ]; then - echo "usage: bash ./run.sh stage gpu train_config avg_num" - echo "eg: bash ./run.sh 1 0 train_config 1" - exit -1 -fi - -stage=$1 +gpus=0,1 +stage=0 stop_stage=100 -gpus=$2 -conf_path=$3 -avg_num=$4 -avg_ckpt=avg_${avg_num} -ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}') -log_dir=log -source path.sh ${ckpt} +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_12840.pdz +text=今天的天气真不错啊你下午有空吗我想约你一起去吃饭 +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # prepare data - bash ./local/data.sh + ./local/data.sh fi -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${conf_path} ${ckpt} ${log_dir} +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 fi -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # avg n best model - bash ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num} +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} bash ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 -fi +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/punc_restore.sh ${conf_path} ${train_output_path} ${ckpt_name} ${text}|| exit -1 +fi \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index f3574e150..0307b9f39 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -1,68 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 20 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 20 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - 
blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 50 - accum_grad: 1 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +lr: 1.0e-3 +lr_decay: 0.83 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index 0d16bc571..a0d2bcfe2 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev-clean - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 30.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev-clean +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 30.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 15 - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 20.0 - delta_delta: False - dither: 1.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 +########################################### +# Dataloader # +########################################### +batch_size: 15 +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 20.0 +delta_delta: False +dither: 1.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: False - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: False +blank_id: 0 -training: - n_epoch: 
50 - accum_grad: 4 - lr: 1e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.9 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 4 +lr: 1.0e-3 +lr_decay: 0.83 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..e07026ba5 --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml new file mode 100644 index 000000000..e07026ba5 --- /dev/null +++ b/examples/librispeech/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.9 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/librispeech/asr0/local/test.sh b/examples/librispeech/asr0/local/test.sh index a627ef722..ea40046b1 100755 --- a/examples/librispeech/asr0/local/test.sh +++ b/examples/librispeech/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/librispeech/asr0/local/test_wav.sh b/examples/librispeech/asr0/local/test_wav.sh index e8337da7f..25cfc45e3 100755 --- a/examples/librispeech/asr0/local/test_wav.sh +++ b/examples/librispeech/asr0/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 4 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type audio_file" +if [ $# != 5 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type audio_file" exit -1 fi @@ -9,9 +9,10 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 -audio_file=$4 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 +audio_file=$5 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -33,6 +34,7 @@ fi python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} \ diff --git a/examples/librispeech/asr0/run.sh b/examples/librispeech/asr0/run.sh index 5d811b653..ca2c2b9da 100755 --- a/examples/librispeech/asr0/run.sh +++ b/examples/librispeech/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 model_type=offline audio_file=data/demo_002_en.wav @@ -33,7 +34,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -43,5 +44,5 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} ${audio_file} || exit -1 fi diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml index 2872b69ef..72b9cb7be 100644 --- a/examples/librispeech/asr1/conf/chunk_conformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_conformer.yaml @@ -1,115 +1,99 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: true + cnn_module_norm: 
'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 8 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 240 - accum_grad: 8 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # 
ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. - diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml index 275e940af..19ade8ad2 100644 --- a/examples/librispeech/asr1/conf/chunk_transformer.yaml +++ b/examples/librispeech/asr1/conf/chunk_transformer.yaml @@ -1,106 +1,89 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: 
batch_shuffle - num_workers: 2 - - -training: - n_epoch: 120 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: true # simulate streaming inference. Defaults to False. +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 \ No newline at end of file diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml index 1193f14b1..4f7b759be 100644 --- a/examples/librispeech/asr1/conf/conformer.yaml +++ b/examples/librispeech/asr1/conf/conformer.yaml @@ -1,104 +1,96 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # 
the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + ctc_grad_norm_type: null + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 16 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 16 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + - -training: - n_epoch: 70 - accum_grad: 8 - global_grad_clip: 3.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - beam_size: 10 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. 
- # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Training # +########################################### +n_epoch: 70 +accum_grad: 8 +global_grad_clip: 3.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml index 021ca4c58..d3992cb9f 100644 --- a/examples/librispeech/asr1/conf/preprocess.yaml +++ b/examples/librispeech/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml index a90efe482..740ce78f3 100644 --- a/examples/librispeech/asr1/conf/transformer.yaml +++ b/examples/librispeech/asr1/conf/transformer.yaml @@ -1,111 +1,88 @@ -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 100.0 +########################################### +# Data # 
+########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_5000' - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 32 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -training: - n_epoch: 120 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_5000' +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..0760e721e --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. 
+ # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: true # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 000000000..805dd02fa --- /dev/null +++ b/examples/librispeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr1/local/align.sh b/examples/librispeech/asr1/local/align.sh index c65d611c4..14d91d687 100755 --- a/examples/librispeech/asr1/local/align.sh +++ b/examples/librispeech/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr1/local/test.sh b/examples/librispeech/asr1/local/test.sh index aa06132e4..51ced18b2 100755 --- a/examples/librispeech/asr1/local/test.sh +++ b/examples/librispeech/asr1/local/test.sh @@ -15,8 +15,8 @@ recog_set="test-clean" stage=0 stop_stage=100 -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -24,7 +24,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -52,10 +53,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
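
The test scripts above now take a standalone decode config via `--decode_cfg` and override flat `decode.*` keys with `--opts`, instead of the old nested `decoding:` section. Below is a minimal sketch of how such a config pair can be merged with yacs; it mirrors the pattern used in the updated `bin/test.py` entry points, and the helper name `load_config` plus the example paths are illustrative only, not part of the patch.

```python
from yacs.config import CfgNode


def load_config(config_path, decode_cfg_path, opts=None):
    """Merge a flat training config with a standalone decode config (sketch)."""
    config = CfgNode(new_allowed=True)            # accept keys not declared up front
    config.merge_from_file(config_path)           # e.g. conf/transformer.yaml (flat layout)

    decode_conf = CfgNode(new_allowed=True)
    decode_conf.merge_from_file(decode_cfg_path)  # e.g. conf/tuning/decode.yaml
    config.decode = decode_conf                   # exposed as config.decode.decoding_method, ...

    if opts:                                      # e.g. ["decode.decode_batch_size", "1"]
        config.merge_from_list(opts)
    config.freeze()
    return config
```

The merged object is what `--opts decode.decoding_method ${type}` and `--opts decode.decode_batch_size ${batch_size}` mutate in the shell scripts in this patch.
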
@@ -76,10 +78,11 @@ for type in ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -96,10 +99,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/librispeech/asr1/local/test_wav.sh b/examples/librispeech/asr1/local/test_wav.sh index ab6d685d8..e70fc83c8 100755 --- a/examples/librispeech/asr1/local/test_wav.sh +++ b/examples/librispeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ @@ -49,10 +50,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} #score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel}.model --wer true ${expdir}/${decode_dir} ${dict} diff --git a/examples/librispeech/asr1/run.sh b/examples/librispeech/asr1/run.sh index f839e5af7..116dae126 100755 --- a/examples/librispeech/asr1/run.sh +++ b/examples/librispeech/asr1/run.sh @@ -8,6 +8,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=30 audio_file=data/demo_002_en.wav @@ -34,17 +35,17 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} 
${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml new file mode 100644 index 000000000..384ed197d --- /dev/null +++ b/examples/librispeech/asr2/conf/decode/decode_base.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 1 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml index a16563a59..32d95b414 100644 --- a/examples/librispeech/asr2/conf/transformer.yaml +++ b/examples/librispeech/asr2/conf/transformer.yaml @@ -1,73 +1,80 @@ # https://yaml.org/type/float.html -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean -collator: - vocab_filepath: data/lang_char/train_960_unigram5000_units.txt - 
unit_type: spm - spm_model_prefix: data/lang_char/train_960_unigram5000 - feat_dim: 83 - stride_ms: 10.0 - window_ms: 25.0 - sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs - batch_size: 30 - maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced - maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced - minibatches: 0 # for debug - batch_count: auto - batch_bins: 0 - batch_frames_in: 0 - batch_frames_out: 0 - batch_frames_inout: 0 - augmentation_config: conf/preprocess.yaml - num_workers: 0 - subsampling_factor: 1 - num_encs: 1 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/train_960_unigram5000_units.txt +unit_type: spm +spm_model_prefix: data/lang_char/train_960_unigram5000 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 30 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 120 - accum_grad: 2 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +log_interval: 1 +checkpoint: + kbest_n: 50 + latest_n: 5 optim: adam optim_conf: @@ -79,23 +86,5 @@ scheduler_conf: warmup_steps: 25000 lr_decay: 1.0 -decoding: - batch_size: 1 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/librispeech/asr2/local/align.sh b/examples/librispeech/asr2/local/align.sh index 626c35742..60a16f42b 100755 --- a/examples/librispeech/asr2/local/align.sh +++ b/examples/librispeech/asr2/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path dict_path ckpt_path_prefix" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path dict_path ckpt_path_prefix" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -dict_path=$2 -ckpt_prefix=$3 +decode_config_path=$2 +dict_path=$3 +ckpt_prefix=$4 batch_size=1 output_dir=${ckpt_prefix} @@ -24,9 +25,10 @@ python3 -u ${BIN_DIR}/test.py \ --dict-path ${dict_path} \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result-file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/librispeech/asr2/local/test.sh b/examples/librispeech/asr2/local/test.sh index d210f2a85..8cf3b52c0 100755 --- a/examples/librispeech/asr2/local/test.sh +++ b/examples/librispeech/asr2/local/test.sh @@ -19,6 +19,7 @@ bpeprefix=data/lang_char/${train_set}_${bpemode}${nbpe} bpemodel=${bpeprefix}.model config_path=conf/transformer.yaml +decode_config_path=conf/decode/decode_base.yaml dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt ckpt_prefix= @@ -79,11 +80,12 @@ for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_resco --ngpu ${ngpu} \ --dict-path ${dict} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --checkpoint_path ${ckpt_prefix} \ --result-file ${decode_dir}/data.JOB.json \ - --opts decoding.decoding_method ${dmethd} \ - --opts decoding.batch_size ${batch_size} \ - --opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} + --opts decode.decoding_method ${dmethd} \ + --opts decode.decode_batch_size ${batch_size} \ + --opts test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask} score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer false ${decode_dir} ${dict} diff --git a/examples/librispeech/asr2/run.sh b/examples/librispeech/asr2/run.sh index 5b7596f2d..c9a794e34 100755 --- a/examples/librispeech/asr2/run.sh +++ b/examples/librispeech/asr2/run.sh @@ -9,7 +9,8 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=50 conf_path=conf/transformer.yaml -dict_path=lang_char/train_960_unigram5000_units.txt +decode_conf_path=conf/decode/decode_base.yaml +dict_path=data/lang_char/train_960_unigram5000_units.txt avg_num=10 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -35,7 +36,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # attetion resocre decoder - ./local/test.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + ./local/test.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then @@ -45,7 +46,7 @@ fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} ${dict_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 5bb163e1d..4f7680e84 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -55,7 +55,7 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] Train a TransformerTTS model with LJSpeech TTS dataset. 
@@ -69,7 +69,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. ``` @@ -103,7 +102,7 @@ usage: synthesize.py [-h] [--transformer-tts-config TRANSFORMER_TTS_CONFIG] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] + [--ngpu NGPU] Synthesize with transformer tts & waveflow. @@ -127,7 +126,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` `./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file. ```bash @@ -142,7 +140,6 @@ usage: synthesize_e2e.py [-h] [--waveflow-checkpoint WAVEFLOW_CHECKPOINT] [--phones-dict PHONES_DICT] [--text TEXT] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with transformer tts & waveflow. @@ -165,7 +162,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--transformer-tts-config`, `--transformer-tts-checkpoint`, `--transformer-tts-stat` and `--phones-dict` are arguments for transformer_tts, which correspond to the 4 files in the transformer_tts pretrained model. 2. `--waveflow-config`, `--waveflow-checkpoint` are arguments for waveflow, which correspond to the 2 files in the waveflow pretrained model. diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index 692c9746a..f3602c347 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -62,8 +62,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -77,11 +77,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 9dd0f5cc3..6fcb2a520 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -57,8 +57,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -73,7 +73,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. 
benchmark: arguments related to benchmark. @@ -103,7 +102,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -120,7 +118,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml index bef2d6814..2d39beb79 100644 --- a/examples/ljspeech/voc1/conf/default.yaml +++ b/examples/ljspeech/voc1/conf/default.yaml @@ -72,10 +72,7 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### batch_size: 8 # Batch size. batch_max_steps: 25600 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # diff --git a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml index c2d692263..c2db2c7c2 100644 --- a/examples/other/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/aishell/conf/deepspeech2.yaml @@ -1,67 +1,65 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 - max_input_len: 27.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.0 +max_input_len: 27.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: 
False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 4333 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False +blank_id: 4333 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 + -decoding: - batch_size: 32 - error_rate_type: cer - decoding_method: ctc_beam_search - lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm - alpha: 2.6 - beta: 5.0 - beam_size: 300 - cutoff_prob: 0.99 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/aishell/conf/tuning/decode.yaml b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml new file mode 100644 index 000000000..b5283a934 --- /dev/null +++ b/examples/other/1xt2x/aishell/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: cer +decoding_method: ctc_beam_search +lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm +alpha: 2.6 +beta: 5.0 +beam_size: 300 +cutoff_prob: 0.99 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/aishell/local/test.sh b/examples/other/1xt2x/aishell/local/test.sh index 8cbff2352..463593ef3 100755 --- a/examples/other/1xt2x/aishell/local/test.sh +++ b/examples/other/1xt2x/aishell/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_ch.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/aishell/run.sh b/examples/other/1xt2x/aishell/run.sh index 1ccac1c35..89a634119 100755 --- a/examples/other/1xt2x/aishell/run.sh +++ b/examples/other/1xt2x/aishell/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=2 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml index be51a9b90..0c08fbc63 100644 --- a/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: .inf # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: .inf # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 1024 - use_gru: True - share_rnn_weights: False - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 1024 +use_gru: True +share_rnn_weights: False 
+blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 1.4 - beta: 0.35 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml new file mode 100644 index 000000000..f52dde320 --- /dev/null +++ b/examples/other/1xt2x/baidu_en8k/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 1.4 +beta: 0.35 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/baidu_en8k/local/test.sh b/examples/other/1xt2x/baidu_en8k/local/test.sh index a627ef722..ea40046b1 100755 --- a/examples/other/1xt2x/baidu_en8k/local/test.sh +++ b/examples/other/1xt2x/baidu_en8k/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/baidu_en8k/run.sh b/examples/other/1xt2x/baidu_en8k/run.sh index b7f69f6b5..82de56b09 100755 --- a/examples/other/1xt2x/baidu_en8k/run.sh +++ b/examples/other/1xt2x/baidu_en8k/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=0 @@ -23,6 +24,6 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml index ad7fb2c19..a2a5649ba 100644 --- a/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/other/1xt2x/librispeech/conf/deepspeech2.yaml @@ -1,67 +1,64 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test-clean - min_input_len: 0.0 - max_input_len: 1000.0 # second - min_output_len: 0.0 - max_output_len: .inf - min_output_input_ratio: 0.00 - max_output_input_ratio: .inf +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test-clean +min_input_len: 0.0 +max_input_len: 1000.0 # second +min_output_len: 0.0 +max_output_len: .inf +min_output_input_ratio: 0.00 +max_output_input_ratio: .inf -collator: - batch_size: 64 # one gpu - mean_std_filepath: data/mean_std.npz - unit_type: char - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +batch_size: 64 # one gpu +mean_std_filepath: data/mean_std.npz +unit_type: char +vocab_filepath: data/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 28 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False 
+share_rnn_weights: True +blank_id: 28 + +########################################### +# Training # +########################################### +n_epoch: 80 +accum_grad: 1 +lr: 2e-3 +lr_decay: 0.83 +weight_decay: 1e-06 +global_grad_clip: 3.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 -training: - n_epoch: 80 - accum_grad: 1 - lr: 2e-3 - lr_decay: 0.83 - weight_decay: 1e-06 - global_grad_clip: 3.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - -decoding: - batch_size: 32 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml new file mode 100644 index 000000000..f3b51defe --- /dev/null +++ b/examples/other/1xt2x/librispeech/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 32 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 \ No newline at end of file diff --git a/examples/other/1xt2x/librispeech/local/test.sh b/examples/other/1xt2x/librispeech/local/test.sh index a627ef722..ea40046b1 100755 --- a/examples/other/1xt2x/librispeech/local/test.sh +++ b/examples/other/1xt2x/librispeech/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/other/1xt2x/librispeech/run.sh b/examples/other/1xt2x/librispeech/run.sh index 8c667de2e..8b614bbbf 100755 --- a/examples/other/1xt2x/librispeech/run.sh +++ b/examples/other/1xt2x/librispeech/run.sh @@ -5,6 +5,7 @@ source path.sh stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline gpus=1 @@ -23,5 +24,5 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${v18_ckpt} ${model_type}|| exit -1 fi diff --git a/examples/other/1xt2x/src_deepspeech2x/bin/test.py b/examples/other/1xt2x/src_deepspeech2x/bin/test.py index b4f9cdf9d..88a13fdca 100644 --- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py +++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py @@ -13,8 +13,8 @@ # limitations under the License. 
"""Evaluation for DeepSpeech2 model.""" from src_deepspeech2x.test_model import DeepSpeech2Tester as Tester +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +41,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py index ad83a41db..fb8b321ce 100644 --- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py +++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Deepspeech2 ASR Model""" -from typing import Optional - import paddle from paddle import nn from src_deepspeech2x.models.ds2.rnn import RNNStack -from yacs.config import CfgNode from paddlespeech.s2t.models.ds2.conv import ConvStack from paddlespeech.s2t.modules.ctc import CTCDecoder @@ -120,20 +117,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
- )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -166,13 +149,13 @@ class DeepSpeech2Model(nn.Layer): """Compute Model loss Args: - audio (Tenosr): [B, T, D] + audio (Tensor): [B, T, D] audio_len (Tensor): [B] text (Tensor): [B, U] text_len (Tensor): [B] Returns: - loss (Tenosr): [1] + loss (Tensor): [1] """ eouts, eouts_len = self.encoder(audio, audio_len) loss = self.decoder(eouts, eouts_len, text, text_len) @@ -233,11 +216,11 @@ class DeepSpeech2Model(nn.Layer): """ model = cls(feat_size=dataloader.collate_fn.feature_size, dict_size=len(dataloader.collate_fn.vocab_list), - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights) + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) logger.info(f"checkpoint info: {infos}") @@ -250,7 +233,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/examples/other/1xt2x/src_deepspeech2x/test_model.py b/examples/other/1xt2x/src_deepspeech2x/test_model.py index 82e190d81..2a38fb5cd 100644 --- a/examples/other/1xt2x/src_deepspeech2x/test_model.py +++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py @@ -15,8 +15,6 @@ import time from collections import defaultdict from contextlib import nullcontext -from pathlib import Path -from typing import Optional import numpy as np import paddle @@ -24,7 +22,6 @@ from paddle import distributed as dist from paddle.io import DataLoader from src_deepspeech2x.models.ds2 import DeepSpeech2InferModel from src_deepspeech2x.models.ds2 import DeepSpeech2Model -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator @@ -44,27 +41,11 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -98,7 +79,7 @@ class DeepSpeech2Trainer(Trainer): iteration_time = time.time() - start msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -126,7 +107,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in 
valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -146,15 +127,15 @@ class DeepSpeech2Trainer(Trainer): def setup_model(self): config = self.config.clone() config.defrost() - config.model.feat_size = self.train_loader.collate_fn.feature_size - #config.model.dict_size = self.train_loader.collate_fn.vocab_size - config.model.dict_size = len(self.train_loader.collate_fn.vocab_list) + config.feat_size = self.train_loader.collate_fn.feature_size + #config.dict_size = self.train_loader.collate_fn.vocab_size + config.dict_size = len(self.train_loader.collate_fn.vocab_list) config.freeze() if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -163,17 +144,13 @@ class DeepSpeech2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.model = model @@ -184,59 +161,59 @@ class DeepSpeech2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" + config.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True + config.augmentation_config = "" collate_fn_test = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, 
collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev) self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test) @@ -250,31 +227,10 @@ class DeepSpeech2Trainer(Trainer): class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab_filepath=None) + unit_type=config.unit_type, vocab=None) super().__init__(config, args) def ordid2token(self, texts, texts_len): @@ -293,7 +249,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer @@ -399,31 +355,3 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): self.export() except KeyboardInterrupt: exit(-1) - - def setup(self): - """Setup the experiment. - """ - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') - - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output: - output_dir = Path(self.args.output).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md index d734cc0ca..c0f55bd42 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -10,11 +10,11 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg WER` of g2p is: 0.027495061517943988 +The `avg WER` of g2p is: 0.027124048652822204 ```text ,--------------------------------------------------------------------. 
| | # Snt # Wrd | Corr Sub Del Ins Err S.Err | |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | + | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.2 | `--------------------------------------------------------------------' ``` diff --git a/examples/other/g2p/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py index 9b43ca620..8fa3e53cd 100644 --- a/examples/other/g2p/get_g2p_data.py +++ b/examples/other/g2p/get_g2p_data.py @@ -28,7 +28,8 @@ def get_baker_data(root_dir): alignment_files = [f for f in alignment_files if f.stem not in exclude] data_dict = defaultdict(dict) for alignment_fp in alignment_files: - alignment = textgrid.openTextgrid(alignment_fp, includeEmptyIntervals=True) + alignment = textgrid.openTextgrid( + alignment_fp, includeEmptyIntervals=True) # only with baker's annotation utt_id = alignment.tierNameList[0].split(".")[0] intervals = alignment.tierDict[alignment.tierNameList[0]].entryList diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md index 596b1815c..3b80de661 100644 --- a/examples/other/tn/README.md +++ b/examples/other/tn/README.md @@ -7,11 +7,11 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg CER` of text normalization is: 0.006388318503308237 +The `avg CER` of text normalization is: 0.00730093543235227 ```text ,-----------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | |--------+--------------+-----------------------------------------| - | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | + | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.2 0.8 4.8 | `-----------------------------------------------------------------' ``` diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml new file mode 100644 index 000000000..d3992cb9f --- /dev/null +++ b/examples/ted_en_zh/st0/conf/preprocess.yaml @@ -0,0 +1,25 @@ +process: + # extract kaldi fbank from PCM + - type: fbank_kaldi + fs: 16000 + n_mels: 80 + n_shift: 160 + win_length: 400 + dither: 0.1 + - type: cmvn_json + cmvn_path: data/mean_std.json + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml index 36f287b10..d113fc943 100644 --- a/examples/ted_en_zh/st0/conf/transformer.yaml +++ b/examples/ted_en_zh/st0/conf/transformer.yaml @@ -1,109 +1,98 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of 
units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml index 78887d3cd..a01ec1a6d 100644 --- a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml @@ -1,112 +1,102 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.05 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test +min_input_len: 0.05 # second +max_input_len: 30.0 # second +min_output_len: 0.0 # tokens +max_output_len: 400.0 # tokens +min_output_input_ratio: 0.01 +max_output_input_ratio: 20.0 -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/bpe_unigram_8000 +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +batch_size: 16 +maxlen_in: 5 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +raw_wav: True # use raw_wav or kaldi feature +spectrum_type: fbank #linear, mfcc, fbank +feat_dim: 80 +delta_delta: False +dither: 1.0 +target_sample_rate: 16000 +max_freq: None +n_fft: None +stride_ms: 10.0 +window_ms: 25.0 +use_dB_normalization: True +target_dB: -20 +random_seed: 0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related 
+encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 120 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 50 - checkpoint: - kbest_n: 50 - latest_n: 5 +########################################### +# Training # +########################################### +n_epoch: 120 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 1.0e-06 +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml new file mode 100644 index 000000000..ed081cf4a --- /dev/null +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st0/local/test.sh b/examples/ted_en_zh/st0/local/test.sh index a9b18dd98..904f95c4a 100755 --- a/examples/ted_en_zh/st0/local/test.sh +++ b/examples/ted_en_zh/st0/local/test.sh @@ -1,7 +1,7 @@ #! 
/usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,18 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" - batch_size=32 python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st0/run.sh b/examples/ted_en_zh/st0/run.sh index b85ba95a3..1746c0251 100755 --- a/examples/ted_en_zh/st0/run.sh +++ b/examples/ted_en_zh/st0/run.sh @@ -6,6 +6,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/ted_en_zh/st1/RESULTS.md b/examples/ted_en_zh/st1/RESULTS.md index e8aed53ec..66dbce6cd 100644 --- a/examples/ted_en_zh/st1/RESULTS.md +++ b/examples/ted_en_zh/st1/RESULTS.md @@ -12,5 +12,5 @@ ## Transformer | Model | Params | Config | Val loss | Char-BLEU | | --- | --- | --- | --- | --- | -| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 19.45 | +| FAT + Transformer+ASR MTL | 50.26M | conf/transformer_mtl_noam.yaml | 69.91 | 20.26 | | FAT + Transformer+ASR MTL with word reward | 50.26M | conf/transformer_mtl_noam.yaml | 62.86 | 20.80 | diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml b/examples/ted_en_zh/st1/conf/preprocess.yaml new file mode 100644 index 000000000..bc86d98c1 --- /dev/null +++ b/examples/ted_en_zh/st1/conf/preprocess.yaml @@ -0,0 +1,16 @@ +process: + # these three processes are a.k.a. 
SpecAugument + - type: time_warp + max_time_warp: 5 + inplace: true + mode: PIL + - type: freq_mask + F: 30 + n_mask: 2 + inplace: true + replace_with_zero: false + - type: time_mask + T: 40 + n_mask: 2 + inplace: true + replace_with_zero: false diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml index 609c58240..515edee20 100644 --- a/examples/ted_en_zh/st1/conf/transformer.yaml +++ b/examples/ted_en_zh/st1/conf/transformer.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train.tiny - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/bpe_unigram_8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder 
input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.0 - ctc_weight: 0.0 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.0 + ctc_weight: 0.0 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. 
+scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index 10eccd1eb..a5f956fab 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 5.0 # frame - max_input_len: 3000.0 # frame - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.01 - max_output_input_ratio: 20.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt - unit_type: 'spm' - spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 - mean_std_filepath: "" - # augmentation_config: conf/augmentation.json - batch_size: 10 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 83 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt +unit_type: 'spm' +spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 +mean_std_filepath: "" +# preprocess_config: conf/augmentation.json +batch_size: 20 +feat_dim: 83 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: None - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: None +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: 
true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - asr_weight: 0.5 - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + asr_weight: 0.5 + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 20 - accum_grad: 2 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 2.5 - weight_decay: 1e-06 - scheduler: noam - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 5 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 5 - error_rate_type: char-bleu - decoding_method: fullsentence # 'fullsentence', 'simultaneous' - alpha: 2.5 - beta: 0.3 - beam_size: 10 - word_reward: 0.7 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 40 +accum_grad: 2 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 2.5 + weight_decay: 0. +scheduler: noam +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 50 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml new file mode 100644 index 000000000..d6104dbce --- /dev/null +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -0,0 +1,12 @@ + +batch_size: 5 +error_rate_type: char-bleu +decoding_method: fullsentence # 'fullsentence', 'simultaneous' +beam_size: 10 +word_reward: 0.7 +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/test.sh b/examples/ted_en_zh/st1/local/test.sh index a9b18dd98..904f95c4a 100755 --- a/examples/ted_en_zh/st1/local/test.sh +++ b/examples/ted_en_zh/st1/local/test.sh @@ -1,7 +1,7 @@ #! /usr/bin/env bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,18 +9,18 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 for type in fullsentence; do echo "decoding ${type}" - batch_size=32 python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f6362a8b3..1808e37b4 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml +decode_conf_path=conf/tuning/decode.yaml ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data @@ -27,7 +28,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ -n "${ckpt_path}" ]; then echo "Finetune from Pretrained Model" ${ckpt_path} ./local/download_pretrain.sh || exit -1 - fi + fi CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi @@ -38,5 +39,5 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 -fi \ No newline at end of file + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 +fi diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml index dd4cfd273..f7f4c58d5 100644 --- a/examples/timit/asr1/conf/preprocess.yaml +++ b/examples/timit/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml index 1c6059e4a..4731395f0 100644 --- a/examples/timit/asr1/conf/transformer.yaml +++ b/examples/timit/asr1/conf/transformer.yaml @@ -1,110 +1,89 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.0 # second - max_input_len: 10.0 # second - min_output_len: 0.0 # tokens - max_output_len: 150.0 # tokens - min_output_input_ratio: 0.005 - max_output_input_ratio: 1000.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: "word" - mean_std_filepath: "" - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: "word" +mean_std_filepath: "" +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 128 # dimension of attention - attention_heads: 4 - linear_units: 1024 # the number of units of position-wise feed forward - num_blocks: 6 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 128 # dimension of attention + attention_heads: 4 + linear_units: 1024 # the number of units of position-wise feed forward + num_blocks: 6 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 1024 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + 
attention_heads: 4 + linear_units: 1024 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.5 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.5 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 50 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.004 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 1200 - lr_decay: 1.0 - log_interval: 10 - checkpoint: - kbest_n: 50 - latest_n: 5 - - -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. - +########################################### +# Training # +########################################### +n_epoch: 50 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.004 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 1200 + lr_decay: 1.0 +log_interval: 10 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml new file mode 100644 index 000000000..805dd02fa --- /dev/null +++ b/examples/timit/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/timit/asr1/local/align.sh b/examples/timit/asr1/local/align.sh index c65d611c4..14d91d687 100755 --- a/examples/timit/asr1/local/align.sh +++ b/examples/timit/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/timit/asr1/local/test.sh b/examples/timit/asr1/local/test.sh index 08ee0e365..88192c583 100755 --- a/examples/timit/asr1/local/test.sh +++ b/examples/timit/asr1/local/test.sh @@ -7,8 +7,8 @@ stop_stage=50 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1; -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -17,7 +17,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -43,10 +44,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -63,10 +65,11 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -82,10 +85,11 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
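The reworked test scripts above now take the tuning/decode config as a separate argument and forward it to `test.py` via `--decode_cfg`, with decode-time overrides addressed under the flat `decode.*` namespace (`decode.decoding_method`, `decode.decode_batch_size`) instead of the old nested `decoding.*` keys. A minimal usage sketch under these changes; the checkpoint name, GPU id, and result-file path are illustrative:

```bash
# Stage-3 style call with the split decode config (paths are illustrative).
CUDA_VISIBLE_DEVICES=0 ./local/test.sh \
    conf/transformer.yaml \
    conf/tuning/decode.yaml \
    exp/transformer/checkpoints/avg_10

# Roughly equivalent direct call to the test entry point
# (assumes BIN_DIR is set as in the recipe, e.g. by path.sh).
python3 -u ${BIN_DIR}/test.py \
    --ngpu 1 \
    --config conf/transformer.yaml \
    --decode_cfg conf/tuning/decode.yaml \
    --result_file exp/transformer/checkpoints/avg_10.attention.rsl \
    --checkpoint_path exp/transformer/checkpoints/avg_10 \
    --opts decode.decoding_method attention \
    --opts decode.decode_batch_size 64
```

The same three-argument pattern applies to the `ted_en_zh` recipes above, while `tiny/asr0` additionally appends `model_type` as a fourth argument.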
diff --git a/examples/timit/asr1/run.sh b/examples/timit/asr1/run.sh index a95b5f3ad..0d84be9f3 100755 --- a/examples/timit/asr1/run.sh +++ b/examples/timit/asr1/run.sh @@ -7,6 +7,7 @@ gpus=0,1,2,3 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=10 TIMIT_path=/path/to/TIMIT @@ -34,15 +35,15 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi -# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then -# # export ckpt avg_n -# CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit -# fi +if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then + # export ckpt avg_n + CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit +fi diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 7d841d474..64d432e26 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -1,70 +1,67 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 2 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 3 - rnn_layer_size: 2048 - use_gru: False - share_rnn_weights: True - blank_id: 0 
+############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 3 +rnn_layer_size: 2048 +use_gru: False +share_rnn_weights: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 0.8 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1.0e-5 +lr_decay: 0.8 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 393b6439f..74a4dc814 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -1,72 +1,68 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.0 - max_input_len: 30.0 - min_output_len: 0.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny +min_input_len: 0.0 +max_input_len: 30.0 +min_output_len: 0.0 +max_output_len: 400.0 +min_output_input_ratio: 0.05 +max_output_input_ratio: 10.0 -collator: - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/lang_char/vocab.txt - augmentation_config: conf/augmentation.json - random_seed: 0 - spm_model_prefix: - spectrum_type: linear - feat_dim: - delta_delta: False - stride_ms: 10.0 - window_ms: 20.0 - n_fft: None - max_freq: None - target_sample_rate: 16000 - use_dB_normalization: True - target_dB: -20 - dither: 1.0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 - batch_size: 4 +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +unit_type: char +vocab_filepath: data/lang_char/vocab.txt +augmentation_config: conf/augmentation.json +random_seed: 0 +spm_model_prefix: +spectrum_type: linear +feat_dim: 161 +delta_delta: False +stride_ms: 10.0 +window_ms: 20.0 +n_fft: None +max_freq: None +target_sample_rate: 16000 +use_dB_normalization: True +target_dB: -20 +dither: 1.0 +keep_transcription_text: False +sortagrad: True +shuffle_method: batch_shuffle +num_workers: 0 +batch_size: 4 -model: - num_conv_layers: 2 - num_rnn_layers: 4 - rnn_layer_size: 2048 - rnn_direction: forward - num_fc_layers: 2 - fc_layers_size_list: 512, 256 - use_gru: True - blank_id: 0 +############################################ +# Network Architecture # +############################################ +num_conv_layers: 2 +num_rnn_layers: 4 +rnn_layer_size: 2048 +rnn_direction: forward +num_fc_layers: 2 +fc_layers_size_list: 512, 256 +use_gru: True +blank_id: 0 -training: - n_epoch: 5 - accum_grad: 1 - lr: 1e-5 - lr_decay: 1.0 - weight_decay: 1e-06 - global_grad_clip: 5.0 - log_interval: 1 - 
checkpoint: - kbest_n: 3 - latest_n: 2 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +lr: 1.0e-5 +lr_decay: 1.0 +weight_decay: 1.0e-6 +global_grad_clip: 5.0 +log_interval: 1 +checkpoint: + kbest_n: 3 + latest_n: 2 - -decoding: - batch_size: 128 - error_rate_type: wer - decoding_method: ctc_beam_search - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 500 - cutoff_prob: 1.0 - cutoff_top_n: 40 - num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..94c3dbdee --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml new file mode 100644 index 000000000..94c3dbdee --- /dev/null +++ b/examples/tiny/asr0/conf/tuning/decode.yaml @@ -0,0 +1,10 @@ +decode_batch_size: 128 +error_rate_type: wer +decoding_method: ctc_beam_search +lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm +alpha: 2.5 +beta: 0.3 +beam_size: 500 +cutoff_prob: 1.0 +cutoff_top_n: 40 +num_proc_bsearch: 8 diff --git a/examples/tiny/asr0/local/test.sh b/examples/tiny/asr0/local/test.sh index a627ef722..ea40046b1 100755 --- a/examples/tiny/asr0/local/test.sh +++ b/examples/tiny/asr0/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix model_type" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix model_type" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." 
config_path=$1 -ckpt_prefix=$2 -model_type=$3 +decode_config_path=$2 +ckpt_prefix=$3 +model_type=$4 # download language model bash local/download_lm_en.sh @@ -21,6 +22,7 @@ fi python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.rsl \ --checkpoint_path ${ckpt_prefix} \ --model_type ${model_type} diff --git a/examples/tiny/asr0/run.sh b/examples/tiny/asr0/run.sh index f39fb3fa0..25f046245 100755 --- a/examples/tiny/asr0/run.sh +++ b/examples/tiny/asr0/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=100 conf_path=conf/deepspeech2.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 model_type=offline @@ -32,7 +33,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${model_type} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml index ad27478de..8f785121f 100644 --- a/examples/tiny/asr1/conf/chunk_confermer.yaml +++ b/examples/tiny/asr1/conf/chunk_confermer.yaml @@ -1,120 +1,97 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 30.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: True - use_dynamic_chunk: True - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" 
+cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + causal: True + use_dynamic_chunk: True + cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster + use_dynamic_left_chunk: false - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
- + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +preprocess_config: conf/preprocess.yaml +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml index 298518fb5..2570bb852 100644 --- a/examples/tiny/asr1/conf/chunk_transformer.yaml +++ b/examples/tiny/asr1/conf/chunk_transformer.yaml @@ -1,113 +1,91 @@ -# https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - - -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, 
conv2d6 and conv2d8 + normalize_before: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 - +# https://yaml.org/type/float.html +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. 
+########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml index eb8509024..eb8f0ab9f 100644 --- a/examples/tiny/asr1/conf/conformer.yaml +++ b/examples/tiny/asr1/conf/conformer.yaml @@ -1,116 +1,97 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: "" - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +############################################ +# Network Architecture # +############################################ +cmvn_file: "data/mean_std.json" +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true + use_cnn_module: True + cnn_module_kernel: 15 + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 -# network architecture -model: - cmvn_file: "data/mean_std.json" - cmvn_file_type: "json" - # encoder related - encoder: conformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true - use_cnn_module: True - cnn_module_kernel: 15 - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - 
src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: "" +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 4 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 10 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 4 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 10 + latest_n: 1 -decoding: - batch_size: 64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml index dd4cfd273..f7f4c58d5 100644 --- a/examples/tiny/asr1/conf/preprocess.yaml +++ b/examples/tiny/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. 
SpecAugument diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml index c641d1f5b..4e3068d15 100644 --- a/examples/tiny/asr1/conf/transformer.yaml +++ b/examples/tiny/asr1/conf/transformer.yaml @@ -1,110 +1,90 @@ # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.tiny - dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - min_input_len: 0.5 # second - max_input_len: 20.0 # second - min_output_len: 0.0 # tokens - max_output_len: 400.0 # tokens - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 - -collator: - mean_std_filepath: data/mean_std.json - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/lang_char/bpe_unigram_200' - augmentation_config: conf/preprocess.yaml - batch_size: 4 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 - -# network architecture -model: - cmvn_file: - cmvn_file_type: "json" - # encoder related - encoder: transformer - encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: true +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: transformer +encoder_conf: + output_size: 256 # dimension of attention + attention_heads: 4 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: true - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false +########################################### +# Data # +########################################### +train_manifest: data/manifest.tiny +dev_manifest: data/manifest.tiny +test_manifest: data/manifest.tiny + +########################################### +# Dataloader # +########################################### +mean_std_filepath: data/mean_std.json +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'spm' +spm_model_prefix: 'data/lang_char/bpe_unigram_200' +preprocess_config: 
conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 4 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 5 - accum_grad: 1 - global_grad_clip: 5.0 - optim: adam - optim_conf: - lr: 0.002 - weight_decay: 1e-06 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 25000 - lr_decay: 1.0 - log_interval: 1 - checkpoint: - kbest_n: 2 - latest_n: 1 +########################################### +# Training # +########################################### +n_epoch: 5 +accum_grad: 1 +global_grad_clip: 5.0 +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 1 +checkpoint: + kbest_n: 2 + latest_n: 1 -decoding: - batch_size: 8 #64 - error_rate_type: wer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml new file mode 100644 index 000000000..c5b641dae --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml new file mode 100644 index 000000000..a0984f9ee --- /dev/null +++ b/examples/tiny/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 8 #64 +error_rate_type: wer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. 
+num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. diff --git a/examples/tiny/asr1/local/align.sh b/examples/tiny/asr1/local/align.sh index c65d611c4..14d91d687 100755 --- a/examples/tiny/asr1/local/align.sh +++ b/examples/tiny/asr1/local/align.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 batch_size=1 output_dir=${ckpt_prefix} @@ -20,9 +21,10 @@ mkdir -p ${output_dir} python3 -u ${BIN_DIR}/alignment.py \ --ngpu ${ngpu} \ --config ${config_path} \ +--decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.align \ --checkpoint_path ${ckpt_prefix} \ ---opts decoding.batch_size ${batch_size} +--opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in ctc alignment!" diff --git a/examples/tiny/asr1/local/test.sh b/examples/tiny/asr1/local/test.sh index 190bacffc..79df969b4 100755 --- a/examples/tiny/asr1/local/test.sh +++ b/examples/tiny/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -33,10 +34,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -50,10 +52,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${ckpt_prefix}.${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" 
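With decode options split out into `conf/tuning/decode.yaml`, a minimal sketch of calling the test entry point directly looks like the following; the checkpoint prefix and the value of `${BIN_DIR}` are assumptions for illustration, and individual decode options can still be overridden through `--opts decode.*`:

```bash
# Illustrative invocation; the checkpoint prefix is an assumption, not part of this patch.
python3 -u ${BIN_DIR}/test.py \
    --ngpu 1 \
    --config conf/transformer.yaml \
    --decode_cfg conf/tuning/decode.yaml \
    --result_file exp/transformer/checkpoints/avg_1.attention.rsl \
    --checkpoint_path exp/transformer/checkpoints/avg_1 \
    --opts decode.decoding_method attention \
    --opts decode.decode_batch_size 8
```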
diff --git a/examples/tiny/asr1/run.sh b/examples/tiny/asr1/run.sh index ec9c5a567..1651c034c 100755 --- a/examples/tiny/asr1/run.sh +++ b/examples/tiny/asr1/run.sh @@ -6,6 +6,7 @@ gpus=0 stage=0 stop_stage=50 conf_path=conf/transformer.yaml +decode_conf_path=conf/tuning/decode.yaml avg_num=1 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -31,12 +32,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=${gpus} ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 51 ] && [ ${stop_stage} -ge 51 ]; then diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 1f2c9338e..74c1086a0 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -65,8 +65,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--phones-dict PHONES_DICT] - [--speaker-dict SPEAKER_DICT] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + [--speaker-dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] Train a FastSpeech2 model. @@ -80,11 +80,12 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu=0, use cpu. - --verbose VERBOSE verbose. --phones-dict PHONES_DICT phone vocabulary file. --speaker-dict SPEAKER_DICT speaker id map file for multiple speaker model. + --voice-cloning VOICE_CLONING + whether training voice cloning model. ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. @@ -94,16 +95,16 @@ optional arguments: ### Synthesizing We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1) as the neural vocoder. -Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)and unzip it. +Download pretrained parallel wavegan model from [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip) and unzip it. ```bash -unzip pwg_vctk_ckpt_0.5.zip +unzip pwg_vctk_ckpt_0.1.1.zip ``` Parallel WaveGAN checkpoint contains files listed below. 
```text -pwg_vctk_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_vctk_ckpt_0.1.1 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` `./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. ```bash @@ -245,7 +246,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=exp/default/test_e2e \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ + --phones_dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ + --speaker_dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt \ --spk_id=0 ``` diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index a8aef034c..8381af464 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize.py \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --test_metadata=dump/test/norm/metadata.jsonl \ --output_dir=${train_output_path}/test \ --phones_dict=dump/phone_id_map.txt \ diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 954e8cb9a..51bb9e192 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -12,9 +12,9 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=${train_output_path}/test_e2e \ diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 78254d4e0..4714f28dc 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -62,8 +62,8 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--verbose VERBOSE] [--batch-size BATCH_SIZE] - [--max-iter MAX_ITER] [--run-benchmark RUN_BENCHMARK] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] [--profiler_options PROFILER_OPTIONS] Train a ParallelWaveGAN model. @@ -78,7 +78,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. benchmark: arguments related to benchmark. 
@@ -108,7 +107,6 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR] [--ngpu NGPU] - [--verbose VERBOSE] Synthesize with GANVocoder. @@ -125,7 +123,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - --verbose VERBOSE verbose. ``` 1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. @@ -135,15 +132,15 @@ optional arguments: 5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained models can be downloaded here [pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip). +Pretrained models can be downloaded here [pwg_vctk_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip). Parallel WaveGAN checkpoint contains files listed below. ```text -pwg_vctk_ckpt_0.5 -├── pwg_default.yaml # default config used to train parallel wavegan -├── pwg_snapshot_iter_1000000.pdz # generator parameters of parallel wavegan -└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan +pwg_vctk_ckpt_0.1.1 +├── default.yaml # default config used to train parallel wavegan +├── snapshot_iter_1500000.pdz # generator parameters of parallel wavegan +└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan ``` ## Acknowledgement We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml index d95eaad9d..59ce3825d 100644 --- a/examples/vctk/voc1/conf/default.yaml +++ b/examples/vctk/voc1/conf/default.yaml @@ -70,12 +70,9 @@ lambda_adv: 4.0 # Loss balancing coefficient. ########################################################### # DATA LOADER SETTING # ########################################################### -batch_size: 8 # Batch size. +batch_size: 6 # Batch size. batch_max_steps: 24000 # Length of each audio in batch. Make sure dividable by n_shift. -pin_memory: true # Whether to pin memory in Pytorch DataLoader. -num_workers: 4 # Number of workers in Pytorch DataLoader. -remove_short_samples: true # Whether to remove samples the length of which are less than batch_max_steps. -allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. +num_workers: 2 # Number of workers in DataLoader. ########################################################### # OPTIMIZER & SCHEDULER SETTING # @@ -103,7 +100,7 @@ discriminator_grad_norm: 1 # Discriminator's gradient norm. # INTERVAL SETTING # ########################################################### discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator. -train_max_steps: 1000000 # Number of training steps. +train_max_steps: 1500000 # Number of training steps. save_interval_steps: 5000 # Interval steps to save checkpoint. eval_interval_steps: 1000 # Interval steps to evaluate the network. 
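As a quick check after switching to the renamed vocoder release, a hedged sketch of fetching it and confirming the expected layout (the working directory is assumed to be the recipe root):

```bash
# Illustrative only: run from the recipe directory of your choice.
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip
unzip pwg_vctk_ckpt_0.1.1.zip
ls pwg_vctk_ckpt_0.1.1
# expected: default.yaml  feats_stats.npy  snapshot_iter_1500000.pdz
```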
diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml index a438236d8..6c2bbca41 100644 --- a/examples/wenetspeech/asr1/conf/conformer.yaml +++ b/examples/wenetspeech/asr1/conf/conformer.yaml @@ -1,111 +1,92 @@ -# network architecture -model: - # encoder related - encoder: conformer - encoder_conf: - output_size: 512 # dimension of attention - attention_heads: 8 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before: True - use_cnn_module: True - cnn_module_kernel: 15 - cnn_module_norm: layer_norm - activation_type: swish - pos_enc_layer_type: rel_pos - selfattention_layer_type: rel_selfattn +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8 + normalize_before: True + use_cnn_module: True + cnn_module_kernel: 15 + cnn_module_norm: layer_norm + activation_type: swish + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn - # decoder related - decoder: transformer - decoder_conf: - attention_heads: 8 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 - # hybrid CTC/attention - model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false # https://yaml.org/type/float.html -data: - train_manifest: data/manifest.train - dev_manifest: data/manifest.dev - test_manifest: data/manifest.test - min_input_len: 0.1 # second - max_input_len: 12.0 # second - min_output_len: 1.0 - max_output_len: 400.0 - min_output_input_ratio: 0.05 - max_output_input_ratio: 10.0 +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test -collator: - vocab_filepath: data/lang_char/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/preprocess.yaml - batch_size: 64 - raw_wav: True # use raw_wav or kaldi feature - spectrum_type: fbank #linear, mfcc, fbank - feat_dim: 80 - delta_delta: False - dither: 1.0 - target_sample_rate: 16000 - max_freq: None - n_fft: None - stride_ms: 10.0 - window_ms: 25.0 - use_dB_normalization: True - target_dB: -20 - random_seed: 0 - keep_transcription_text: False - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 2 +########################################### +# Dataloader # 
+########################################### +vocab_filepath: data/lang_char/vocab.txt +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +spm_model_prefix: '' +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 64 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 0 +subsampling_factor: 1 +num_encs: 1 -training: - n_epoch: 240 - accum_grad: 16 - global_grad_clip: 5.0 - log_interval: 100 - checkpoint: - kbest_n: 50 - latest_n: 5 - optim: adam - optim_conf: - lr: 0.001 - weight_decay: 1e-6 - scheduler: warmuplr - scheduler_conf: - warmup_steps: 5000 - lr_decay: 1.0 - - -decoding: - batch_size: 128 - error_rate_type: cer - decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' - lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm - alpha: 2.5 - beta: 0.3 - beam_size: 10 - cutoff_prob: 1.0 - cutoff_top_n: 0 - num_proc_bsearch: 8 - ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. - decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. - simulate_streaming: False # simulate streaming inference. Defaults to False. \ No newline at end of file +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 16 +global_grad_clip: 5.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 5000 + lr_decay: 1.0 diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml index dd4cfd273..f7f4c58d5 100644 --- a/examples/wenetspeech/asr1/conf/preprocess.yaml +++ b/examples/wenetspeech/asr1/conf/preprocess.yaml @@ -5,7 +5,7 @@ process: n_mels: 80 n_shift: 160 win_length: 400 - dither: true + dither: 0.1 - type: cmvn_json cmvn_path: data/mean_std.json # these three processes are a.k.a. SpecAugument diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml new file mode 100644 index 000000000..6924bfa63 --- /dev/null +++ b/examples/wenetspeech/asr1/conf/tuning/decode.yaml @@ -0,0 +1,11 @@ +decode_batch_size: 128 +error_rate_type: cer +decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring' +beam_size: 10 +ctc_weight: 0.5 # ctc weight for attention rescoring decode mode. +decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. + # <0: for decoding, use full chunk. + # >0: for decoding, use fixed chunk size as set. + # 0: used for training, it's prohibited here. +num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1. +simulate_streaming: False # simulate streaming inference. Defaults to False. 
\ No newline at end of file diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh index 7dd478d19..d216dd84a 100755 --- a/examples/wenetspeech/asr1/local/data.sh +++ b/examples/wenetspeech/asr1/local/data.sh @@ -96,7 +96,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type="char" \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="data/lang_char/vocab.txt" \ --manifest_paths "data/manifest.train.raw" if [ $? -ne 0 ]; then diff --git a/examples/wenetspeech/asr1/local/test.sh b/examples/wenetspeech/asr1/local/test.sh index da159de73..65b884e51 100755 --- a/examples/wenetspeech/asr1/local/test.sh +++ b/examples/wenetspeech/asr1/local/test.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 2 ];then - echo "usage: ${0} config_path ckpt_path_prefix" +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" exit -1 fi @@ -9,7 +9,8 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 +decode_config_path=$2 +ckpt_prefix=$3 chunk_mode=false if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then @@ -36,10 +37,11 @@ for type in attention ctc_greedy_search; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" @@ -55,10 +57,11 @@ for type in ctc_prefix_beam_search attention_rescoring; do python3 -u ${BIN_DIR}/test.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 5c7794740..474642624 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -1,7 +1,7 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path ckpt_path_prefix audio_file" +if [ $# != 4 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" exit -1 fi @@ -9,8 +9,9 @@ ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." config_path=$1 -ckpt_prefix=$2 -audio_file=$3 +decode_config_path=$2 +ckpt_prefix=$3 +audio_file=$4 mkdir -p data wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ @@ -43,10 +44,11 @@ for type in attention_rescoring; do python3 -u ${BIN_DIR}/test_wav.py \ --ngpu ${ngpu} \ --config ${config_path} \ + --decode_cfg ${decode_config_path} \ --result_file ${output_dir}/${type}.rsl \ --checkpoint_path ${ckpt_prefix} \ - --opts decoding.decoding_method ${type} \ - --opts decoding.batch_size ${batch_size} \ + --opts decode.decoding_method ${type} \ + --opts decode.decode_batch_size ${batch_size} \ --audio_file ${audio_file} if [ $? 
-ne 0 ]; then diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh index d77f409fd..9995bc63e 100644 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -7,7 +7,7 @@ gpus=0,1,2,3,4,5,6,7 stage=0 stop_stage=100 conf_path=conf/conformer.yaml - +decode_conf_path=conf/tuning/decode.yaml average_checkpoint=true avg_num=10 @@ -36,12 +36,12 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # ctc alignment of test data - CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then @@ -51,5 +51,5 @@ fi if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md new file mode 100644 index 000000000..4dc68c6ff --- /dev/null +++ b/paddleaudio/CHANGELOG.md @@ -0,0 +1,2 @@ +# Changelog + diff --git a/paddleaudio/__init__.py b/paddleaudio/__init__.py index 2685cf57c..b717777d3 100644 --- a/paddleaudio/__init__.py +++ b/paddleaudio/__init__.py @@ -13,3 +13,5 @@ # limitations under the License. from .backends import * from .features import * + +__version__ = '0.1.0' diff --git a/paddleaudio/datasets/dataset.py b/paddleaudio/datasets/dataset.py index fb521beae..7a57fd6cc 100644 --- a/paddleaudio/datasets/dataset.py +++ b/paddleaudio/datasets/dataset.py @@ -36,6 +36,7 @@ class AudioClassificationDataset(paddle.io.Dataset): files: List[str], labels: List[int], feat_type: str='raw', + sample_rate: int=None, **kwargs): """ Ags: @@ -55,6 +56,7 @@ class AudioClassificationDataset(paddle.io.Dataset): self.labels = labels self.feat_type = feat_type + self.sample_rate = sample_rate self.feat_config = kwargs # Pass keyword arguments to customize feature config def _get_data(self, input_file: str): @@ -63,7 +65,11 @@ class AudioClassificationDataset(paddle.io.Dataset): def _convert_to_record(self, idx): file, label = self.files[idx], self.labels[idx] - waveform, sample_rate = load_audio(file) + if self.sample_rate is None: + waveform, sample_rate = load_audio(file) + else: + waveform, sample_rate = load_audio(file, sr=self.sample_rate) + feat_func = feat_funcs[self.feat_type] record = {} diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 34466ec2f..5ac7a3bca 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -1,5 +1,7 @@ # PaddleSpeech Command Line +([简体中文](./README_cn.md)|English) + The simplest approach to use PaddleSpeech models. ## Help @@ -28,3 +30,9 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" 
--output output.wav ``` + ## Text Post-precessing + +- Punctuation Restoration + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + ``` diff --git a/paddlespeech/cli/README_cn.md b/paddlespeech/cli/README_cn.md new file mode 100644 index 000000000..75ab9e41b --- /dev/null +++ b/paddlespeech/cli/README_cn.md @@ -0,0 +1,39 @@ +# PaddleSpeech 命令行工具 + +(简体中文|[English](./README.md)) + +`paddlespeech.cli` 模块是 PaddleSpeech 的命令行工具,它提供了最简便的方式调用 PaddleSpeech 提供的不同语音应用场景的预训练模型,用一行命令就可以进行模型预测: + + ## 命令行使用帮助 + ```bash + paddlespeech help + ``` + + ## 声音分类 + ```bash + paddlespeech cls --input input.wav + ``` + + ## 语音识别 + ``` + paddlespeech asr --lang zh --input input_16k.wav + ``` + + ## 语音翻译(英-中) + + (暂不支持Windows系统) + ```bash + paddlespeech st --input input_16k.wav + ``` + + ## 语音合成 + ```bash + paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav + ``` + + ## 文本后处理 + +- 标点恢复 + ```bash + paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + ``` diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 05fcc20a2..aa4e31d9e 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -31,6 +31,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.transform.transformation import Transformation from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -45,19 +46,29 @@ pretrained_models = { # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" "conformer_wenetspeech-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', 'md5': - '54e7a558a6e020c2f5fb224874943f97', + '76cb19ed857e6623856b7cd7ebbfeda4', 'cfg_path': - 'conf/conformer.yaml', + 'model.yaml', 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "transformer_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '2c667da24922aad391eacafe37bc1660', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/transformer/checkpoints/avg_10', + }, } model_alias = { - "ds2_offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "ds2_online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", "conformer": "paddlespeech.s2t.models.u2:U2Model", "transformer": "paddlespeech.s2t.models.u2:U2Model", "wenetspeech": "paddlespeech.s2t.models.u2:U2Model", @@ -84,7 +95,7 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en') + help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]') self.parser.add_argument( "--sample_rate", type=int, @@ -96,6 +107,12 @@ class ASRExecutor(BaseExecutor): type=str, default=None, help='Config of asr task. 
Use deault config when it is None.') + self.parser.add_argument( + '--decode_method', + type=str, + default='attention_rescoring', + choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'], + help='only support transformer and conformer model') self.parser.add_argument( '--ckpt_path', type=str, @@ -135,6 +152,7 @@ class ASRExecutor(BaseExecutor): lang: str='zh', sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', ckpt_path: Optional[os.PathLike]=None): """ Init model and other resources from a specific path. @@ -158,51 +176,36 @@ class ASRExecutor(BaseExecutor): else: self.cfg_path = os.path.abspath(cfg_path) self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") - res_path = os.path.dirname( + self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) #Init body. self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) - self.config.decoding.decoding_method = "attention_rescoring" with UpdateConfig(self.config): - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.mean_std_filepath = os.path.join( - res_path, self.config.collator.cmvn_path) + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join(res_path, self.config.decode.lang_model_path) self.collate_fn_test = SpeechCollator.from_config(self.config) - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.collate_fn_test.feature_size - self.config.model.output_dim = text_feature.vocab_size + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.vocab) elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.augmentation_config = os.path.join( - res_path, self.config.collator.augmentation_config) - self.config.collator.spm_model_prefix = os.path.join( - res_path, self.config.collator.spm_model_prefix) - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.config.collator.feat_dim - self.config.model.output_dim = text_feature.vocab_size + self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.config.decode.decoding_method = decode_method else: raise Exception("wrong type") - # Enter the path of model root - model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) - model_conf = self.config.model - logger.info(model_conf) + model_conf = self.config model = model_class.from_config(model_conf) self.model = model self.model.eval() @@ -221,32 +224,21 @@ class ASRExecutor(BaseExecutor): logger.info("Preprocess 
audio_file:" + audio_file) # Get the object for feature extraction - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: audio, _ = self.collate_fn_test.process_utterance( audio_file=audio_file, transcript=" ") audio_len = audio.shape[0] audio = paddle.to_tensor(audio, dtype='float32') audio_len = paddle.to_tensor(audio_len) audio = paddle.unsqueeze(audio, axis=0) - vocab_list = collate_fn_test.vocab_list + # vocab_list = collate_fn_test.vocab_list self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len logger.info(f"audio feat shape: {audio.shape}") elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: logger.info("get the preprocess conf") - preprocess_conf_file = self.config.collator.augmentation_config - # redirect the cmvn path - with io.open(preprocess_conf_file, encoding="utf-8") as f: - preprocess_conf = yaml.safe_load(f) - for idx, process in enumerate(preprocess_conf["process"]): - if process['type'] == "cmvn_json": - preprocess_conf["process"][idx][ - "cmvn_path"] = os.path.join( - self.res_path, - preprocess_conf["process"][idx]["cmvn_path"]) - break - logger.info(preprocess_conf) + preprocess_conf = self.config.preprocess_config preprocess_args = {"train": False} preprocessing = Transformation(preprocess_conf) logger.info("read the audio file") @@ -274,10 +266,7 @@ class ASRExecutor(BaseExecutor): audio_len = paddle.to_tensor(audio.shape[0]) audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0) - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + self._inputs["audio"] = audio self._inputs["audio_len"] = audio_len logger.info(f"audio feat shape: {audio.shape}") @@ -290,18 +279,15 @@ class ASRExecutor(BaseExecutor): """ Model inference and result stored in self.output. 
""" - text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - cfg = self.config.decoding + + cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: result_transcripts = self.model.decode( audio, audio_len, - text_feature.vocab_list, + self.text_feature.vocab_list, decoding_method=cfg.decoding_method, lang_model_path=cfg.lang_model_path, beam_alpha=cfg.alpha, @@ -316,7 +302,7 @@ class ASRExecutor(BaseExecutor): result_transcripts = self.model.decode( audio, audio_len, - text_feature=text_feature, + text_feature=self.text_feature, decoding_method=cfg.decoding_method, beam_size=cfg.beam_size, ctc_weight=cfg.ctc_weight, @@ -419,18 +405,20 @@ class ASRExecutor(BaseExecutor): config = parser_args.config ckpt_path = parser_args.ckpt_path audio_file = parser_args.input + decode_method = parser_args.decode_method force_yes = parser_args.yes device = parser_args.device try: res = self(audio_file, model, lang, sample_rate, config, ckpt_path, - force_yes, device) + decode_method, force_yes, device) logger.info('ASR Result: {}'.format(res)) return True except Exception as e: logger.exception(e) return False + @stats_wrapper def __call__(self, audio_file: os.PathLike, model: str='conformer_wenetspeech', @@ -438,6 +426,7 @@ class ASRExecutor(BaseExecutor): sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, + decode_method: str='attention_rescoring', force_yes: bool=False, device=paddle.get_device()): """ @@ -446,7 +435,7 @@ class ASRExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate, force_yes) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, ckpt_path) + self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) self.preprocess(model, audio_file) self.infer(model) res = self.postprocess() # Retrieve result of asr. 
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index c31ad3610..52bc1972d 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -26,6 +26,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddleaudio import load from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -245,6 +246,7 @@ class CLSExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__(self, audio_file: os.PathLike, model: str='panns_cnn14', diff --git a/paddlespeech/cli/log.py b/paddlespeech/cli/log.py index 891b71a94..8644064c7 100644 --- a/paddlespeech/cli/log.py +++ b/paddlespeech/cli/log.py @@ -43,8 +43,7 @@ class Logger(object): level) self.format = logging.Formatter( - fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s' - ) + fmt='[%(asctime)-15s] [%(levelname)8s] - %(message)s') self.handler = logging.StreamHandler() self.handler.setFormatter(self.format) diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 553b025f0..1276424c5 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -30,6 +30,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.utility import UpdateConfig @@ -39,11 +40,11 @@ __all__ = ["STExecutor"] pretrained_models = { "fat_st_ted-en-zh": { "url": - "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz", + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz", "md5": - "fa0a7425b91b4f8d259c70b2aca5ae67", + "d62063f35a16d91210a71081bd2dd557", "cfg_path": - "conf/transformer_mtl_noam.yaml", + "model.yaml", "ckpt_path": "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams", } @@ -169,24 +170,19 @@ class STExecutor(BaseExecutor): #Init body. 
self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) - self.config.decoding.decoding_method = "fullsentence" + self.config.decode.decoding_method = "fullsentence" with UpdateConfig(self.config): - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.cmvn_path = os.path.join( - res_path, self.config.collator.cmvn_path) - self.config.collator.spm_model_prefix = os.path.join( - res_path, self.config.collator.spm_model_prefix) + self.config.cmvn_path = os.path.join( + res_path, self.config.cmvn_path) + self.config.spm_model_prefix = os.path.join( + res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.config.collator.feat_dim - self.config.model.output_dim = self.text_feature.vocab_size - - model_conf = self.config.model - logger.info(model_conf) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + + model_conf = self.config model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) @@ -217,7 +213,7 @@ class STExecutor(BaseExecutor): logger.info("Preprocess audio_file:" + audio_file) if "fat_st" in model_type: - cmvn = self.config.collator.cmvn_path + cmvn = self.config.cmvn_path utt_name = "_tmp" # Get the object for feature extraction @@ -283,7 +279,7 @@ class STExecutor(BaseExecutor): """ Model inference and result stored in self.output. """ - cfg = self.config.decoding + cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] if model_type == "fat_st_ted": @@ -334,6 +330,7 @@ class STExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__(self, audio_file: os.PathLike, model: str='fat_st_ted', diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index da9c5fe05..1cef8fcfd 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -26,6 +26,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper __all__ = ['TextExecutor'] @@ -272,6 +273,7 @@ class TextExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__( self, text: str, diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index f60f42245..a39a5c4e6 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -29,6 +29,7 @@ from ..log import logger from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME +from ..utils import stats_wrapper from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -155,26 +156,52 @@ pretrained_models = { }, "pwgan_vctk-en": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip', 'md5': - '322ca688aec9b127cec2788b65aa3d52', + 'b3da1defcde3e578be71eb284cb89f2c', 'config': - 'pwg_default.yaml', + 'default.yaml', 'ckpt': - 'pwg_snapshot_iter_1000000.pdz', + 
'snapshot_iter_1500000.pdz', 'speech_stats': - 'pwg_stats.npy', + 'feats_stats.npy', }, # mb_melgan "mb_melgan_csmsc-zh": { 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip', + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', 'md5': - 'b69322ab4ea766d955bd3d9af7dc5f2d', + 'ee5f0604e20091f0d495b6ec4618b90d', 'config': - 'finetune.yaml', + 'default.yaml', 'ckpt': - 'snapshot_iter_2000000.pdz', + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # style_melgan + "style_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + '5de2d5348f396de0c966926b8c462755', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', 'speech_stats': 'feats_stats.npy', }, @@ -199,6 +226,14 @@ model_alias = { "paddlespeech.t2s.models.melgan:MelGANGenerator", "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", } @@ -266,7 +301,7 @@ class TTSExecutor(BaseExecutor): default='pwgan_csmsc', choices=[ 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc' + 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc' ], help='Choose vocoder type of tts task.') @@ -504,37 +539,47 @@ class TTSExecutor(BaseExecutor): am_name = am[:am.rindex('_')] am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False + merge_sentences = False if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': input_ids = self.frontend.get_input_ids( - text, merge_sentences=True, get_tone_ids=get_tone_ids) + text, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) phone_ids = input_ids["phone_ids"] - phone_ids = phone_ids[0] if get_tone_ids: tone_ids = input_ids["tone_ids"] - tone_ids = tone_ids[0] elif lang == 'en': - input_ids = self.frontend.get_input_ids(text) + input_ids = self.frontend.get_input_ids( + text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - # am - if am_name == 'speedyspeech': - mel = self.am_inference(phone_ids, tone_ids) - # fastspeech2 - else: - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - mel = self.am_inference( - phone_ids, spk_id=paddle.to_tensor(spk_id)) + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + # am + if am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + mel = self.am_inference(part_phone_ids, part_tone_ids) + # fastspeech2 else: - mel = self.am_inference(phone_ids) - - # voc - wav = self.voc_inference(mel) - self._outputs['wav'] = wav + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + mel = self.am_inference( + part_phone_ids, spk_id=paddle.to_tensor(spk_id)) + else: + mel = self.am_inference(part_phone_ids) + # voc + wav = self.voc_inference(mel) + if flags 
== 0: + wav_all = wav + flags = 1 + else: + wav_all = paddle.concat([wav_all, wav]) + self._outputs['wav'] = wav_all def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]: """ @@ -601,6 +646,7 @@ class TTSExecutor(BaseExecutor): logger.exception(e) return False + @stats_wrapper def __call__(self, text: str, am: str='fastspeech2_csmsc', diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index ee31b771b..63b670c86 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -11,22 +11,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import hashlib +import inspect +import json import os import tarfile +import threading +import time +import uuid import zipfile from typing import Any from typing import Dict +import paddle +import paddleaudio +import requests +import yaml from paddle.framework import load from . import download +from .. import __version__ from .entry import commands +requests.adapters.DEFAULT_RETRIES = 3 + __all__ = [ 'cli_register', 'get_command', 'download_and_decompress', 'load_state_dict_from_url', + 'stats_wrapper', ] @@ -101,6 +115,13 @@ def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: if not os.path.isdir(uncompress_path): download._decompress(filepath) else: + StatsWorker( + task='download', + version=__version__, + extra_info={ + 'download_url': archive['url'], + 'paddle_version': paddle.__version__ + }).start() uncompress_path = download.get_path_from_url(archive['url'], path, archive['md5']) @@ -146,3 +167,171 @@ def _get_sub_home(directory): PPSPEECH_HOME = _get_paddlespcceh_home() MODEL_HOME = _get_sub_home('models') +CONF_HOME = _get_sub_home('conf') + + +def _md5(text: str): + '''Calculate the md5 value of the input text.''' + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +class ConfigCache: + def __init__(self): + self._data = {} + self._initialize() + self.file = os.path.join(CONF_HOME, 'cache.yaml') + if not os.path.exists(self.file): + self.flush() + return + + with open(self.file, 'r') as file: + try: + cfg = yaml.load(file, Loader=yaml.FullLoader) + self._data.update(cfg) + except: + self.flush() + + @property + def cache_info(self): + return self._data['cache_info'] + + def _initialize(self): + # Set default configuration values. 
+ cache_info = _md5(str(uuid.uuid1())[-12:]) + "-" + str(int(time.time())) + self._data['cache_info'] = cache_info + + def flush(self): + '''Flush the current configuration into the configuration file.''' + with open(self.file, 'w') as file: + cfg = json.loads(json.dumps(self._data)) + yaml.dump(cfg, file) + + +stats_api = "http://paddlepaddle.org.cn/paddlehub/stat" +cache_info = ConfigCache().cache_info + + +class StatsWorker(threading.Thread): + def __init__(self, + task="asr", + model=None, + version=__version__, + extra_info={}): + threading.Thread.__init__(self) + self._task = task + self._model = model + self._version = version + self._extra_info = extra_info + + def run(self): + params = { + 'task': self._task, + 'version': self._version, + 'from': 'ppspeech' + } + if self._model: + params['model'] = self._model + + self._extra_info.update({ + 'cache_info': cache_info, + }) + params.update({"extra": json.dumps(self._extra_info)}) + + try: + requests.get(stats_api, params) + except Exception: + pass + + return + + +def _note_one_stat(cls_name, params={}): + task = cls_name.replace('Executor', '').lower() # XXExecutor + extra_info = { + 'paddle_version': paddle.__version__, + } + + if 'model' in params: + model = params['model'] + else: + model = None + + if 'audio_file' in params: + try: + _, sr = paddleaudio.load(params['audio_file']) + except Exception: + sr = -1 + + if task == 'asr': + extra_info.update({ + 'lang': params['lang'], + 'inp_sr': sr, + 'model_sr': params['sample_rate'], + }) + elif task == 'st': + extra_info.update({ + 'lang': + params['src_lang'] + '-' + params['tgt_lang'], + 'inp_sr': + sr, + 'model_sr': + params['sample_rate'], + }) + elif task == 'tts': + model = params['am'] + extra_info.update({ + 'lang': params['lang'], + 'vocoder': params['voc'], + }) + elif task == 'cls': + extra_info.update({ + 'inp_sr': sr, + }) + elif task == 'text': + extra_info.update({ + 'sub_task': params['task'], + 'lang': params['lang'], + }) + else: + return + + StatsWorker( + task=task, + model=model, + version=__version__, + extra_info=extra_info, ).start() + + +def _parse_args(func, *args, **kwargs): + # FullArgSpec(args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations) + argspec = inspect.getfullargspec(func) + + keys = argspec[0] + if keys[0] == 'self': # Remove self pointer. + keys = keys[1:] + + default_values = argspec[3] + values = [None] * (len(keys) - len(default_values)) + values.extend(list(default_values)) + params = dict(zip(keys, values)) + + for idx, v in enumerate(args): + params[keys[idx]] = v + for k, v in kwargs.items(): + params[k] = v + + return params + + +def stats_wrapper(executor_func): + def _warpper(self, *args, **kwargs): + try: + _note_one_stat( + type(self).__name__, _parse_args(executor_func, *args, + **kwargs)) + except Exception: + pass + return executor_func(self, *args, **kwargs) + + return _warpper diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py index 9cfd8b6ce..ffe42d390 100644 --- a/paddlespeech/cls/exps/panns/predict.py +++ b/paddlespeech/cls/exps/panns/predict.py @@ -12,58 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. 
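
Before moving on to the PANNs example scripts: the `stats_wrapper` helper added to `paddlespeech/cli/utils.py` above decorates every executor's `__call__`, using `inspect.getfullargspec` to rebuild the effective call parameters before reporting them. The following is a self-contained toy of that pattern, not the PaddleSpeech code itself; the demo class and the `print` are stand-ins for the real stats request.

```python
# Toy reimplementation of the stats_wrapper/_parse_args pattern above:
# rebuild the named parameters of a bound method call without changing its behaviour.
import inspect


def call_logger(func):
    def _wrapper(self, *args, **kwargs):
        spec = inspect.getfullargspec(func)
        keys = spec.args[1:]                          # drop 'self'
        defaults = list(spec.defaults or ())
        values = [None] * (len(keys) - len(defaults)) + defaults
        params = dict(zip(keys, values))              # start from the declared defaults
        params.update(zip(keys, args))                # then positional arguments
        params.update(kwargs)                         # then keyword arguments
        print(f'{type(self).__name__}: {params}')     # stand-in for the stats report
        return func(self, *args, **kwargs)
    return _wrapper


class DemoExecutor:
    @call_logger
    def __call__(self, text, lang='zh', sample_rate=16000):
        return text.upper()


print(DemoExecutor()('hello', sample_rate=8000))
```
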
import argparse +import os -import numpy as np import paddle import paddle.nn.functional as F +import yaml from paddleaudio.backends import load as load_audio -from paddleaudio.datasets import ESC50 from paddleaudio.features import LogMelSpectrogram -from paddleaudio.features import melspectrogram -from paddlespeech.cls.models import cnn14 +from paddleaudio.utils import logger from paddlespeech.cls.models import SoundClassifier +from paddlespeech.s2t.utils.dynamic_import import dynamic_import # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") -parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") -parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") -parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") +parser.add_argument("--cfg_path", type=str, required=True) args = parser.parse_args() # yapf: enable -def extract_features(file: str, feat_backend: str='numpy', - **kwargs) -> paddle.Tensor: - waveform, sr = load_audio(file, sr=None) - - if args.feat_backend == 'numpy': - feat = melspectrogram(waveform, sr, **kwargs).transpose() - feat = np.expand_dims(feat, 0) - feat = paddle.to_tensor(feat) - else: - feature_extractor = LogMelSpectrogram(sr=sr, **kwargs) - feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) - feat = paddle.transpose(feat, [0, 2, 1]) +def extract_features(file: str, **feat_conf) -> paddle.Tensor: + file = os.path.abspath(os.path.expanduser(file)) + waveform, _ = load_audio(file, sr=feat_conf['sr']) + feature_extractor = LogMelSpectrogram(**feat_conf) + feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) + feat = paddle.transpose(feat, [0, 2, 1]) return feat if __name__ == '__main__': + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) + + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + predicting_conf = config['predicting'] + + ds_class = dynamic_import(data_conf['dataset']) + backbone_class = dynamic_import(model_conf['backbone']) + model = SoundClassifier( - backbone=cnn14(pretrained=False, extract_embedding=True), - num_class=len(ESC50.label_list)) - model.set_state_dict(paddle.load(args.checkpoint)) + backbone=backbone_class(pretrained=False, extract_embedding=True), + num_class=len(ds_class.label_list)) + model.set_state_dict(paddle.load(predicting_conf['checkpoint'])) model.eval() - feat = extract_features(args.wav, args.feat_backend) + feat = extract_features(predicting_conf['audio_file'], **feat_conf) logits = model(feat) probs = F.softmax(logits, axis=1).numpy() sorted_indices = (-probs[0]).argsort() - msg = f'[{args.wav}]\n' - for idx in sorted_indices[:args.top_k]: - msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n' - print(msg) + msg = f"[{predicting_conf['audio_file']}]\n" + for idx in sorted_indices[:predicting_conf['top_k']]: + msg += f'{ds_class.label_list[idx]}: {probs[0][idx]}\n' + logger.info(msg) diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py index 121309789..7e2922148 100644 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -15,24 +15,17 @@ import argparse import os import paddle +import yaml -from paddleaudio.datasets import ESC50 from paddleaudio.features 
import LogMelSpectrogram from paddleaudio.utils import logger from paddleaudio.utils import Timer -from paddlespeech.cls.models import cnn14 from paddlespeech.cls.models import SoundClassifier +from paddlespeech.s2t.utils.dynamic_import import dynamic_import # yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.") -parser.add_argument("--feat_backend", type=str, choices=['numpy', 'paddle'], default='numpy', help="Choose backend to extract features from audio files.") -parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.") -parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.") -parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.") -parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.") -parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.") -parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.") +parser.add_argument("--cfg_path", type=str, required=True) args = parser.parse_args() # yapf: enable @@ -42,50 +35,60 @@ if __name__ == "__main__": paddle.distributed.init_parallel_env() local_rank = paddle.distributed.get_rank() - backbone = cnn14(pretrained=True, extract_embedding=True) - model = SoundClassifier(backbone, num_class=len(ESC50.label_list)) - model = paddle.DataParallel(model) - optimizer = paddle.optimizer.Adam( - learning_rate=args.learning_rate, parameters=model.parameters()) - criterion = paddle.nn.loss.CrossEntropyLoss() + args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path)) + with open(args.cfg_path, 'r') as f: + config = yaml.safe_load(f) - if args.feat_backend == 'numpy': - train_ds = ESC50(mode='train', feat_type='melspectrogram') - dev_ds = ESC50(mode='dev', feat_type='melspectrogram') - else: - train_ds = ESC50(mode='train') - dev_ds = ESC50(mode='dev') - feature_extractor = LogMelSpectrogram(sr=16000) + model_conf = config['model'] + data_conf = config['data'] + feat_conf = config['feature'] + training_conf = config['training'] + # Dataset + ds_class = dynamic_import(data_conf['dataset']) + train_ds = ds_class(**data_conf['train']) + dev_ds = ds_class(**data_conf['dev']) train_sampler = paddle.io.DistributedBatchSampler( - train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False) + train_ds, + batch_size=training_conf['batch_size'], + shuffle=True, + drop_last=False) train_loader = paddle.io.DataLoader( train_ds, batch_sampler=train_sampler, - num_workers=args.num_workers, + num_workers=training_conf['num_workers'], return_list=True, use_buffer_reader=True, ) + # Feature + feature_extractor = LogMelSpectrogram(**feat_conf) + + # Model + backbone_class = dynamic_import(model_conf['backbone']) + backbone = backbone_class(pretrained=True, extract_embedding=True) + model = SoundClassifier(backbone, num_class=data_conf['num_classes']) + model = paddle.DataParallel(model) + optimizer = paddle.optimizer.Adam( + learning_rate=training_conf['learning_rate'], + parameters=model.parameters()) + criterion = paddle.nn.loss.CrossEntropyLoss() + steps_per_epoch = len(train_sampler) - timer = Timer(steps_per_epoch * args.epochs) + timer = Timer(steps_per_epoch * training_conf['epochs']) timer.start() - for epoch in range(1, args.epochs + 1): + for 
epoch in range(1, training_conf['epochs'] + 1): model.train() avg_loss = 0 num_corrects = 0 num_samples = 0 for batch_idx, batch in enumerate(train_loader): - if args.feat_backend == 'numpy': - feats, labels = batch - else: - waveforms, labels = batch - feats = feature_extractor( - waveforms - ) # Need a padding when lengths of waveforms differ in a batch. - feats = paddle.transpose(feats, - [0, 2, 1]) # To [N, length, n_mels] + waveforms, labels = batch + feats = feature_extractor( + waveforms + ) # Need a padding when lengths of waveforms differ in a batch. + feats = paddle.transpose(feats, [0, 2, 1]) # To [N, length, n_mels] logits = model(feats) @@ -107,13 +110,15 @@ if __name__ == "__main__": timer.count() - if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0: + if (batch_idx + 1 + ) % training_conf['log_freq'] == 0 and local_rank == 0: lr = optimizer.get_lr() - avg_loss /= args.log_freq + avg_loss /= training_conf['log_freq'] avg_acc = num_corrects / num_samples print_msg = 'Epoch={}/{}, Step={}/{}'.format( - epoch, args.epochs, batch_idx + 1, steps_per_epoch) + epoch, training_conf['epochs'], batch_idx + 1, + steps_per_epoch) print_msg += ' loss={:.4f}'.format(avg_loss) print_msg += ' acc={:.4f}'.format(avg_acc) print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format( @@ -124,16 +129,17 @@ if __name__ == "__main__": num_corrects = 0 num_samples = 0 - if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: + if epoch % training_conf[ + 'save_freq'] == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0: dev_sampler = paddle.io.BatchSampler( dev_ds, - batch_size=args.batch_size, + batch_size=training_conf['batch_size'], shuffle=False, drop_last=False) dev_loader = paddle.io.DataLoader( dev_ds, batch_sampler=dev_sampler, - num_workers=args.num_workers, + num_workers=training_conf['num_workers'], return_list=True, ) model.eval() @@ -141,12 +147,9 @@ if __name__ == "__main__": num_samples = 0 with logger.processing('Evaluation on validation dataset'): for batch_idx, batch in enumerate(dev_loader): - if args.feat_backend == 'numpy': - feats, labels = batch - else: - waveforms, labels = batch - feats = feature_extractor(waveforms) - feats = paddle.transpose(feats, [0, 2, 1]) + waveforms, labels = batch + feats = feature_extractor(waveforms) + feats = paddle.transpose(feats, [0, 2, 1]) logits = model(feats) @@ -160,7 +163,7 @@ if __name__ == "__main__": logger.eval(print_msg) # Save model - save_dir = os.path.join(args.checkpoint_dir, + save_dir = os.path.join(training_conf['checkpoint_dir'], 'epoch_{}'.format(epoch)) logger.info('Saving model checkpoint to {}'.format(save_dir)) paddle.save(model.state_dict(), diff --git a/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py b/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py index d81fb2e3c..362098fe6 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py +++ b/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py @@ -62,7 +62,7 @@ class Scorer(object): """Evaluation function, gathering all the different scores and return the final one. - :param sentence: The input sentence for evalutation + :param sentence: The input sentence for evaluation :type sentence: str :param log: Whether return the score in log representation. 
:type log: bool diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 3e9939f02..88955eacb 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -85,7 +85,7 @@ def recog_v2(args): mode="asr", load_output=False, sort_in_input_length=False, - preprocess_conf=confs.collator.augmentation_config + preprocess_conf=confs.preprocess_config if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py index ace80bd3e..81d8b0783 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc.py +++ b/paddlespeech/s2t/decoders/scorers/ctc.py @@ -154,7 +154,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): Args: state: The states of hyps - Returns: exteded state + Returns: extended state """ new_state = [] diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index 13429d491..78b8fe36c 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -11,7 +11,7 @@ class CTCPrefixScorePD(): which is based on Algorithm 2 in WATANABE et al. "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION," - but extended to efficiently compute the label probablities for multiple + but extended to efficiently compute the label probabilities for multiple hypotheses simultaneously See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019. @@ -272,7 +272,7 @@ class CTCPrefixScore(): which is based on Algorithm 2 in WATANABE et al. "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION," - but extended to efficiently compute the probablities of multiple labels + but extended to efficiently compute the probabilities of multiple labels simultaneously """ diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py index 7ccb3a6c2..5755a5f10 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py @@ -19,8 +19,8 @@ import paddle from paddle.inference import Config from paddle.inference import create_predictor from paddle.io import DataLoader +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -80,13 +80,13 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -105,14 +105,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, 
- lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with utterrances sampled from Librispeech @@ -176,15 +176,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py index 5c6eee3f6..0d0b4f219 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py @@ -17,8 +17,8 @@ import functools import numpy as np import paddle from paddle.io import DataLoader +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -33,13 +33,13 @@ from paddlespeech.s2t.utils.utility import print_arguments def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True - config.collator.batch_size = 1 - config.collator.num_workers = 0 + config.augmentation_config = "" + config.keep_transcription_text = True + config.batch_size = 1 + config.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) @@ -62,14 +62,14 @@ def start_server(config, args): paddle.to_tensor(audio), paddle.to_tensor(audio_len), vocab_list=test_loader.collate_fn.vocab_list, - decoding_method=config.decoding.decoding_method, - lang_model_path=config.decoding.lang_model_path, - beam_alpha=config.decoding.alpha, - beam_beta=config.decoding.beta, - beam_size=config.decoding.beam_size, - cutoff_prob=config.decoding.cutoff_prob, - cutoff_top_n=config.decoding.cutoff_top_n, - num_processes=config.decoding.num_proc_bsearch) + decoding_method=config.decode.decoding_method, + lang_model_path=config.decode.lang_model_path, + beam_alpha=config.decode.alpha, + beam_beta=config.decode.beta, + beam_size=config.decode.beam_size, + cutoff_prob=config.decode.cutoff_prob, + cutoff_top_n=config.decode.cutoff_top_n, + num_processes=config.decode.num_proc_bsearch) return result_transcript[0] # warming up with 
utterrances sampled from Librispeech @@ -111,15 +111,19 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() print(config) - args.warmup_manifest = config.data.test_manifest + args.warmup_manifest = config.test_manifest print_arguments(args, globals()) if args.dump_config: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py index 66042e843..ee013d79e 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,7 +42,7 @@ if __name__ == "__main__": print_arguments(args) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py index f52615fae..388b380d1 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -41,9 +42,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py index e073ebbf9..707eb9e1b 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
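
The deploy and test entry points above all replace the deleted `get_cfg_defaults()` with a dynamic yacs `CfgNode`, merging decode options from a separate YAML under `config.decode`. A sketch of that loading pattern follows; the file names are placeholders, while the attribute layout mirrors the hunks above.

```python
# Sketch of the new config loading: a fully dynamic CfgNode plus a separate decode YAML.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.merge_from_file('conf/deepspeech2.yaml')            # placeholder model/data config

decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file('conf/tuning/decode.yaml')    # placeholder decode config
config.decode = decode_confs                               # attach as config.decode

config.freeze()
print(config.decode.decoding_method)  # e.g. 'ctc_beam_search', taken from the decode YAML
```
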
"""Evaluation for DeepSpeech2 model.""" -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -46,9 +47,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index cf2ca0d64..a909dd416 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -18,8 +18,8 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.models.ds2 import DeepSpeech2Model @@ -41,7 +41,7 @@ class DeepSpeech2Tester_hub(): self.audio_file = args.audio_file self.collate_fn_test = SpeechCollator.from_config(config) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): result_transcripts = self.model.decode( @@ -74,7 +74,7 @@ class DeepSpeech2Tester_hub(): audio = paddle.unsqueeze(audio, axis=0) vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, vocab_list, cfg.decoding) + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -110,13 +110,13 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.model.input_dim = self.collate_fn_test.feature_size - config.model.output_dim = self.collate_fn_test.vocab_size + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size if self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") @@ -134,8 +134,8 @@ class DeepSpeech2Tester_hub(): self.checkpoint_dir = checkpoint_dir self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) def resume(self): """Resume from the checkpoint at checkpoints in the output @@ -187,9 +187,13 @@ if __name__ == "__main__": print("model_type:{}".format(args.model_type)) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if 
args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py index 400538f9b..09e8662f1 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py @@ -13,8 +13,8 @@ # limitations under the License. """Trainer for DeepSpeech2 model.""" from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.deepspeech2.config import get_cfg_defaults from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults(args.model_type) + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/deepspeech2/config.py b/paddlespeech/s2t/exps/deepspeech2/config.py deleted file mode 100644 index 58dc05ff6..000000000 --- a/paddlespeech/s2t/exps/deepspeech2/config.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from yacs.config import CfgNode - -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester -from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.ds2 import DeepSpeech2Model -from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline - - -def get_cfg_defaults(model_type='offline'): - _C = CfgNode() - _C.data = ManifestDataset.params() - _C.collator = SpeechCollator.params() - _C.training = DeepSpeech2Trainer.params() - _C.decoding = DeepSpeech2Tester.params() - if model_type == 'offline': - _C.model = DeepSpeech2Model.params() - else: - _C.model = DeepSpeech2ModelOnline.params() - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py index a0b69d64f..049311c78 100644 --- a/paddlespeech/s2t/exps/deepspeech2/model.py +++ b/paddlespeech/s2t/exps/deepspeech2/model.py @@ -16,7 +16,6 @@ import os import time from collections import defaultdict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np @@ -24,7 +23,6 @@ import paddle from paddle import distributed as dist from paddle import inference from paddle.io import DataLoader -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator @@ -49,28 +47,12 @@ logger = Log(__name__).getlog() class DeepSpeech2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - lr=5e-4, # learning rate - lr_decay=1.0, # learning rate decay - weight_decay=1e-6, # the coeff of weight decay - global_grad_clip=5.0, # the global norm clip - n_epoch=50, # train epochs - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - batch_size = self.config.collator.batch_size - accum_grad = self.config.training.accum_grad + batch_size = self.config.batch_size + accum_grad = self.config.accum_grad start = time.time() @@ -133,7 +115,7 @@ class DeepSpeech2Trainer(Trainer): total_loss += float(loss) * num_utts valid_losses['val_loss'].append(float(loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -154,16 +136,16 @@ class DeepSpeech2Trainer(Trainer): config = self.config.clone() with UpdateConfig(config): if self.train: - config.model.input_dim = self.train_loader.collate_fn.feature_size - config.model.output_dim = self.train_loader.collate_fn.vocab_size + config.input_dim = self.train_loader.collate_fn.feature_size + config.output_dim = self.train_loader.collate_fn.vocab_size else: - config.model.input_dim = self.test_loader.collate_fn.feature_size - config.model.output_dim = self.test_loader.collate_fn.vocab_size + config.input_dim = self.test_loader.collate_fn.feature_size + config.output_dim = self.test_loader.collate_fn.vocab_size if 
self.args.model_type == 'offline': - model = DeepSpeech2Model.from_config(config.model) + model = DeepSpeech2Model.from_config(config) elif self.args.model_type == 'online': - model = DeepSpeech2ModelOnline.from_config(config.model) + model = DeepSpeech2ModelOnline.from_config(config) else: raise Exception("wrong model type") if self.parallel: @@ -177,17 +159,13 @@ class DeepSpeech2Trainer(Trainer): if not self.train: return - grad_clip = ClipGradByGlobalNormWithLog( - config.training.global_grad_clip) + grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip) lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config.training.lr, - gamma=config.training.lr_decay, - verbose=True) + learning_rate=config.lr, gamma=config.lr_decay, verbose=True) optimizer = paddle.optimizer.Adam( learning_rate=lr_scheduler, parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config.training.weight_decay), + weight_decay=paddle.regularizer.L2Decay(config.weight_decay), grad_clip=grad_clip) self.optimizer = optimizer self.lr_scheduler = lr_scheduler @@ -198,95 +176,75 @@ class DeepSpeech2Trainer(Trainer): config.defrost() if self.train: # train - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) - config.collator.keep_transcription_text = False + config.keep_transcription_text = False collate_fn_train = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) # dev - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = False + config.augmentation_config = "" + config.keep_transcription_text = False collate_fn_dev = SpeechCollator.from_config(config) self.valid_loader = DataLoader( dev_dataset, - batch_size=int(config.collator.batch_size), + batch_size=int(config.batch_size), shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup train/valid Dataloader!") else: # test - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest test_dataset = ManifestDataset.from_config(config) - config.collator.augmentation_config = "" - config.collator.keep_transcription_text = True + config.augmentation_config = "" + config.keep_transcription_text = True collate_fn_test = SpeechCollator.from_config(config) - + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) self.test_loader = DataLoader( test_dataset, - 
batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_test, - num_workers=config.collator.num_workers) + num_workers=config.num_workers) logger.info("Setup test Dataloader!") class DeepSpeech2Tester(DeepSpeech2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # testing config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='ctc_beam_search', # Decoding method. Options: ctc_beam_search, ctc_greedy - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=500, # Beam search width. - batch_size=128, # decoding batch size - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self._text_featurizer = TextFeaturizer( - unit_type=config.collator.unit_type, vocab=None) + unit_type=config.unit_type, vocab=None) def ordid2token(self, texts, texts_len): """ ord() id to chr() chr """ @@ -304,17 +262,17 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer vocab_list = self.test_loader.collate_fn.vocab_list target_transcripts = self.ordid2token(texts, texts_len) - result_transcripts = self.compute_result_transcripts(audio, audio_len, - vocab_list, cfg) + result_transcripts = self.compute_result_transcripts( + audio, audio_len, vocab_list, decode_cfg) for utt, target, result in zip(utts, target_transcripts, result_transcripts): @@ -327,29 +285,31 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("Current error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "Current error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type) + error_rate_type=decode_cfg.error_rate_type) - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): result_transcripts = self.model.decode( audio, audio_len, vocab_list, - decoding_method=cfg.decoding_method, - lang_model_path=cfg.lang_model_path, - beam_alpha=cfg.alpha, - beam_beta=cfg.beta, - beam_size=cfg.beam_size, - cutoff_prob=cfg.cutoff_prob, - cutoff_top_n=cfg.cutoff_top_n, - num_processes=cfg.num_proc_bsearch) + decoding_method=decode_cfg.decoding_method, + lang_model_path=decode_cfg.lang_model_path, + beam_alpha=decode_cfg.alpha, + beam_beta=decode_cfg.beta, + 
beam_size=decode_cfg.beam_size, + cutoff_prob=decode_cfg.cutoff_prob, + cutoff_top_n=decode_cfg.cutoff_top_n, + num_processes=decode_cfg.num_proc_bsearch) return result_transcripts @@ -358,7 +318,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def test(self): logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -412,11 +371,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: from paddlespeech.s2t.utils.log import Autolog self.autolog = Autolog( - batch_size=self.config.decoding.batch_size, + batch_size=self.config.decode.decode_batch_size, model_name="deepspeech2", model_precision="fp32").getlog() self.model.eval() - cfg = self.config error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 with jsonlines.open(self.args.result_file, 'w') as fout: @@ -441,7 +399,8 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): if self.args.enable_auto_log is True: self.autolog.report() - def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): + def compute_result_transcripts(self, audio, audio_len, vocab_list, + decode_cfg): if self.args.model_type == "online": output_probs, output_lens = self.static_forward_online(audio, audio_len) @@ -454,13 +413,15 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): self.predictor.clear_intermediate_tensor() self.predictor.try_shrink_memory() - self.model.decoder.init_decode(cfg.alpha, cfg.beta, cfg.lang_model_path, - vocab_list, cfg.decoding_method) + self.model.decoder.init_decode(decode_cfg.alpha, decode_cfg.beta, + decode_cfg.lang_model_path, vocab_list, + decode_cfg.decoding_method) result_transcripts = self.model.decoder.decode_probs( - output_probs, output_lens, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) + output_probs, output_lens, vocab_list, decode_cfg.decoding_method, + decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + decode_cfg.beam_size, decode_cfg.cutoff_prob, + decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) #replace the with ' ' result_transcripts = [ self._text_featurizer.detokenize(sentence) @@ -531,12 +492,10 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester): num_chunk = int(num_chunk) chunk_state_h_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) chunk_state_c_box = np.zeros( - (self.config.model.num_rnn_layers, 1, - self.config.model.rnn_layer_size), + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), dtype=x.dtype) input_names = self.predictor.get_input_names() diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py index df95baeb9..e3390feb1 100644 --- a/paddlespeech/s2t/exps/u2/bin/alignment.py +++ b/paddlespeech/s2t/exps/u2/bin/alignment.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
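
With `config.decoding` renamed to `config.decode` and the old `data`/`collator`/`model`/`training` sub-nodes flattened, the DeepSpeech2 tester hunks above resolve decode options straight from the top-level config. A small sketch of the two lookups used there; the config values are invented for illustration.

```python
# Sketch of the flattened decode lookups used above; values are illustrative only.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)
config.decode = CfgNode(dict(error_rate_type='cer', decode_batch_size=8))

# decode batch size with a safe default, as in the refactored setup_dataloader()
decode_batch_size = config.get('decode', dict()).get('decode_batch_size', 1)

# choose CER vs WER helpers once from decode.error_rate_type, as in compute_metrics()
decode_cfg = config.decode
use_cer = decode_cfg.error_rate_type == 'cer'
print(decode_batch_size, 'cer' if use_cer else 'wer')
```
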
"""Alignment for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,16 +32,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py index 44fc7c3e5..592b12379 100644 --- a/paddlespeech/s2t/exps/u2/bin/export.py +++ b/paddlespeech/s2t/exps/u2/bin/export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +32,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py index 48b0670d5..f14d804f1 100644 --- a/paddlespeech/s2t/exps/u2/bin/test.py +++ b/paddlespeech/s2t/exps/u2/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2.model import U2Tester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py index 556316ec0..9904813a5 100644 --- 
a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -18,8 +18,8 @@ from pathlib import Path import paddle import soundfile +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import get_cfg_defaults from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.cli import default_argument_parser @@ -36,23 +36,22 @@ class U2Infer(): self.args = args self.config = config self.audio_file = args.audio_file - self.sr = config.collator.target_sample_rate - self.preprocess_conf = config.collator.augmentation_config + self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} self.preprocessing = Transformation(self.preprocess_conf) self.text_feature = TextFeaturizer( - unit_type=config.collator.unit_type, - vocab=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix) + unit_type=config.unit_type, + vocab=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix) paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): - model_conf.input_dim = config.collator.feat_dim + model_conf.input_dim = config.feat_dim model_conf.output_dim = self.text_feature.vocab_size model = U2Model.from_config(model_conf) self.model = model @@ -70,10 +69,6 @@ class U2Infer(): # read audio, sample_rate = soundfile.read( self.audio_file, dtype="int16", always_2d=True) - if sample_rate != self.sr: - logger.error( - f"sample rate error: {sample_rate}, need {self.sr} ") - sys.exit(-1) audio = audio[:, 0] logger.info(f"audio shape: {audio.shape}") @@ -85,17 +80,17 @@ class U2Infer(): ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - cfg = self.config.decoding + decode_config = self.config.decode result_transcripts = self.model.decode( xs, ilen, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) rsl = result_transcripts[0][0] utt = Path(self.audio_file).name logger.info(f"hyp: {utt} {result_transcripts[0][0]}") @@ -133,9 +128,13 @@ if __name__ == "__main__": "--audio_file", type=str, help="path of the input audio file") args = parser.parse_args() - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py index d6ee8b307..53c223283 100644 --- a/paddlespeech/s2t/exps/u2/bin/train.py +++ b/paddlespeech/s2t/exps/u2/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2.config import 
get_cfg_defaults from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -44,7 +44,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2/config.py b/paddlespeech/s2t/exps/u2/config.py deleted file mode 100644 index 898b0bb25..000000000 --- a/paddlespeech/s2t/exps/u2/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2.model import U2Tester -from paddlespeech.s2t.exps.u2.model import U2Trainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2 import U2Model - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2Model.params() - -_C.training = U2Trainer.params() - -_C.decoding = U2Tester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 404058edc..992be5cd4 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -18,13 +18,11 @@ import time from collections import defaultdict from collections import OrderedDict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np import paddle from paddle import distributed as dist -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.dataloader import BatchDataLoader @@ -46,38 +44,11 @@ logger = Log(__name__).getlog() class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + 
train_conf = self.config start = time.time() # forward @@ -120,7 +91,7 @@ class U2Trainer(Trainer): for k, v in losses_np.items(): report(k, v) - report("batch_size", self.config.collator.batch_size) + report("batch_size", self.config.batch_size) report("accum", train_conf.accum_grad) report("step_cost", iteration_time) @@ -153,7 +124,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -182,7 +153,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -214,8 +185,7 @@ class U2Trainer(Trainer): k.split(',')) == 2 else "" msg += "," msg = msg[:-1] # remove the last "," - if (batch_index + 1 - ) % self.config.training.log_interval == 0: + if (batch_index + 1) % self.config.log_interval == 0: logger.info(msg) data_start_time = time.time() except Exception as e: @@ -252,30 +222,31 @@ class U2Trainer(Trainer): if self.train: # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, - sortagrad=False, - batch_size=config.collator.batch_size, - maxlen_in=float('inf'), - maxlen_out=float('inf'), - minibatches=0, + sortagrad=config.sortagrad, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=config.minibatches, mini_batch_size=self.args.ngpu, - batch_count='auto', - batch_bins=0, - batch_frames_in=0, - batch_frames_out=0, - batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + batch_count=config.batch_count, + batch_bins=config.batch_bins, + batch_frames_in=config.batch_frames_in, + batch_frames_out=config.batch_frames_out, + batch_frames_inout=config.batch_frames_inout, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, - num_encs=1) + num_encs=1, + dist_sampler=False, + shortest_first=False) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -285,19 +256,22 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. 
- augmentation_config, # aug will be off when train_mode=False - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, - num_encs=1) + num_encs=1, + dist_sampler=False, + shortest_first=False) logger.info("Setup train/valid Dataloader!") else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -307,17 +281,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -327,8 +300,7 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator. - augmentation_config, # aug will be off when train_mode=False + preprocess_conf=config.preprocess_config, n_iter_processes=1, subsampling_factor=1, num_encs=1) @@ -336,7 +308,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): if self.train: @@ -359,7 +331,7 @@ class U2Trainer(Trainer): if not self.train: return - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -379,7 +351,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -404,41 +376,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. 
- simulate_streaming=False, # simulate streaming inference. Defaults to False. - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -457,10 +400,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_config = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_config.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -468,12 +411,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_config.decoding_method, + beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config.num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming) decode_time = time.time() - start_time for utt, target, result, rec_tids in zip( @@ -492,15 +435,15 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info("One example error rate [%s] = %f" % ( + decode_config.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_config.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -511,7 +454,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -562,15 +505,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -581,10 +524,10 @@ class U2Tester(U2Trainer): 
List[paddle.static.InputSpec]: input spec. """ from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), + infer_model = U2InferModel.from_pretrained(self.train_loader, + self.config.clone(), self.args.checkpoint_path) - feat_dim = self.test_loader.feat_dim + feat_dim = self.train_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py index 22a0a3c58..ab87c30d6 100644 --- a/paddlespeech/s2t/exps/u2/trainer.py +++ b/paddlespeech/s2t/exps/u2/trainer.py @@ -44,77 +44,75 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() config.defrost() - config.collator.keep_transcription_text = False + config.keep_transcription_text = False # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest + config.manifest = config.train_manifest train_dataset = ManifestDataset.from_config(config) - config.data.manifest = config.data.dev_manifest + config.manifest = config.dev_manifest dev_dataset = ManifestDataset.from_config(config) collate_fn_train = SpeechCollator.from_config(config) - config.collator.augmentation_config = "" collate_fn_dev = SpeechCollator.from_config(config) if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + sortagrad=config.sortagrad, + shuffle_method=config.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, shuffle=False, drop_last=False, collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) + num_workers=config.num_workers, ) # test dataset, return raw text - config.data.manifest = config.data.test_manifest + config.manifest = config.test_manifest # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
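# A short sketch of the fallback the refactored setup_dataloader() above relies on when no decode
# config has been merged in: a yacs CfgNode behaves like a dict, so chained .get() calls are safe.
# The empty config below is only an illustration.
from yacs.config import CfgNode

config = CfgNode(new_allowed=True)  # e.g. loaded from the flat YAML in practice
decode_batch_size = config.get('decode', dict()).get('decode_batch_size', 1)  # -> 1 here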
- config.data.min_input_len = 0.0 # second - config.data.max_input_len = float('inf') # second - config.data.min_output_len = 0.0 # tokens - config.data.max_output_len = float('inf') # tokens - config.data.min_output_input_ratio = 0.00 - config.data.max_output_input_ratio = float('inf') + config.min_input_len = 0.0 # second + config.max_input_len = float('inf') # second + config.min_output_len = 0.0 # tokens + config.max_output_len = float('inf') # tokens + config.min_output_input_ratio = 0.00 + config.max_output_input_ratio = float('inf') test_dataset = ManifestDataset.from_config(config) # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" + config.keep_transcription_text = True self.test_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) # return text token id - config.collator.keep_transcription_text = False + config.keep_transcription_text = False self.align_loader = DataLoader( test_dataset, - batch_size=config.decoding.batch_size, + batch_size=config.decode.batch_size, shuffle=False, drop_last=False, collate_fn=SpeechCollator.from_config(config)) @@ -122,7 +120,7 @@ class U2Trainer(Trainer): def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.collate_fn.feature_size model_conf.output_dim = self.train_loader.collate_fn.vocab_size @@ -136,7 +134,7 @@ class U2Trainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -156,7 +154,7 @@ class U2Trainer(Trainer): config, parameters, lr_scheduler=None, ): - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler @@ -182,7 +180,7 @@ class U2Trainer(Trainer): def setup_updater(self): output_dir = self.output_dir - config = self.config.training + config = self.config updater = U2Updater( model=self.model, diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py index 67bed3497..422483b97 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py +++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py @@ -69,6 +69,10 @@ if __name__ == "__main__": config = CfgNode() config.set_new_allowed(True) config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 9b8274ad6..bc995977a 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -17,13 +17,11 @@ import os import time from collections import defaultdict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np import paddle from paddle import distributed as dist -from yacs.config import CfgNode from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.frontend.utility import load_dict @@ -43,44 +41,12 @@ from paddlespeech.s2t.utils.utility import UpdateConfig 
logger = Log(__name__).getlog() -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - _C = CfgNode() - - _C.model = U2Model.params() - - _C.training = U2Trainer.params() - - _C.decoding = U2Tester.params() - - config = _C.clone() - config.set_new_allowed(True) - return config - - class U2Trainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - accum_grad=1, # accum grad by # steps - checkpoint=dict( - kbest_n=50, - latest_n=5, ), )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward @@ -122,7 +88,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -157,7 +123,7 @@ class U2Trainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_loss'] = total_loss / num_seen_utts @@ -186,7 +152,7 @@ class U2Trainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: @@ -235,10 +201,10 @@ class U2Trainer(Trainer): config = self.config.clone() # train/valid dataset, return token ids self.train_loader = BatchDataLoader( - json_file=config.data.train_manifest, + json_file=config.train_manifest, train_mode=True, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -248,16 +214,16 @@ class U2Trainer(Trainer): batch_frames_in=0, batch_frames_out=0, batch_frames_inout=0, - preprocess_conf=config.collator.augmentation_config, - n_iter_processes=config.collator.num_workers, + preprocess_conf=config.preprocess_config, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) self.valid_loader = BatchDataLoader( - json_file=config.data.dev_manifest, + json_file=config.dev_manifest, train_mode=False, sortagrad=False, - batch_size=config.collator.batch_size, + batch_size=config.batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -268,16 +234,18 @@ class U2Trainer(Trainer): batch_frames_out=0, batch_frames_inout=0, preprocess_conf=None, - n_iter_processes=config.collator.num_workers, + n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1) + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) # test dataset, return raw text self.test_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, 
sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -293,10 +261,10 @@ class U2Trainer(Trainer): num_encs=1) self.align_loader = BatchDataLoader( - json_file=config.data.test_manifest, + json_file=config.test_manifest, train_mode=False, sortagrad=False, - batch_size=config.decoding.batch_size, + batch_size=decode_batch_size, maxlen_in=float('inf'), maxlen_out=float('inf'), minibatches=0, @@ -316,7 +284,7 @@ class U2Trainer(Trainer): config = self.config # model - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): model_conf.input_dim = self.train_loader.feat_dim model_conf.output_dim = self.train_loader.vocab_size @@ -360,41 +328,12 @@ class U2Trainer(Trainer): class U2Tester(U2Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='wer', # Error rate type for evaluation. Options `wer`, 'cer' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False. 
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -413,10 +352,10 @@ class U2Tester(U2Trainer): texts, texts_len, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode errors_sum, len_refs, num_ins = 0.0, 0, 0 - errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors - error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer start_time = time.time() target_transcripts = self.id2token(texts, texts_len, self.text_feature) @@ -424,12 +363,12 @@ class U2Tester(U2Trainer): audio, audio_len, text_feature=self.text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - ctc_weight=cfg.ctc_weight, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + ctc_weight=decode_cfg.ctc_weight, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) decode_time = time.time() - start_time for i, (utt, target, result, rec_tids) in enumerate( @@ -449,15 +388,16 @@ class U2Tester(U2Trainer): logger.info(f"Utt: {utt}") logger.info(f"Ref: {target}") logger.info(f"Hyp: {result}") - logger.info("One example error rate [%s] = %f" % - (cfg.error_rate_type, error_rate_func(target, result))) + logger.info( + "One example error rate [%s] = %f" % + (decode_cfg.error_rate_type, error_rate_func(target, result))) return dict( errors_sum=errors_sum, len_refs=len_refs, num_ins=num_ins, # num examples error_rate=errors_sum / len_refs, - error_rate_type=cfg.error_rate_type, + error_rate_type=decode_cfg.error_rate_type, num_frames=audio_len.sum().numpy().item(), decode_time=decode_time) @@ -468,7 +408,7 @@ class U2Tester(U2Trainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - stride_ms = self.config.collator.stride_ms + stride_ms = self.config.stride_ms error_rate_type = None errors_sum, len_refs, num_ins = 0.0, 0, 0 num_frames = 0.0 @@ -519,15 +459,15 @@ class U2Tester(U2Trainer): "ref_len": len_refs, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') @paddle.no_grad() def align(self): ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, + self.config.decode.decode_batch_size, + self.config.stride_ms, self.vocab_list, self.args.result_file) def load_inferspec(self): @@ -539,7 +479,7 @@ class U2Tester(U2Trainer): """ from paddlespeech.s2t.models.u2 import U2InferModel infer_model = U2InferModel.from_pretrained(self.test_loader, - 
self.config.model.clone(), + self.config.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.feat_dim input_spec = [ diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py index 69d9718f8..c641152fe 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/export.py +++ b/paddlespeech/s2t/exps/u2_st/bin/export.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Export for U2 model.""" -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -31,14 +32,14 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save jit model to + # save jit model to parser.add_argument( "--export_path", type=str, help="path of the jit model to save") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py index 93c2fee0a..1d70a3103 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/test.py +++ b/paddlespeech/s2t/exps/u2_st/bin/test.py @@ -14,12 +14,13 @@ """Evaluation for U2 model.""" import cProfile -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults +from yacs.config import CfgNode + from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments -# TODO(hui zhang): dynamic load +# TODO(hui zhang): dynamic load def main_sp(config, args): @@ -35,16 +36,20 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() - # save asr result to + # save asr result to parser.add_argument( "--result_file", type=str, help="path of save the asr result") args = parser.parse_args() print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) + if args.decode_cfg: + decode_conf = CfgNode(new_allowed=True) + decode_conf.merge_from_file(args.decode_cfg) + config.decode = decode_conf if args.opts: config.merge_from_list(args.opts) config.freeze() diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py index 58496c887..4dec9ec8a 100644 --- a/paddlespeech/s2t/exps/u2_st/bin/train.py +++ b/paddlespeech/s2t/exps/u2_st/bin/train.py @@ -16,8 +16,8 @@ import cProfile import os from paddle import distributed as dist +from yacs.config import CfgNode -from paddlespeech.s2t.exps.u2_st.config import get_cfg_defaults from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils.utility import print_arguments @@ -42,7 +42,7 @@ if __name__ == "__main__": print_arguments(args, globals()) # https://yaml.org/type/float.html - config = get_cfg_defaults() + config = CfgNode(new_allowed=True) if args.config: config.merge_from_file(args.config) if args.opts: diff --git a/paddlespeech/s2t/exps/u2_st/config.py b/paddlespeech/s2t/exps/u2_st/config.py deleted file mode 100644 
index a48f9106a..000000000 --- a/paddlespeech/s2t/exps/u2_st/config.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from yacs.config import CfgNode - -from paddlespeech.s2t.exps.u2_st.model import U2STTester -from paddlespeech.s2t.exps.u2_st.model import U2STTrainer -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.models.u2_st import U2STModel - -_C = CfgNode() - -_C.data = ManifestDataset.params() - -_C.collator = SpeechCollator.params() - -_C.model = U2STModel.params() - -_C.training = U2STTrainer.params() - -_C.decoding = U2STTester.params() - - -def get_cfg_defaults(): - """Get a yacs CfgNode object with default values for my_project.""" - # Return a clone so that the defaults will not be altered - # This is for the "local variable" use pattern - config = _C.clone() - config.set_new_allowed(True) - return config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index a3b39df7c..b03ca38b6 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -16,28 +16,24 @@ import json import os import time from collections import defaultdict +from collections import OrderedDict from contextlib import nullcontext -from typing import Optional import jsonlines import numpy as np import paddle from paddle import distributed as dist -from paddle.io import DataLoader -from yacs.config import CfgNode - -from paddlespeech.s2t.io.collator import SpeechCollator -from paddlespeech.s2t.io.collator import TripletSpeechCollator -from paddlespeech.s2t.io.dataset import ManifestDataset -from paddlespeech.s2t.io.sampler import SortagradBatchSampler -from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler + +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.models.u2_st import U2STModel -from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog -from paddlespeech.s2t.training.scheduler import WarmupLR +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory from paddlespeech.s2t.training.timer import Timer from paddlespeech.s2t.training.trainer import Trainer from paddlespeech.s2t.utils import bleu_score -from paddlespeech.s2t.utils import ctc_utils from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils import mp_tools from paddlespeech.s2t.utils.log import Log @@ -47,38 +43,11 @@ logger = Log(__name__).getlog() class U2STTrainer(Trainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # training config - default = CfgNode( - dict( - n_epoch=50, # train epochs - log_interval=100, # steps - 
accum_grad=1, # accum grad by # steps - global_grad_clip=5.0, # the global norm clip - )) - default.optim = 'adam' - default.optim_conf = CfgNode( - dict( - lr=5e-4, # learning rate - weight_decay=1e-6, # the coeff of weight decay - )) - default.scheduler = 'warmuplr' - default.scheduler_conf = CfgNode( - dict( - warmup_steps=25000, - lr_decay=1.0, # learning rate decay - )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) def train_batch(self, batch_index, batch_data, msg): - train_conf = self.config.training + train_conf = self.config start = time.time() # forward utt, audio, audio_len, text, text_len = batch_data @@ -96,6 +65,8 @@ class U2STTrainer(Trainer): # loss div by `batch_size * accum_grad` loss /= train_conf.accum_grad losses_np = {'loss': float(loss) * train_conf.accum_grad} + if st_loss: + losses_np['st_loss'] = float(st_loss) if attention_loss: losses_np['att_loss'] = float(attention_loss) if ctc_loss: @@ -125,9 +96,15 @@ class U2STTrainer(Trainer): iteration_time = time.time() - start + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) + if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.collator.batch_size) + msg += "batch size: {}, ".format(self.config.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -168,7 +145,7 @@ class U2STTrainer(Trainer): if ctc_loss: valid_losses['val_ctc_loss'].append(float(ctc_loss)) - if (i + 1) % self.config.training.log_interval == 0: + if (i + 1) % self.config.log_interval == 0: valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} valid_dump['val_history_st_loss'] = total_loss / num_seen_utts @@ -197,23 +174,40 @@ class U2STTrainer(Trainer): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: data_start_time = time.time() for batch_index, batch in enumerate(self.train_loader): dataload_time = time.time() - data_start_time - msg = "Train: Rank: {}, ".format(dist.get_rank()) - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, - len(self.train_loader)) - msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) - msg += "data time: {:>.3f}s, ".format(dataload_time) - self.train_batch(batch_index, batch, msg) - self.after_train_batch() - data_start_time = time.time() + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + report('total', len(self.train_loader)) + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips,sent./sec'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k.split(',')[0]}: " + msg += 
f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else "" + msg += "," + msg = msg[:-1] # remove the last "," + if (batch_index + 1) % self.config.log_interval == 0: + logger.info(msg) except Exception as e: logger.error(e) raise e @@ -244,95 +238,92 @@ class U2STTrainer(Trainer): def setup_dataloader(self): config = self.config.clone() - config.defrost() - config.collator.keep_transcription_text = False - - # train/valid dataset, return token ids - config.data.manifest = config.data.train_manifest - train_dataset = ManifestDataset.from_config(config) - - config.data.manifest = config.data.dev_manifest - dev_dataset = ManifestDataset.from_config(config) - if config.model.model_conf.asr_weight > 0.: - Collator = TripletSpeechCollator - TestCollator = SpeechCollator - else: - TestCollator = Collator = SpeechCollator - - collate_fn_train = Collator.from_config(config) - config.collator.augmentation_config = "" - collate_fn_dev = Collator.from_config(config) - - if self.parallel: - batch_sampler = SortagradDistributedBatchSampler( - train_dataset, - batch_size=config.collator.batch_size, - num_replicas=None, - rank=None, - shuffle=True, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) + load_transcript = True if config.model_conf.asr_weight > 0 else False + + if self.train: + # train/valid dataset, return token ids + self.train_loader = BatchDataLoader( + json_file=config.train_manifest, + train_mode=True, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=config.maxlen_in, + maxlen_out=config.maxlen_out, + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. + preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + load_aux_output=load_transcript, + num_encs=1, + dist_sampler=True) + + self.valid_loader = BatchDataLoader( + json_file=config.dev_manifest, + train_mode=False, + sortagrad=False, + batch_size=config.batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. + preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + load_aux_output=load_transcript, + num_encs=1, + dist_sampler=True) + logger.info("Setup train/valid Dataloader!") else: - batch_sampler = SortagradBatchSampler( - train_dataset, - shuffle=True, - batch_size=config.collator.batch_size, - drop_last=True, - sortagrad=config.collator.sortagrad, - shuffle_method=config.collator.shuffle_method) - self.train_loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - collate_fn=collate_fn_train, - num_workers=config.collator.num_workers, ) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config.collator.batch_size, - shuffle=False, - drop_last=False, - collate_fn=collate_fn_dev, - num_workers=config.collator.num_workers, ) - - # test dataset, return raw text - config.data.manifest = config.data.test_manifest - # filter test examples, will cause less examples, but no mismatch with training - # and can use large batch size , save training time, so filter test egs now. 
- # config.data.min_input_len = 0.0 # second - # config.data.max_input_len = float('inf') # second - # config.data.min_output_len = 0.0 # tokens - # config.data.max_output_len = float('inf') # tokens - # config.data.min_output_input_ratio = 0.00 - # config.data.max_output_input_ratio = float('inf') - test_dataset = ManifestDataset.from_config(config) - # return text ord id - config.collator.keep_transcription_text = True - config.collator.augmentation_config = "" - self.test_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) - # return text token id - config.collator.keep_transcription_text = False - self.align_loader = DataLoader( - test_dataset, - batch_size=config.decoding.batch_size, - shuffle=False, - drop_last=False, - collate_fn=TestCollator.from_config(config), - num_workers=config.collator.num_workers, ) - logger.info("Setup train/valid/test/align Dataloader!") + # test dataset, return raw text + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + self.test_loader = BatchDataLoader( + json_file=config.test_manifest, + train_mode=False, + sortagrad=False, + batch_size=decode_batch_size, + maxlen_in=float('inf'), + maxlen_out=float('inf'), + minibatches=0, + mini_batch_size=1, + batch_count='auto', + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + preprocess_conf=config. + preprocess_config, # aug will be off when train_mode=False + n_iter_processes=config.num_workers, + subsampling_factor=1, + num_encs=1, + dist_sampler=False) + + logger.info("Setup test Dataloader!") def setup_model(self): config = self.config - model_conf = config.model + model_conf = config with UpdateConfig(model_conf): - model_conf.input_dim = self.train_loader.collate_fn.feature_size - model_conf.output_dim = self.train_loader.collate_fn.vocab_size + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size model = U2STModel.from_config(model_conf) @@ -342,41 +333,44 @@ class U2STTrainer(Trainer): logger.info(f"{model}") layer_tools.print_params(model, logger.info) - train_config = config.training + train_config = config optim_type = train_config.optim optim_conf = train_config.optim_conf scheduler_type = train_config.scheduler scheduler_conf = train_config.scheduler_conf - if scheduler_type == 'expdecaylr': - lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=optim_conf.lr, - gamma=scheduler_conf.lr_decay, - verbose=False) - elif scheduler_type == 'warmuplr': - lr_scheduler = WarmupLR( - learning_rate=optim_conf.lr, - warmup_steps=scheduler_conf.warmup_steps, - verbose=False) - elif scheduler_type == 'noam': - lr_scheduler = paddle.optimizer.lr.NoamDecay( - learning_rate=optim_conf.lr, - d_model=model_conf.encoder_conf.output_size, - warmup_steps=scheduler_conf.warmup_steps, - verbose=False) - else: - raise ValueError(f"Not support scheduler: {scheduler_type}") - - grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip) - weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay) - if optim_type == 'adam': - optimizer = paddle.optimizer.Adam( - learning_rate=lr_scheduler, - parameters=model.parameters(), - weight_decay=weight_decay, - grad_clip=grad_clip) - else: - raise 
ValueError(f"Not support optim: {optim_type}") + scheduler_args = { + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.encoder_conf.output_size, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config + optim_type = train_config.optim + optim_conf = train_config.optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "weight_decay": optim_conf.weight_decay, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "parameters": parameters, + "epsilon": 1e-9 if optim_type == 'noam' else None, + "beta1": 0.9 if optim_type == 'noam' else None, + "beta2": 0.98 if optim_type == 'noam' else None, + } + + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) self.model = model self.optimizer = optimizer @@ -385,63 +379,38 @@ class U2STTester(U2STTrainer): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # decoding config - default = CfgNode( - dict( - alpha=2.5, # Coef of LM for beam search. - beta=0.3, # Coef of WC for beam search. - cutoff_prob=1.0, # Cutoff probability for pruning. - cutoff_top_n=40, # Cutoff number for pruning. - lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm', # Filepath for language model. - decoding_method='attention', # Decoding method. Options: 'attention', 'ctc_greedy_search', - # 'ctc_prefix_beam_search', 'attention_rescoring' - error_rate_type='bleu', # Error rate type for evaluation. Options `bleu`, 'char_bleu' - num_proc_bsearch=8, # # of CPUs for beam search. - beam_size=10, # Beam search width. - batch_size=16, # decoding batch size - ctc_weight=0.0, # ctc weight for attention rescoring decode mode. - decoding_chunk_size=-1, # decoding chunk size. Defaults to -1. - # <0: for decoding, use full chunk. - # >0: for decoding, use fixed chunk size as set. - # 0: used for training, it's prohibited here. - num_decoding_left_chunks=-1, # number of left chunks for decoding. Defaults to -1. - simulate_streaming=False, # simulate streaming inference. Defaults to False.
- )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, config, args): super().__init__(config, args) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.vocab_list = self.text_feature.vocab_list - def ordid2token(self, texts, texts_len): + def id2token(self, texts, texts_len, text_feature): """ ord() id to chr() chr """ trans = [] for text, n in zip(texts, texts_len): n = n.numpy().item() ids = text[:n] - trans.append(''.join([chr(i) for i in ids])) + trans.append(text_feature.defeaturize(ids.numpy().tolist())) return trans def translate(self, audio, audio_len): """"E2E translation from extracted audio feature""" - cfg = self.config.decoding - text_feature = self.test_loader.collate_fn.text_feature + decode_cfg = self.config.decode self.model.eval() hyps = self.model.decode( audio, audio_len, - text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) return hyps def compute_translation_metrics(self, @@ -452,27 +421,24 @@ class U2STTester(U2STTrainer): texts_len, bleu_func, fout=None): - cfg = self.config.decoding + decode_cfg = self.config.decode len_refs, num_ins = 0, 0 start_time = time.time() - text_feature = self.test_loader.collate_fn.text_feature - refs = [ - "".join(chr(t) for t in text[:text_len]) - for text, text_len in zip(texts, texts_len) - ] + refs = self.id2token(texts, texts_len, self.text_feature) hyps = self.model.decode( audio, audio_len, - text_feature=text_feature, - decoding_method=cfg.decoding_method, - beam_size=cfg.beam_size, - word_reward=cfg.word_reward, - decoding_chunk_size=cfg.decoding_chunk_size, - num_decoding_left_chunks=cfg.num_decoding_left_chunks, - simulate_streaming=cfg.simulate_streaming) + text_feature=self.text_feature, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size, + word_reward=decode_cfg.word_reward, + decoding_chunk_size=decode_cfg.decoding_chunk_size, + num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, + simulate_streaming=decode_cfg.simulate_streaming) + decode_time = time.time() - start_time for utt, target, result in zip(utts, refs, hyps): @@ -502,10 +468,10 @@ class U2STTester(U2STTrainer): self.model.eval() logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") - cfg = self.config.decoding - bleu_func = bleu_score.char_bleu if cfg.error_rate_type == 'char-bleu' else bleu_score.bleu + decode_cfg = self.config.decode + bleu_func = bleu_score.char_bleu if decode_cfg.error_rate_type == 'char-bleu' else bleu_score.bleu - stride_ms = self.test_loader.collate_fn.stride_ms + stride_ms = self.config.stride_ms hyps, refs = [], [] len_refs, num_ins = 0, 0 num_frames = 0.0 @@ -522,7 +488,8 @@ class U2STTester(U2STTrainer): len_refs += metrics['len_refs'] num_ins += metrics['num_ins'] rtf = num_time / (num_frames * stride_ms) - logger.info("RTF: %f, BELU (%d) = %f" % (rtf, num_ins, bleu)) + 
logger.info("RTF: %f, instance (%d), batch BELU = %f" % + (rtf, num_ins, bleu)) rtf = num_time / (num_frames * stride_ms) msg = "Test: " @@ -549,17 +516,10 @@ class U2STTester(U2STTrainer): "num_examples": num_ins, "decode_method": - self.config.decoding.decoding_method, + self.config.decode.decoding_method, }) f.write(data + '\n') - @paddle.no_grad() - def align(self): - ctc_utils.ctc_align(self.config, self.model, self.align_loader, - self.config.decoding.batch_size, - self.config.collator.stride_ms, self.vocab_list, - self.args.result_file) - def load_inferspec(self): """infer model and input spec. @@ -567,11 +527,11 @@ class U2STTester(U2STTrainer): nn.Layer: inference model List[paddle.static.InputSpec]: input spec. """ - from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader, - self.config.model.clone(), - self.args.checkpoint_path) - feat_dim = self.test_loader.collate_fn.feature_size + from paddlespeech.s2t.models.u2_st import U2STInferModel + infer_model = U2STInferModel.from_pretrained(self.test_loader, + self.config.clone(), + self.args.checkpoint_path) + feat_dim = self.test_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index 017851e63..b596b2ab0 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -117,7 +117,8 @@ class FeatureNormalizer(object): self._compute_mean_std(manifest_path, featurize_func, num_samples, num_workers) else: - self._read_mean_std_from_file(mean_std_filepath) + mean_std = mean_std_filepath + self._read_mean_std_from_file(mean_std) def apply(self, features): """Normalize features to be of zero mean and unit stddev. @@ -131,10 +132,14 @@ class FeatureNormalizer(object): """ return (features - self._mean) * self._istd - def _read_mean_std_from_file(self, filepath, eps=1e-20): + def _read_mean_std_from_file(self, mean_std, eps=1e-20): """Load mean and std from file.""" - filetype = filepath.split(".")[-1] - mean, istd = load_cmvn(filepath, filetype=filetype) + if isinstance(mean_std, list): + mean = mean_std[0]['cmvn_stats']['mean'] + istd = mean_std[0]['cmvn_stats']['istd'] + else: + filetype = mean_std.split(".")[-1] + mean, istd = load_cmvn(mean_std, filetype=filetype) self._mean = np.expand_dims(mean, axis=0) self._istd = np.expand_dims(istd, axis=0) diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index 5f2335496..b99fc80c0 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import io -from typing import Optional import numpy as np -from yacs.config import CfgNode from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline from paddlespeech.s2t.frontend.featurizer.speech_featurizer import SpeechFeaturizer @@ -219,33 +217,6 @@ class SpeechCollatorBase(): class SpeechCollator(SpeechCollatorBase): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - spectrum_type='linear', # 'linear', 'mfcc', 'fbank' - feat_dim=0, # 'mfcc', 'fbank' - delta_delta=False, # 'mfcc', 'fbank' - stride_ms=10.0, # ms - window_ms=20.0, # ms - n_fft=None, # fft points - max_freq=None, # None for samplerate/2 - target_sample_rate=16000, # target sample rate - use_dB_normalization=True, - target_dB=-20, - dither=1.0, # feature dither - keep_transcription_text=False)) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a SpeechCollator object from a config. @@ -256,45 +227,43 @@ class SpeechCollator(SpeechCollatorBase): Returns: SpeechCollator: collator object. """ - assert 'augmentation_config' in config.collator - assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.collator - assert 'vocab_filepath' in config.collator - assert 'spectrum_type' in config.collator - assert 'n_fft' in config.collator - assert config.collator - - if isinstance(config.collator.augmentation_config, (str, bytes)): - if config.collator.augmentation_config: + assert 'augmentation_config' in config + assert 'keep_transcription_text' in config + assert 'mean_std_filepath' in config + assert 'vocab_filepath' in config + assert 'spectrum_type' in config + assert 'n_fft' in config + assert config + + if isinstance(config.augmentation_config, (str, bytes)): + if config.augmentation_config: aug_file = io.open( - config.collator.augmentation_config, - mode='r', - encoding='utf8') + config.augmentation_config, mode='r', encoding='utf8') else: aug_file = io.StringIO(initial_value='{}', newline='') else: - aug_file = config.collator.augmentation_config + aug_file = config.augmentation_config assert isinstance(aug_file, io.StringIO) speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.collator.mean_std_filepath, - unit_type=config.collator.unit_type, - vocab_filepath=config.collator.vocab_filepath, - spm_model_prefix=config.collator.spm_model_prefix, - spectrum_type=config.collator.spectrum_type, - feat_dim=config.collator.feat_dim, - delta_delta=config.collator.delta_delta, - stride_ms=config.collator.stride_ms, - window_ms=config.collator.window_ms, - n_fft=config.collator.n_fft, - max_freq=config.collator.max_freq, - target_sample_rate=config.collator.target_sample_rate, - use_dB_normalization=config.collator.use_dB_normalization, - target_dB=config.collator.target_dB, - dither=config.collator.dither, - keep_transcription_text=config.collator.keep_transcription_text) + mean_std_filepath=config.mean_std_filepath, + unit_type=config.unit_type, + vocab_filepath=config.vocab_filepath, + spm_model_prefix=config.spm_model_prefix, + spectrum_type=config.spectrum_type, + feat_dim=config.feat_dim, + delta_delta=config.delta_delta, + stride_ms=config.stride_ms, + window_ms=config.window_ms, + n_fft=config.n_fft, + max_freq=config.max_freq, + 
target_sample_rate=config.target_sample_rate, + use_dB_normalization=config.use_dB_normalization, + target_dB=config.target_dB, + dither=config.dither, + keep_transcription_text=config.keep_transcription_text) return speech_collator diff --git a/paddlespeech/s2t/io/converter.py b/paddlespeech/s2t/io/converter.py index b217d2b1b..a802ac749 100644 --- a/paddlespeech/s2t/io/converter.py +++ b/paddlespeech/s2t/io/converter.py @@ -31,11 +31,17 @@ class CustomConverter(): """ - def __init__(self, subsampling_factor=1, dtype=np.float32): + def __init__(self, + subsampling_factor=1, + dtype=np.float32, + load_aux_input=False, + load_aux_output=False): """Construct a CustomConverter object.""" self.subsampling_factor = subsampling_factor self.ignore_id = -1 self.dtype = dtype + self.load_aux_input = load_aux_input + self.load_aux_output = load_aux_output def __call__(self, batch): """Transform a batch and send it to a device. @@ -49,34 +55,53 @@ class CustomConverter(): """ # batch should be located in list assert len(batch) == 1 - (xs, ys), utts = batch[0] - assert xs[0] is not None, "please check Reader and Augmentation impl." - - # perform subsampling - if self.subsampling_factor > 1: - xs = [x[::self.subsampling_factor, :] for x in xs] - - # get batch of lengths of input sequences - ilens = np.array([x.shape[0] for x in xs]) - - # perform padding and convert to tensor - # currently only support real number - if xs[0].dtype.kind == "c": - xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype) - xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype) - # Note(kamo): - # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E. - # Don't create ComplexTensor and give it E2E here - # because torch.nn.DataParellel can't handle it. - xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag} - else: - xs_pad = pad_list(xs, 0).astype(self.dtype) + data, utts = batch[0] + xs_data, ys_data = [], [] + for ud in data: + if ud[0].ndim > 1: + # speech data (input): (speech_len, feat_dim) + xs_data.append(ud) + else: + # text data (output): (text_len, ) + ys_data.append(ud) + + assert xs_data[0][ + 0] is not None, "please check Reader and Augmentation impl." 
+ + xs_pad, ilens = [], [] + for xs in xs_data: + # perform subsampling + if self.subsampling_factor > 1: + xs = [x[::self.subsampling_factor, :] for x in xs] + + # get batch of lengths of input sequences + ilens.append(np.array([x.shape[0] for x in xs])) + + # perform padding and convert to tensor + # currently only support real number + xs_pad.append(pad_list(xs, 0).astype(self.dtype)) + + if not self.load_aux_input: + xs_pad, ilens = xs_pad[0], ilens[0] + break # NOTE: this is for multi-output (e.g., speech translation) - ys_pad = pad_list( - [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys], - self.ignore_id) + ys_pad, olens = [], [] + + for ys in ys_data: + ys_pad.append( + pad_list([ + np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys + ], self.ignore_id)) + + olens.append( + np.array([ + y[0].shape[0] if isinstance(y, tuple) else y.shape[0] + for y in ys + ])) + + if not self.load_aux_output: + ys_pad, olens = ys_pad[0], olens[0] + break - olens = np.array( - [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys]) return utts, xs_pad, ilens, ys_pad, olens diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index b8eb33679..920de34fc 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -18,7 +18,9 @@ from typing import Text import jsonlines import numpy as np +from paddle.io import BatchSampler from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.converter import CustomConverter @@ -73,7 +75,11 @@ class BatchDataLoader(): preprocess_conf=None, n_iter_processes: int=1, subsampling_factor: int=1, - num_encs: int=1): + load_aux_input: bool=False, + load_aux_output: bool=False, + num_encs: int=1, + dist_sampler: bool=False, + shortest_first: bool=False): self.json_file = json_file self.train_mode = train_mode self.use_sortagrad = sortagrad == -1 or sortagrad > 0 @@ -89,6 +95,10 @@ class BatchDataLoader(): self.num_encs = num_encs self.preprocess_conf = preprocess_conf self.n_iter_processes = n_iter_processes + self.load_aux_input = load_aux_input + self.load_aux_output = load_aux_output + self.dist_sampler = dist_sampler + self.shortest_first = shortest_first # read json data with jsonlines.open(json_file, 'r') as reader: @@ -105,7 +115,7 @@ class BatchDataLoader(): maxlen_out, minibatches, # for debug min_batch_size=mini_batch_size, - shortest_first=self.use_sortagrad, + shortest_first=self.shortest_first or self.use_sortagrad, count=batch_count, batch_bins=batch_bins, batch_frames_in=batch_frames_in, @@ -126,21 +136,36 @@ class BatchDataLoader(): # Setup a converter if num_encs == 1: self.converter = CustomConverter( - subsampling_factor=subsampling_factor, dtype=np.float32) + subsampling_factor=subsampling_factor, + dtype=np.float32, + load_aux_input=load_aux_input, + load_aux_output=load_aux_output) else: assert NotImplementedError("not impl CustomConverterMulEnc.") # hack to make batchsize argument as 1 # actual bathsize is included in a list - # default collate function converts numpy array to pytorch tensor + # default collate function converts numpy array to paddle tensor # we used an empty collate function instead which returns list self.dataset = TransformDataset(self.minibaches, self.converter, self.reader) + if self.dist_sampler: + self.batch_sampler = DistributedBatchSampler( + dataset=self.dataset, + batch_size=1, + shuffle=not self.use_sortagrad if self.train_mode 
else False, + drop_last=False, ) + else: + self.batch_sampler = BatchSampler( + dataset=self.dataset, + batch_size=1, + shuffle=not self.use_sortagrad if self.train_mode else False, + drop_last=False, ) + self.dataloader = DataLoader( dataset=self.dataset, - batch_size=1, - shuffle=not self.use_sortagrad if self.train_mode else False, + batch_sampler=self.batch_sampler, collate_fn=batch_collate, num_workers=self.n_iter_processes, ) @@ -168,5 +193,9 @@ class BatchDataLoader(): echo += f"subsampling_factor: {self.subsampling_factor}, " echo += f"num_encs: {self.num_encs}, " echo += f"num_workers: {self.n_iter_processes}, " + echo += f"load_aux_input: {self.load_aux_input}, " + echo += f"load_aux_output: {self.load_aux_output}, " + echo += f"dist_sampler: {self.dist_sampler}, " + echo += f"shortest_first: {self.shortest_first}, " echo += f"file: {self.json_file}" return echo diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index d64d7d3ec..0e94f047b 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -13,11 +13,8 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) # Modified from wenet(https://github.com/wenet-e2e/wenet) -from typing import Optional - import jsonlines from paddle.io import Dataset -from yacs.config import CfgNode from paddlespeech.s2t.frontend.utility import read_manifest from paddlespeech.s2t.utils.log import Log @@ -28,22 +25,6 @@ logger = Log(__name__).getlog() class ManifestDataset(Dataset): - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - manifest="", - max_input_len=27.0, - min_input_len=0.0, - max_output_len=float('inf'), - min_output_len=0.0, - max_output_input_ratio=float('inf'), - min_output_input_ratio=0.0, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - @classmethod def from_config(cls, config): """Build a ManifestDataset object from a config. @@ -54,17 +35,17 @@ class ManifestDataset(Dataset): Returns: ManifestDataset: dataet object. 
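The dataloader change above stops passing `batch_size`/`shuffle` directly to `DataLoader` and instead builds an explicit sampler, optionally a distributed one. A minimal sketch of that selection logic with a toy dataset (class and variable names are illustrative, and a working paddle install is assumed):

```python
from paddle.io import BatchSampler, DataLoader, Dataset, DistributedBatchSampler


class ToyBatches(Dataset):
    """Stand-in: each item is already a full minibatch, hence batch_size=1 below."""

    def __getitem__(self, idx):
        return [idx]

    def __len__(self):
        return 8


def build_sampler(dataset, train_mode=True, use_sortagrad=False, dist_sampler=False):
    shuffle = (not use_sortagrad) if train_mode else False
    sampler_cls = DistributedBatchSampler if dist_sampler else BatchSampler
    return sampler_cls(dataset=dataset, batch_size=1, shuffle=shuffle, drop_last=False)


dataset = ToyBatches()
loader = DataLoader(
    dataset=dataset,
    batch_sampler=build_sampler(dataset),
    collate_fn=lambda batch: batch,  # keep the pre-built minibatch as a plain list
    num_workers=0)
for batch in loader:
    pass
```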
""" - assert 'manifest' in config.data - assert config.data.manifest + assert 'manifest' in config + assert config.manifest dataset = cls( - manifest_path=config.data.manifest, - max_input_len=config.data.max_input_len, - min_input_len=config.data.min_input_len, - max_output_len=config.data.max_output_len, - min_output_len=config.data.min_output_len, - max_output_input_ratio=config.data.max_output_input_ratio, - min_output_input_ratio=config.data.min_output_input_ratio, ) + manifest_path=config.manifest, + max_input_len=config.max_input_len, + min_input_len=config.min_input_len, + max_output_len=config.max_output_len, + min_output_len=config.min_output_len, + max_output_input_ratio=config.max_output_input_ratio, + min_output_input_ratio=config.min_output_input_ratio, ) return dataset def __init__(self, diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 38ff13963..4e136bdce 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -68,7 +68,7 @@ class LoadInputsAndTargets(): if mode not in ["asr"]: raise ValueError("Only asr are allowed: mode={}".format(mode)) - if preprocess_conf is not None: + if preprocess_conf: self.preprocessing = Transformation(preprocess_conf) logger.warning( "[Experimental feature] Some preprocessing will be done " @@ -82,12 +82,11 @@ class LoadInputsAndTargets(): self.load_output = load_output self.load_input = load_input self.sort_in_input_length = sort_in_input_length - if preprocess_args is None: - self.preprocess_args = {} - else: + if preprocess_args: assert isinstance(preprocess_args, dict), type(preprocess_args) self.preprocess_args = dict(preprocess_args) - + else: + self.preprocess_args = {} self.keep_all_data_on_mem = keep_all_data_on_mem def __call__(self, batch, return_uttid=False): diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index 0dfaec29c..4a4d67ce9 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -12,11 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Deepspeech2 ASR Model""" -from typing import Optional - import paddle from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.models.ds2.conv import ConvStack from paddlespeech.s2t.models.ds2.rnn import RNNStack @@ -120,20 +117,6 @@ class DeepSpeech2Model(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True, #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
- ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, feat_size, dict_size, @@ -168,13 +151,13 @@ class DeepSpeech2Model(nn.Layer): """Compute Model loss Args: - audio (Tenosr): [B, T, D] + audio (Tensors): [B, T, D] audio_len (Tensor): [B] text (Tensor): [B, U] text_len (Tensor): [B] Returns: - loss (Tenosr): [1] + loss (Tensor): [1] """ eouts, eouts_len = self.encoder(audio, audio_len) loss = self.decoder(eouts, eouts_len, text, text_len) @@ -221,12 +204,12 @@ class DeepSpeech2Model(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - use_gru=config.model.use_gru, - share_rnn_weights=config.model.share_rnn_weights, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + use_gru=config.use_gru, + share_rnn_weights=config.share_rnn_weights, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -240,7 +223,7 @@ class DeepSpeech2Model(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2Model diff --git a/paddlespeech/s2t/models/ds2_online/deepspeech2.py b/paddlespeech/s2t/models/ds2_online/deepspeech2.py index 85876bce8..5e4981c06 100644 --- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Deepspeech2 ASR Online Model""" -from typing import Optional - import paddle import paddle.nn.functional as F from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.models.ds2_online.conv import Conv2dSubsampling4Online from paddlespeech.s2t.modules.ctc import CTCDecoder @@ -244,22 +241,6 @@ class DeepSpeech2ModelOnline(nn.Layer): :rtype: tuple of LayerOutput """ - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - default = CfgNode( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=4, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - num_fc_layers=2, - fc_layers_size_list=[512, 256], - use_gru=True, #Use gru if set True. Use simple rnn if set False. 
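A recurring pattern in this diff is the move from nested config reads (`config.model.*`, `config.collator.*`, `config.data.*`) to a single flat config. A tiny illustration with `yacs`, reusing the default values from the removed `params()` block above:

```python
from yacs.config import CfgNode

# Flat config: keys that used to live under `config.model` now sit at the top level.
config = CfgNode(
    dict(
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_layer_size=1024,
        use_gru=True,
        share_rnn_weights=True, ))

print(config.num_rnn_layers)  # was: config.model.num_rnn_layers
print(config.get('ctc_grad_norm_type', None))  # optional keys read via .get() with a default
```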
- blank_id=0, # index of blank in vocob.txt - ctc_grad_norm_type=None, )) - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__( self, feat_size, @@ -298,13 +279,13 @@ class DeepSpeech2ModelOnline(nn.Layer): """Compute Model loss Args: - audio (Tenosr): [B, T, D] + audio (Tensor): [B, T, D] audio_len (Tensor): [B] text (Tensor): [B, U] text_len (Tensor): [B] Returns: - loss (Tenosr): [1] + loss (Tensor): [1] """ eouts, eouts_len, final_state_h_box, final_state_c_box = self.encoder( audio, audio_len, None, None) @@ -353,14 +334,14 @@ class DeepSpeech2ModelOnline(nn.Layer): model = cls( feat_size=dataloader.collate_fn.feature_size, dict_size=dataloader.collate_fn.vocab_size, - num_conv_layers=config.model.num_conv_layers, - num_rnn_layers=config.model.num_rnn_layers, - rnn_size=config.model.rnn_layer_size, - rnn_direction=config.model.rnn_direction, - num_fc_layers=config.model.num_fc_layers, - fc_layers_size_list=config.model.fc_layers_size_list, - use_gru=config.model.use_gru, - blank_id=config.model.blank_id, + num_conv_layers=config.num_conv_layers, + num_rnn_layers=config.num_rnn_layers, + rnn_size=config.rnn_layer_size, + rnn_direction=config.rnn_direction, + num_fc_layers=config.num_fc_layers, + fc_layers_size_list=config.fc_layers_size_list, + use_gru=config.use_gru, + blank_id=config.blank_id, ctc_grad_norm_type=config.get('ctc_grad_norm_type', None), ) infos = Checkpoint().load_parameters( model, checkpoint_path=checkpoint_path) @@ -374,7 +355,7 @@ class DeepSpeech2ModelOnline(nn.Layer): Parameters config: yacs.config.CfgNode - config.model + config Returns ------- DeepSpeech2ModelOnline diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 2d6fb2180..ff4012e8e 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -26,7 +26,6 @@ from typing import Tuple import paddle from paddle import jit from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer from paddlespeech.s2t.frontend.utility import IGNORE_ID @@ -60,56 +59,6 @@ logger = Log(__name__).getlog() class U2BaseModel(ASRInterface, nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - ctc_weight=0.3, - lsm_weight=0.1, # 
label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -584,8 +533,9 @@ class U2BaseModel(ASRInterface, nn.Layer): hyp_content = hyp[0] # Prevent the hyp is empty if len(hyp_content) == 0: - hyp_content = (self.ctc.blank_id,) - hyp_content = paddle.to_tensor(hyp_content, place=device, dtype=paddle.long) + hyp_content = (self.ctc.blank_id, ) + hyp_content = paddle.to_tensor( + hyp_content, place=device, dtype=paddle.long) hyp_list.append(hyp_content) hyps_pad = pad_sequence(hyp_list, True, self.ignore_id) hyps_lens = paddle.to_tensor( @@ -730,8 +680,8 @@ class U2BaseModel(ASRInterface, nn.Layer): """u2 decoding. Args: - feats (Tenosr): audio features, (B, T, D) - feats_lengths (Tenosr): (B) + feats (Tensor): audio features, (B, T, D) + feats_lengths (Tensor): (B) text_feature (TextFeaturizer): text feature object. decoding_method (str): decoding mode, e.g. 'attention', 'ctc_greedy_search', diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 8b07e389d..79ca423f8 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """U2 ASR Model -Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition +Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) """ import time @@ -24,7 +24,6 @@ from typing import Tuple import paddle from paddle import jit from paddle import nn -from yacs.config import CfgNode from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn @@ -52,57 +51,6 @@ logger = Log(__name__).getlog() class U2STBaseModel(nn.Layer): """CTC-Attention hybrid Encoder-Decoder model""" - @classmethod - def params(cls, config: Optional[CfgNode]=None) -> CfgNode: - # network architecture - default = CfgNode() - # allow add new item when merge_with_file - default.cmvn_file = "" - default.cmvn_file_type = "json" - default.input_dim = 0 - default.output_dim = 0 - # encoder related - default.encoder = 'transformer' - default.encoder_conf = CfgNode( - dict( - output_size=256, # dimension of attention - attention_heads=4, - linear_units=2048, # the number of units of position-wise feed forward - num_blocks=12, # the number of encoder blocks - dropout_rate=0.1, - positional_dropout_rate=0.1, - attention_dropout_rate=0.0, - input_layer='conv2d', # encoder input type, you can chose conv2d, conv2d6 and conv2d8 - normalize_before=True, - # use_cnn_module=True, - # cnn_module_kernel=15, - # activation_type='swish', - # pos_enc_layer_type='rel_pos', - # selfattention_layer_type='rel_selfattn', - )) - # decoder related - default.decoder = 'transformer' - default.decoder_conf = CfgNode( - dict( - attention_heads=4, - linear_units=2048, - num_blocks=6, - dropout_rate=0.1, - positional_dropout_rate=0.1, - self_attention_dropout_rate=0.0, - src_attention_dropout_rate=0.0, )) - # hybrid CTC/attention - default.model_conf = CfgNode( - dict( - asr_weight=0.0, - ctc_weight=0.0, - lsm_weight=0.1, # label smoothing option - length_normalized_loss=False, )) - - if config is not None: - config.merge_from_other_cfg(default) - return default - def __init__(self, vocab_size: int, encoder: TransformerEncoder, @@ -289,8 
+237,8 @@ class U2STBaseModel(nn.Layer): simulate_streaming (bool, optional): streaming or not. Defaults to False. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: - encoder hiddens (B, Tmax, D), + Tuple[paddle.Tensor, paddle.Tensor]: + encoder hiddens (B, Tmax, D), encoder hiddens mask (B, 1, Tmax). """ # Let's assume B = batch_size @@ -530,24 +478,24 @@ class U2STBaseModel(nn.Layer): """u2 decoding. Args: - feats (Tenosr): audio features, (B, T, D) - feats_lengths (Tenosr): (B) + feats (Tensor): audio features, (B, T, D) + feats_lengths (Tensor): (B) text_feature (TextFeaturizer): text feature object. - decoding_method (str): decoding mode, e.g. - 'fullsentence', + decoding_method (str): decoding mode, e.g. + 'fullsentence', 'simultaneous' beam_size (int): beam size for search decoding_chunk_size (int, optional): decoding chunk size. Defaults to -1. <0: for decoding, use full chunk. >0: for decoding, use fixed chunk size as set. - 0: used for training, it's prohibited here. - num_decoding_left_chunks (int, optional): + 0: used for training, it's prohibited here. + num_decoding_left_chunks (int, optional): number of left chunks for decoding. Defaults to -1. simulate_streaming (bool, optional): simulate streaming inference. Defaults to False. Raises: ValueError: when not support decoding_method. - + Returns: List[List[int]]: transcripts. """ @@ -601,7 +549,7 @@ class U2STModel(U2STBaseModel): ValueError: raise when using not support encoder type. Returns: - int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc + int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ if configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index ffc9f0387..1f9838077 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -39,10 +39,6 @@ except ImportError: except Exception as e: logger.info("paddlespeech_ctcdecoders not installed!") -#try: -#except Exception as e: -# logger.info("ctcdecoder not installed!") - __all__ = ['CTCDecoder'] @@ -85,10 +81,10 @@ class CTCDecoderBase(nn.Layer): Args: hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D) hlens (Tensor): batch of lengths of hidden state sequences (B) - ys_pad (Tenosr): batch of padded character id sequence tensor (B, Lmax) + ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax) ys_lens (Tensor): batch of lengths of character sequence (B) Returns: - loss (Tenosr): ctc loss value, scalar. + loss (Tensor): ctc loss value, scalar. """ logits = self.ctc_lo(self.dropout(hs_pad)) loss = self.criterion(logits, ys_pad, hlens, ys_lens) @@ -256,8 +252,8 @@ class CTCDecoder(CTCDecoderBase): """ctc decoding with probs. 
Args: - probs (Tenosr): activation after softmax - logits_lens (Tenosr): audio output lens + probs (Tensor): activation after softmax + logits_lens (Tensor): audio output lens vocab_list ([type]): [description] decoding_method ([type]): [description] lang_model_path ([type]): [description] diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py index d6b63761b..1f66c015a 100644 --- a/paddlespeech/s2t/modules/mask.py +++ b/paddlespeech/s2t/modules/mask.py @@ -54,7 +54,7 @@ def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor: [0, 0, 0, 1, 1], [0, 0, 1, 1, 1]] """ - # (TODO: Hui Zhang): jit not support Tenosr.dim() and Tensor.ndim + # (TODO: Hui Zhang): jit not support Tensor.dim() and Tensor.ndim # assert lengths.dim() == 1 batch_size = int(lengths.shape[0]) max_len = int(lengths.max()) diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py index 3ef871c5d..bb85732a6 100644 --- a/paddlespeech/s2t/training/cli.py +++ b/paddlespeech/s2t/training/cli.py @@ -97,6 +97,14 @@ def default_argument_parser(parser=None): train_group.add_argument( "--dump-config", metavar="FILE", help="dump config to `this` file.") + test_group = parser.add_argument_group( + title='Test Options', description=None) + + test_group.add_argument( + "--decode_cfg", + metavar="DECODE_CONFIG_FILE", + help="decode config file.") + profile_group = parser.add_argument_group( title='Benchmark Options', description=None) profile_group.add_argument( diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py index 0222246e8..b22f7ef85 100644 --- a/paddlespeech/s2t/training/scheduler.py +++ b/paddlespeech/s2t/training/scheduler.py @@ -67,18 +67,19 @@ class WarmupLR(LRScheduler): super().__init__(learning_rate, last_epoch, verbose) def __repr__(self): - return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" + return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps}, lr={self.base_lr}, last_epoch={self.last_epoch})" def get_lr(self): + # self.last_epoch start from zero step_num = self.last_epoch + 1 return self.base_lr * self.warmup_steps**0.5 * min( step_num**-0.5, step_num * self.warmup_steps**-1.5) def set_step(self, step: int=None): ''' - It will update the learning rate in optimizer according to current ``epoch`` . + It will update the learning rate in optimizer according to current ``epoch`` . The new learning rate will take effect on next ``optimizer.step`` . - + Args: step (int, None): specify current epoch. Default: None. Auto-increment from last_epoch=-1. Returns: @@ -94,7 +95,7 @@ class ConstantLR(LRScheduler): learning_rate (float): The initial learning rate. It is a python float number. last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . - + Returns: ``ConstantLR`` instance to schedule learning rate. 
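The `make_pad_mask` docstring touched above carries a worked example. Here is a minimal NumPy re-implementation of that example, purely illustrative and not the paddle code itself:

```python
import numpy as np


def make_pad_mask(lengths):
    """Return a (B, Tmax) mask where 1 marks padding positions."""
    max_len = int(max(lengths))
    positions = np.arange(max_len)  # (Tmax,)
    return (positions[None, :] >= np.array(lengths)[:, None]).astype(int)


print(make_pad_mask([5, 3, 2]))
# [[0 0 0 0 0]
#  [0 0 0 1 1]
#  [0 0 1 1 1]]
```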
""" diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index 9bf1ca4db..cac5e5704 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -117,8 +117,8 @@ class Trainer(): self.init_parallel() self.checkpoint = Checkpoint( - kbest_n=self.config.training.checkpoint.kbest_n, - latest_n=self.config.training.checkpoint.latest_n) + kbest_n=self.config.checkpoint.kbest_n, + latest_n=self.config.checkpoint.latest_n) # set random seed if needed if args.seed: @@ -129,8 +129,8 @@ class Trainer(): if hasattr(self.args, "benchmark_batch_size") and self.args.benchmark_batch_size: with UpdateConfig(self.config): - self.config.collator.batch_size = self.args.benchmark_batch_size - self.config.training.log_interval = 1 + self.config.batch_size = self.args.benchmark_batch_size + self.config.log_interval = 1 logger.info( f"Benchmark reset batch-size: {self.args.benchmark_batch_size}") @@ -222,7 +222,7 @@ class Trainer(): batch_sampler = self.train_loader.batch_sampler if isinstance(batch_sampler, paddle.io.DistributedBatchSampler): logger.debug( - f"train_loader.batch_sample set epoch: {self.epoch}") + f"train_loader.batch_sample.set_epoch: {self.epoch}") batch_sampler.set_epoch(self.epoch) def before_train(self): @@ -260,7 +260,7 @@ class Trainer(): self.before_train() logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") - while self.epoch < self.config.training.n_epoch: + while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() try: diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index f35adef0c..a6346c344 100644 --- a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -319,7 +319,7 @@ class LogMelSpectrogramKaldi(): fmin=20, fmax=None, eps=1e-10, - dither=False): + dither=1.0): self.fs = fs self.n_mels = n_mels self.n_fft = n_fft @@ -374,7 +374,7 @@ class LogMelSpectrogramKaldi(): Returns: np.ndarray: (T, D) """ - dither = self.dither if train else False + dither = self.dither if train else 0.0 if x.ndim != 1: raise ValueError("Not support x: [Time, Channel]") diff --git a/paddlespeech/s2t/utils/bleu_score.py b/paddlespeech/s2t/utils/bleu_score.py index ea32fcf95..a50c000ae 100644 --- a/paddlespeech/s2t/utils/bleu_score.py +++ b/paddlespeech/s2t/utils/bleu_score.py @@ -14,7 +14,6 @@ """This module provides functions to calculate bleu score in different level. e.g. wer for word-level, cer for char-level. 
""" -import nltk import numpy as np import sacrebleu @@ -114,6 +113,5 @@ class ErrorCalculator(): seq_true_text = "".join(seq_true).replace(self.space, " ") seqs_hat.append(seq_hat_text) seqs_true.append(seq_true_text) - bleu = nltk.bleu_score.corpus_bleu([[ref] for ref in seqs_true], - seqs_hat) - return bleu * 100 + bleu = sacrebleu.corpus_bleu(seqs_hat, [[ref] for ref in seqs_true]) + return bleu.score * 100 diff --git a/paddlespeech/s2t/utils/dynamic_import.py b/paddlespeech/s2t/utils/dynamic_import.py index 50bd73a6d..bd738edf8 100644 --- a/paddlespeech/s2t/utils/dynamic_import.py +++ b/paddlespeech/s2t/utils/dynamic_import.py @@ -57,7 +57,7 @@ def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]): return new_args -def filter_out_tenosr(args: Dict[Text, Any]): +def filter_out_tensor(args: Dict[Text, Any]): return {key: val for key, val in args.items() if not has_tensor(val)} @@ -65,5 +65,5 @@ def instance_class(module_class, args: Dict[Text, Any]): valid_keys = inspect.signature(module_class).parameters.keys() new_args = filter_valid_args(args, valid_keys) logger.info( - f"Instance: {module_class.__name__} {filter_out_tenosr(new_args)}.") + f"Instance: {module_class.__name__} {filter_out_tensor(new_args)}.") return module_class(**new_args) diff --git a/paddlespeech/s2t/utils/log.py b/paddlespeech/s2t/utils/log.py index 1790efdb1..4f51b7f05 100644 --- a/paddlespeech/s2t/utils/log.py +++ b/paddlespeech/s2t/utils/log.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import getpass +import inspect import os import socket import sys @@ -94,15 +95,31 @@ def find_log_dir_and_names(program_name=None, log_dir=None): class Log(): """Default Logger for all.""" logger.remove() - logger.add( - sys.stdout, - level='INFO', - enqueue=True, - filter=lambda record: record['level'].no >= 20) - _, file_prefix, _ = find_log_dir_and_names() - sink_prefix = os.path.join("exp/log", file_prefix) - sink_path = sink_prefix[:-3] + "{time}.log" - logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB") + + _call_from_cli = False + _frame = inspect.currentframe() + while _frame: + if 'paddlespeech/cli/__init__.py' in _frame.f_code.co_filename or 'paddlespeech/t2s' in _frame.f_code.co_filename: + _call_from_cli = True + break + _frame = _frame.f_back + + if _call_from_cli: + logger.add( + sys.stdout, + level='ERROR', + enqueue=True, + filter=lambda record: record['level'].no >= 20) + else: + logger.add( + sys.stdout, + level='INFO', + enqueue=True, + filter=lambda record: record['level'].no >= 20) + _, file_prefix, _ = find_log_dir_and_names() + sink_prefix = os.path.join("exp/log", file_prefix) + sink_path = sink_prefix[:-3] + "{time}.log" + logger.add(sink_path, level='DEBUG', enqueue=True, rotation="500 MB") def __init__(self, name=None): pass diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index 73c798166..dc1be8159 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -130,7 +130,7 @@ def get_subsample(config): Returns: int: subsample rate. 
""" - input_layer = config["model"]["encoder_conf"]["input_layer"] + input_layer = config["encoder_conf"]["input_layer"] assert input_layer in ["conv2d", "conv2d6", "conv2d8"] if input_layer == "conv2d": return 4 diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index 9470f9234..526871a23 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -17,7 +17,7 @@ import paddle from paddlespeech.t2s.data.batch import batch_sequences -def speedyspeech_batch_fn(examples): +def speedyspeech_single_spk_batch_fn(examples): # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] phones = [np.array(item["phones"], dtype=np.int64) for item in examples] tones = [np.array(item["tones"], dtype=np.int64) for item in examples] @@ -55,6 +55,48 @@ def speedyspeech_batch_fn(examples): return batch +def speedyspeech_multi_spk_batch_fn(examples): + # fields = ["phones", "tones", "num_phones", "num_frames", "feats", "durations"] + phones = [np.array(item["phones"], dtype=np.int64) for item in examples] + tones = [np.array(item["tones"], dtype=np.int64) for item in examples] + feats = [np.array(item["feats"], dtype=np.float32) for item in examples] + durations = [ + np.array(item["durations"], dtype=np.int64) for item in examples + ] + num_phones = [ + np.array(item["num_phones"], dtype=np.int64) for item in examples + ] + num_frames = [ + np.array(item["num_frames"], dtype=np.int64) for item in examples + ] + + phones = batch_sequences(phones) + tones = batch_sequences(tones) + feats = batch_sequences(feats) + durations = batch_sequences(durations) + + # convert each batch to paddle.Tensor + phones = paddle.to_tensor(phones) + tones = paddle.to_tensor(tones) + feats = paddle.to_tensor(feats) + durations = paddle.to_tensor(durations) + num_phones = paddle.to_tensor(num_phones) + num_frames = paddle.to_tensor(num_frames) + batch = { + "phones": phones, + "tones": tones, + "num_phones": num_phones, + "num_frames": num_frames, + "feats": feats, + "durations": durations, + } + if "spk_id" in examples[0]: + spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] + spk_id = paddle.to_tensor(spk_id) + batch["spk_id"] = spk_id + return batch + + def fastspeech2_single_spk_batch_fn(examples): # fields = ["text", "text_lengths", "speech", "speech_lengths", "durations", "pitch", "energy"] text = [np.array(item["text"], dtype=np.int64) for item in examples] diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 8a9ef370c..4ddd19f72 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -21,6 +21,8 @@ import numpy as np import paddle import yaml from yacs.config import CfgNode +from tqdm import tqdm +import os from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence @@ -30,6 +32,8 @@ from paddlespeech.t2s.modules.normalizer import ZScore def evaluate(args, fastspeech2_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() # construct dataset for evaluation with open(args.phones_dict, "r") as f: @@ -41,9 +45,16 @@ def evaluate(args, fastspeech2_config): for phn, id in phn_id: phone_dict[phn] = int(id) + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: 
+ spk_num=None + odim = fastspeech2_config.n_mels model = FastSpeech2( - idim=vocab_size, odim=odim, **fastspeech2_config["model"]) + idim=vocab_size, odim=odim, **fastspeech2_config["model"], spk_num=spk_num) model.set_state_dict( paddle.load(args.fastspeech2_checkpoint)["main_params"]) @@ -65,7 +76,34 @@ def evaluate(args, fastspeech2_config): sentences, speaker_set = get_phn_dur(args.dur_file) merge_silence(sentences) - for i, utt_id in enumerate(sentences): + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] + dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] + test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files] + + for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] durations = sentences[utt_id][1] speaker = sentences[utt_id][2] @@ -82,21 +120,30 @@ def evaluate(args, fastspeech2_config): phone_ids = [phone_dict[phn] for phn in phones] phone_ids = paddle.to_tensor(np.array(phone_ids)) + + if args.speaker_dict: + speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + durations = paddle.to_tensor(np.array(durations)) # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 # split data into 3 sections - if args.dataset == "baker": - num_train = 9800 - num_dev = 100 - if i in range(0, num_train): + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: sub_output_dir = output_dir / ("train/raw") - elif i in range(num_train, num_train + num_dev): + elif wav_path in dev_wav_files: sub_output_dir = output_dir / ("dev/raw") - else: + elif wav_path in test_wav_files: sub_output_dir = output_dir / ("test/raw") + sub_output_dir.mkdir(parents=True, exist_ok=True) + with paddle.no_grad(): - mel = fastspeech2_inference(phone_ids, durations=durations) + mel = fastspeech2_inference(phone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -109,6 +156,8 @@ def main(): default="baker", type=str, help="name of dataset, should in {baker, ljspeech, vctk} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") parser.add_argument( "--fastspeech2-config", type=str, help="fastspeech2 config file.") parser.add_argument( @@ -126,13 +175,18 @@ def main(): type=str, default="phone_id_map.txt", help="phone vocabulary file.") + + parser.add_argument( + "--speaker-dict", + type=str, + default=None, + help="speaker id map file.") parser.add_argument( "--dur-file", default=None, type=str, help="path to durations.txt.") parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu 
== 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): return True if str.lower() == 'true' else False diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index fafded6fc..1dfa575a1 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -174,7 +174,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index f0e7708f3..9ac6cbd34 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -242,8 +242,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Train a HiFiGAN model.") + parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") parser.add_argument( "--config", type=str, help="config file to overwrite default config.") parser.add_argument("--train-metadata", type=str, help="training data.") @@ -251,7 +250,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index a44d2d3c2..3d0ff7d35 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -239,7 +239,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py index ca2e3f550..f5affb50b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py @@ -93,7 +93,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 98b0ed717..a7881d6bb 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -216,7 +216,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") benchmark_group = parser.add_argument_group( 
'benchmark', 'arguments related to benchmark.') diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index bc7464678..b162260d6 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -223,8 +223,7 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Train a Multi-Band MelGAN model.") + parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") parser.add_argument( "--config", type=str, help="config file to overwrite default config.") parser.add_argument("--train-metadata", type=str, help="training data.") @@ -232,7 +231,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py index 6f4dc92db..c60b9add2 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py @@ -42,7 +42,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py new file mode 100644 index 000000000..b6440fd6f --- /dev/null +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -0,0 +1,246 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# 长度和原本的 mel 不一致怎么办? 
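Both `gen_gta_mel.py` scripts in this diff resolve a speaker id from a `speaker_dict` file of `<speaker> <id>` lines. A small sketch of that lookup done once as a plain dict; the file path and toy entries below are hypothetical:

```python
def load_spk_map(path: str) -> dict:
    """Parse '<speaker> <id>' lines into {speaker: int_id}."""
    with open(path, 'rt') as f:
        return {spk: int(idx) for spk, idx in (line.strip().split() for line in f)}


# spk_map = load_spk_map("dump/speaker_id_map.txt")  # hypothetical path
spk_map = {"SSB0005": 0, "SSB0009": 1}               # toy stand-in
speaker_id = spk_map["SSB0009"]                      # replaces the per-utterance list scan
print(speaker_id)
```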
+import argparse +import os +from pathlib import Path + +import numpy as np +import paddle +import yaml +from tqdm import tqdm +from yacs.config import CfgNode + +from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur +from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.models.speedyspeech import SpeedySpeech +from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference +from paddlespeech.t2s.modules.normalizer import ZScore + + +def evaluate(args, speedyspeech_config): + rootdir = Path(args.rootdir).expanduser() + assert rootdir.is_dir() + + # construct dataset for evaluation + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + phone_dict = {} + for phn, id in phn_id: + phone_dict[phn] = int(id) + + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + + if args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id_list = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id_list) + else: + spk_num = None + + model = SpeedySpeech( + vocab_size=vocab_size, + tone_size=tone_size, + **speedyspeech_config["model"], + spk_num=spk_num) + + model.set_state_dict( + paddle.load(args.speedyspeech_checkpoint)["main_params"]) + model.eval() + + stat = np.load(args.speedyspeech_stat) + mu, std = stat + mu = paddle.to_tensor(mu) + std = paddle.to_tensor(std) + speedyspeech_normalizer = ZScore(mu, std) + + speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer, + model) + speedyspeech_inference.eval() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + sentences, speaker_set = get_phn_dur(args.dur_file) + merge_silence(sentences) + + if args.dataset == "baker": + wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) + # split data into 3 sections + num_train = 9800 + num_dev = 100 + train_wav_files = wav_files[:num_train] + dev_wav_files = wav_files[num_train:num_train + num_dev] + test_wav_files = wav_files[num_train + num_dev:] + elif args.dataset == "aishell3": + sub_num_dev = 5 + wav_dir = rootdir / "train" / "wav" + train_wav_files = [] + dev_wav_files = [] + test_wav_files = [] + for speaker in os.listdir(wav_dir): + wav_files = sorted(list((wav_dir / speaker).rglob("*.wav"))) + if len(wav_files) > 100: + train_wav_files += wav_files[:-sub_num_dev * 2] + dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev] + test_wav_files += wav_files[-sub_num_dev:] + else: + train_wav_files += wav_files + + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] + + for i, utt_id in enumerate(tqdm(sentences)): + phones = sentences[utt_id][0] + durations = sentences[utt_id][1] + speaker = sentences[utt_id][2] + # 裁剪掉开头和结尾的 sil + if args.cut_sil: + if phones[0] == "sil" and len(durations) > 1: + durations = durations[1:] + phones = phones[1:] + if phones[-1] == 'sil' and len(durations) > 1: + durations = durations[:-1] + phones = phones[:-1] + + phones, tones = 
frontend._get_phone_tone(phones, get_tone_ids=True) + if tones: + tone_ids = frontend._t2id(tones) + tone_ids = paddle.to_tensor(tone_ids) + if phones: + phone_ids = frontend._p2id(phones) + phone_ids = paddle.to_tensor(phone_ids) + + if args.speaker_dict: + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = paddle.to_tensor(speaker_id) + else: + speaker_id = None + + durations = paddle.to_tensor(np.array(durations)) + durations = paddle.unsqueeze(durations, axis=0) + + # 生成的和真实的可能有 1, 2 帧的差距,但是 batch_fn 会修复 + # split data into 3 sections + + wav_path = utt_id + ".wav" + + if wav_path in train_wav_files: + sub_output_dir = output_dir / ("train/raw") + elif wav_path in dev_wav_files: + sub_output_dir = output_dir / ("dev/raw") + elif wav_path in test_wav_files: + sub_output_dir = output_dir / ("test/raw") + + sub_output_dir.mkdir(parents=True, exist_ok=True) + + with paddle.no_grad(): + mel = speedyspeech_inference( + phone_ids, tone_ids, durations=durations, spk_id=speaker_id) + np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with speedyspeech & parallel wavegan.") + parser.add_argument( + "--dataset", + default="baker", + type=str, + help="name of dataset, should in {baker, ljspeech, vctk} now") + parser.add_argument( + "--rootdir", default=None, type=str, help="directory to dataset.") + parser.add_argument( + "--speedyspeech-config", type=str, help="speedyspeech config file.") + parser.add_argument( + "--speedyspeech-checkpoint", + type=str, + help="speedyspeech checkpoint to load.") + parser.add_argument( + "--speedyspeech-stat", + type=str, + help="mean and standard deviation used to normalize spectrogram when training speedyspeech." 
+    )
+
+    parser.add_argument(
+        "--phones-dict",
+        type=str,
+        default="phone_id_map.txt",
+        help="phone vocabulary file.")
+    parser.add_argument(
+        "--tones-dict",
+        type=str,
+        default="tone_id_map.txt",
+        help="tone vocabulary file.")
+    parser.add_argument(
+        "--speaker-dict", type=str, default=None, help="speaker id map file.")
+
+    parser.add_argument(
+        "--dur-file", default=None, type=str, help="path to durations.txt.")
+    parser.add_argument("--output-dir", type=str, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+
+    def str2bool(str):
+        return True if str.lower() == 'true' else False
+
+    parser.add_argument(
+        "--cut-sil",
+        type=str2bool,
+        default=True,
+        help="whether cut sil in the edge of audio")
+
+    args = parser.parse_args()
+
+    if args.ngpu == 0:
+        paddle.set_device("cpu")
+    elif args.ngpu > 0:
+        paddle.set_device("gpu")
+    else:
+        print("ngpu should >= 0 !")
+
+    with open(args.speedyspeech_config) as f:
+        speedyspeech_config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(speedyspeech_config)
+
+    evaluate(args, speedyspeech_config)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py
index 91d15c40b..a427c4692 100644
--- a/paddlespeech/t2s/exps/speedyspeech/normalize.py
+++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py
@@ -47,7 +47,8 @@ def main():
         "--phones-dict", type=str, default=None, help="phone vocabulary file.")
     parser.add_argument(
         "--tones-dict", type=str, default=None, help="tone vocabulary file.")
-
+    parser.add_argument(
+        "--speaker-dict", type=str, default=None, help="speaker id map file.")
     parser.add_argument(
         "--verbose",
         type=int,
@@ -121,6 +122,12 @@ def main():
     for tone, id in tone_id:
         vocab_tones[tone] = int(id)
 
+    vocab_speaker = {}
+    with open(args.speaker_dict, 'rt') as f:
+        spk_id = [line.strip().split() for line in f.readlines()]
+    for spk, id in spk_id:
+        vocab_speaker[spk] = int(id)
+
     # process each file
     output_metadata = []
 
@@ -135,11 +142,13 @@ def main():
             np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
         phone_ids = [vocab_phones[p] for p in item['phones']]
         tone_ids = [vocab_tones[p] for p in item['tones']]
+        spk_id = vocab_speaker[item["speaker"]]
         if args.use_relative_path:
             # convert absolute path to relative path:
             mel_path = mel_path.relative_to(dumpdir)
         output_metadata.append({
             'utt_id': utt_id,
+            "spk_id": spk_id,
             'phones': phone_ids,
             'tones': tone_ids,
             'num_phones': item['num_phones'],
diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
index aa589d5a3..9ff771442 100644
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -31,6 +31,7 @@ from paddlespeech.t2s.data.get_feats import LogMelFBank
 from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
 from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
 from paddlespeech.t2s.datasets.preprocess_utils import get_phones_tones
+from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
 from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
 
 
@@ -101,6 +102,7 @@ def process_sentence(config: Dict[str, Any],
         "utt_id": utt_id,
         "phones": phones,
         "tones": tones,
+        "speaker": speaker,
         "num_phones": len(phones),
         "num_frames": num_frames,
         "durations":
durations, @@ -229,6 +231,8 @@ def main(): tone_id_map_path = dumpdir / "tone_id_map.txt" get_phones_tones(sentences, phone_id_map_path, tone_id_map_path, args.dataset) + speaker_id_map_path = dumpdir / "speaker_id_map.txt" + get_spk_id_map(speaker_set, speaker_id_map_path) if args.dataset == "baker": wav_files = sorted(list((rootdir / "Wave").rglob("*.wav"))) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 2854d0555..cb742c595 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -173,7 +173,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir") parser.add_argument( "--inference-dir", type=str, help="dir to save inference models") - parser.add_argument("--verbose", type=int, default=1, help="verbose") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 001e22aea..448cd7bbf 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -27,7 +27,8 @@ from paddle.io import DataLoader from paddle.io import DistributedBatchSampler from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_multi_spk_batch_fn +from paddlespeech.t2s.datasets.am_batch_fn import speedyspeech_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechEvaluator @@ -57,6 +58,23 @@ def train_sp(args, config): f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", ) + fields = [ + "phones", "tones", "num_phones", "num_frames", "feats", "durations" + ] + + spk_num = None + if args.speaker_dict is not None: + print("multiple speaker speedyspeech!") + collate_fn = speedyspeech_multi_spk_batch_fn + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + fields += ["spk_id"] + else: + print("single speaker speedyspeech!") + collate_fn = speedyspeech_single_spk_batch_fn + print("spk_num:", spk_num) + # dataloader has been too verbose logging.getLogger("DataLoader").disabled = True @@ -71,9 +89,7 @@ def train_sp(args, config): train_dataset = DataTable( data=train_metadata, - fields=[ - "phones", "tones", "num_phones", "num_frames", "feats", "durations" - ], + fields=fields, converters={ "feats": np.load, }, ) @@ -87,9 +103,7 @@ def train_sp(args, config): dev_dataset = DataTable( data=dev_metadata, - fields=[ - "phones", "tones", "num_phones", "num_frames", "feats", "durations" - ], + fields=fields, converters={ "feats": np.load, }, ) @@ -105,14 +119,14 @@ def train_sp(args, config): train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=speedyspeech_batch_fn, + collate_fn=collate_fn, num_workers=config.num_workers) dev_dataloader = DataLoader( dev_dataset, shuffle=False, drop_last=False, batch_size=config.batch_size, - collate_fn=speedyspeech_batch_fn, + collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") with open(args.phones_dict, "r") as f: @@ -125,7 +139,10 @@ def train_sp(args, config): print("tone_size:", tone_size) model = SpeedySpeech( - 
vocab_size=vocab_size, tone_size=tone_size, **config["model"]) + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **config["model"]) if world_size > 1: model = DataParallel(model) print("model done!") @@ -168,7 +185,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") def str2bool(str): return True if str.lower() == 'true' else False @@ -185,6 +201,12 @@ def main(): parser.add_argument( "--tones-dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker-dict", + type=str, + default=None, + help="speaker id map file for multiple speaker model.") + # 这里可以多传入 max_epoch 等 args, rest = parser.parse_known_args() diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 9a83ec1bc..15ed1e4d4 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -196,41 +196,50 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) - + merge_sentences = False for utt_id, sentence in sentences: get_tone_ids = False if am_name == 'speedyspeech': get_tone_ids = True if args.lang == 'zh': input_ids = frontend.get_input_ids( - sentence, merge_sentences=True, get_tone_ids=get_tone_ids) + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) phone_ids = input_ids["phone_ids"] - phone_ids = phone_ids[0] if get_tone_ids: tone_ids = input_ids["tone_ids"] - tone_ids = tone_ids[0] elif args.lang == 'en': - input_ids = frontend.get_input_ids(sentence) + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - with paddle.no_grad(): - # acoustic model - if am_name == 'fastspeech2': - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(phone_ids, spk_id) + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + # acoustic model + if am_name == 'fastspeech2': + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, spk_id) + else: + mel = am_inference(part_phone_ids) + elif am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + mel = am_inference(part_phone_ids, part_tone_ids) + # vocoder + wav = voc_inference(mel) + if flags == 0: + wav_all = wav + flags = 1 else: - mel = am_inference(phone_ids) - elif am_name == 'speedyspeech': - mel = am_inference(phone_ids, tone_ids) - # vocoder - wav = voc_inference(mel) + wav_all = paddle.concat([wav_all, wav]) sf.write( str(output_dir / (utt_id + ".wav")), - wav.numpy(), + wav_all.numpy(), samplerate=am_config.fs) print(f"{utt_id} done!") diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/paddlespeech/t2s/exps/transformer_tts/synthesize.py index 666c3b723..7b6b1873f 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize.py @@ -118,7 +118,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git 
a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py index ba197f43c..0cd7d224e 100644 --- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py @@ -137,7 +137,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 163339f4a..8695c06a9 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -165,7 +165,6 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - parser.add_argument("--verbose", type=int, default=1, help="verbose.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index fbc8fd388..254138713 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -13,7 +13,9 @@ # limitations under the License. from abc import ABC from abc import abstractmethod +from typing import List +import numpy as np import paddle from g2p_en import G2p from g2pM import G2pM @@ -21,6 +23,7 @@ from g2pM import G2pM from paddlespeech.t2s.frontend.normalizer.normalizer import normalize from paddlespeech.t2s.frontend.punctuation import get_punctuations from paddlespeech.t2s.frontend.vocab import Vocab +from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer # discard opencc untill we find an easy solution to install it on windows # from opencc import OpenCC @@ -53,6 +56,7 @@ class English(Phonetics): self.vocab = Vocab(self.phonemes + self.punctuations) self.vocab_phones = {} self.punc = ":,;。?!“”‘’':,;.?!" 
+ self.text_normalizer = TextNormalizer() if phone_vocab_path: with open(phone_vocab_path, 'rt') as f: phn_id = [line.strip().split() for line in f.readlines()] @@ -78,19 +82,42 @@ class English(Phonetics): phonemes = [item for item in phonemes if item in self.vocab.stoi] return phonemes - def get_input_ids(self, sentence: str) -> paddle.Tensor: - result = {} - phones = self.phoneticize(sentence) - # remove start_symbol and end_symbol - phones = phones[1:-1] - phones = [phn for phn in phones if not phn.isspace()] - phones = [ + def _p2id(self, phonemes: List[str]) -> np.array: + # replace unk phone with sp + phonemes = [ phn if (phn in self.vocab_phones and phn not in self.punc) else "sp" - for phn in phones + for phn in phonemes ] - phone_ids = [self.vocab_phones[phn] for phn in phones] - phone_ids = paddle.to_tensor(phone_ids) - result["phone_ids"] = phone_ids + phone_ids = [self.vocab_phones[item] for item in phonemes] + return np.array(phone_ids, np.int64) + + def get_input_ids(self, sentence: str, + merge_sentences: bool=False) -> paddle.Tensor: + result = {} + sentences = self.text_normalizer._split(sentence, lang="en") + phones_list = [] + temp_phone_ids = [] + for sentence in sentences: + phones = self.phoneticize(sentence) + # remove start_symbol and end_symbol + phones = phones[1:-1] + phones = [phn for phn in phones if not phn.isspace()] + phones_list.append(phones) + + if merge_sentences: + merge_list = sum(phones_list, []) + # rm the last 'sp' to avoid the noise at the end + # cause in the training data, no 'sp' in the end + if merge_list[-1] == 'sp': + merge_list = merge_list[:-1] + phones_list = [] + phones_list.append(merge_list) + + for part_phones_list in phones_list: + phone_ids = self._p2id(part_phones_list) + phone_ids = paddle.to_tensor(phone_ids) + temp_phone_ids.append(phone_ids) + result["phone_ids"] = temp_phone_ids return result def numericalize(self, phonemes): diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 6ba567bb9..5264e0687 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -65,6 +65,7 @@ class ToneSandhi(): self.must_not_neural_tone_words = { "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子" } + self.punc = ":,;。?!“”‘’':,;.?!" # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041 # e.g. @@ -147,7 +148,9 @@ class ToneSandhi(): finals[i] = finals[i][:-1] + "2" # "一" before non-tone4 should be yi4, e.g. 一天 else: - finals[i] = finals[i][:-1] + "4" + # "一" 后面如果是标点,还读一声 + if word[i + 1] not in self.punc: + finals[i] = finals[i][:-1] + "4" return finals def _split_word(self, word: str) -> List[str]: diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index 8eb55ff25..a905c412d 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -105,6 +105,8 @@ class Frontend(): phones_list = [] for seg in segments: phones = [] + # Replace all English words in the sentence + seg = re.sub('[a-zA-Z]+', '', seg) seg_cut = psg.lcut(seg) initials = [] finals = [] diff --git a/paddlespeech/t2s/frontend/zh_normalization/char_convert.py b/paddlespeech/t2s/frontend/zh_normalization/char_convert.py index 22462a0c5..dcf95d728 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/char_convert.py +++ b/paddlespeech/t2s/frontend/zh_normalization/char_convert.py @@ -14,7 +14,7 @@ # limitations under the License. 
"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters. """ -simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮
疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀㝉冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗
潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆
鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' +simplified_charcters = '制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁移稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊
脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢尥尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎毑蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼
湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓鎸鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑
鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤' traditional_characters = '制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨傲倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅
趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢塕鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉
浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘
駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤' diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index c68caeeb7..9794a7007 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -51,9 +51,9 @@ from .quantifier import replace_temperature class TextNormalizer(): def __init__(self): - self.SENTENCE_SPLITOR = re.compile(r'([:,;。?!,;?!][”’]?)') + self.SENTENCE_SPLITOR = re.compile(r'([:、,;。?!,;?!][”’]?)') - def _split(self, text: str) -> List[str]: + def _split(self, text: str, lang="zh") -> List[str]: """Split long text into sentences with sentence-splitting punctuations. Parameters ---------- @@ -65,7 +65,8 @@ class TextNormalizer(): Sentences. """ # Only for pure Chinese here - text = text.replace(" ", "") + if lang == "zh": + text = text.replace(" ", "") text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index cdec03abc..405ad957d 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -907,7 +907,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): energy: Union[paddle.Tensor, np.ndarray]=None, energy_scale: Union[int, float]=None, energy_bias: Union[int, float]=None, - robot: bool=False): + robot: bool=False, + spk_emb=None, + spk_id=None): """ Parameters ---------- @@ -939,7 +941,12 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): Output sequence of features (L, odim). 
""" normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference( - text, durations=None, pitch=None, energy=None) + text, + durations=None, + pitch=None, + energy=None, + spk_emb=spk_emb, + spk_id=spk_id) # priority: groundtruth > scale/bias > previous output # set durations if isinstance(durations, np.ndarray): @@ -991,7 +998,9 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): durations=durations, pitch=pitch, energy=energy, - use_teacher_forcing=True) + use_teacher_forcing=True, + spk_emb=spk_emb, + spk_id=spk_id) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py index 0854c0a98..bd451e1fd 100644 --- a/paddlespeech/t2s/models/melgan/style_melgan.py +++ b/paddlespeech/t2s/models/melgan/style_melgan.py @@ -188,7 +188,8 @@ class StyleMelGANGenerator(nn.Layer): try: if layer: nn.utils.remove_weight_norm(layer) - except ValueError: + # add AttributeError to bypass https://github.com/PaddlePaddle/Paddle/issues/38532 temporarily + except (ValueError, AttributeError): pass self.apply(_remove_weight_norm) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index ece5c279f..cc9e20662 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle from paddle import nn @@ -23,18 +22,18 @@ def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: encodings: (B, T, C) durations: (B, T) """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) + batch_size, t_enc = paddle.shape(durations) + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) + M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in range(t_enc): d = durations[i, j] - M[i, k:k + d, j] = 1 + # If the d == 0, slice action is meaningless and not supported + if d >= 1: + M[0, k:k + d, j] = 1 k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) encodings = paddle.matmul(M, encodings) return encodings @@ -95,8 +94,13 @@ class TextEmbedding(nn.Layer): class SpeedySpeechEncoder(nn.Layer): - def __init__(self, vocab_size, tone_size, hidden_size, kernel_size, - dilations): + def __init__(self, + vocab_size, + tone_size, + hidden_size, + kernel_size, + dilations, + spk_num=None): super().__init__() self.embedding = TextEmbedding( vocab_size, @@ -104,6 +108,15 @@ class SpeedySpeechEncoder(nn.Layer): tone_size, padding_idx=0, tone_padding_idx=0) + + if spk_num: + self.spk_emb = nn.Embedding( + num_embeddings=spk_num, + embedding_dim=hidden_size, + padding_idx=0) + else: + self.spk_emb = None + self.prenet = nn.Sequential( nn.Linear(hidden_size, hidden_size), nn.ReLU(), ) @@ -118,8 +131,10 @@ class SpeedySpeechEncoder(nn.Layer): nn.BatchNorm1D(hidden_size, data_format="NLC"), nn.Linear(hidden_size, hidden_size), ) - def forward(self, text, tones): + def forward(self, text, tones, spk_id=None): embedding = self.embedding(text, tones) + if self.spk_emb: + embedding += self.spk_emb(spk_id).unsqueeze(1) embedding = self.prenet(embedding) x = self.res_blocks(embedding) x = embedding 
+ self.postnet1(x) @@ -160,22 +175,22 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__( - self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, ): + def __init__(self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None): super().__init__() encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, - encoder_dilations) + encoder_dilations, spk_num) duration_predictor = DurationPredictor(duration_predictor_hidden_size) decoder = SpeedySpeechDecoder(decoder_hidden_size, decoder_output_size, decoder_kernel_size, decoder_dilations) @@ -184,13 +199,15 @@ class SpeedySpeech(nn.Layer): self.duration_predictor = duration_predictor self.decoder = decoder - def forward(self, text, tones, durations): + def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): # input of embedding must be int64 text = paddle.cast(text, 'int64') tones = paddle.cast(tones, 'int64') + if spk_id is not None: + spk_id = paddle.cast(spk_id, 'int64') durations = paddle.cast(durations, 'int64') - encodings = self.encoder(text, tones) - # (B, T) + encodings = self.encoder(text, tones, spk_id) + pred_durations = self.duration_predictor(encodings.detach()) # expand encodings @@ -204,7 +221,7 @@ class SpeedySpeech(nn.Layer): decoded = self.decoder(encodings) return decoded, pred_durations - def inference(self, text, tones=None): + def inference(self, text, tones=None, durations=None, spk_id=None): # text: [T] # tones: [T] # input of embedding must be int64 @@ -214,25 +231,16 @@ class SpeedySpeech(nn.Layer): tones = paddle.cast(tones, 'int64') tones = tones.unsqueeze(0) - encodings = self.encoder(text, tones) - pred_durations = self.duration_predictor(encodings) # (1, T) - durations_to_expand = paddle.round(pred_durations.exp()) - durations_to_expand = (durations_to_expand).astype(paddle.int64) + encodings = self.encoder(text, tones, spk_id) - slens = paddle.sum(durations_to_expand, -1) # [1] - t_dec = slens[0] # [1] - t_enc = paddle.shape(pred_durations)[-1] - M = paddle.zeros([1, t_dec, t_enc]) - - k = paddle.full([1], 0, dtype=paddle.int64) - for j in range(t_enc): - d = durations_to_expand[0, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - - encodings = paddle.matmul(M, encodings) + if durations is None: + # (1, T) + pred_durations = self.duration_predictor(encodings) + durations_to_expand = paddle.round(pred_durations.exp()) + durations_to_expand = durations_to_expand.astype(paddle.int64) + else: + durations_to_expand = durations + encodings = expand(encodings, durations_to_expand) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] @@ -247,7 +255,8 @@ class SpeedySpeechInference(nn.Layer): self.normalizer = normalizer self.acoustic_model = speedyspeech_model - def forward(self, phones, tones): - normalized_mel = self.acoustic_model.inference(phones, tones) + def forward(self, phones, tones, durations=None, spk_id=None): + normalized_mel = self.acoustic_model.inference( + phones, tones, durations=durations, spk_id=spk_id) logmel = self.normalizer.inverse(normalized_mel) return logmel diff --git 
a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py index 6f9937a51..ee45cdc85 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py @@ -50,10 +50,14 @@ class SpeedySpeechUpdater(StandardUpdater): self.msg = "Rank: {}, ".format(dist.get_rank()) losses_dict = {} + # spk_id!=None in multiple spk speedyspeech + spk_id = batch["spk_id"] if "spk_id" in batch else None + decoded, predicted_durations = self.model( text=batch["phones"], tones=batch["tones"], - durations=batch["durations"]) + durations=batch["durations"], + spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( @@ -112,10 +116,13 @@ class SpeedySpeechEvaluator(StandardEvaluator): self.msg = "Evaluate: " losses_dict = {} + spk_id = batch["spk_id"] if "spk_id" in batch else None + decoded, predicted_durations = self.model( text=batch["phones"], tones=batch["tones"], - durations=batch["durations"]) + durations=batch["durations"], + spk_id=spk_id) target_mel = batch["feats"] spec_mask = F.sequence_mask( diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 6d7adf236..6b7c6a6be 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -115,8 +115,8 @@ class DurationPredictor(nn.Layer): Returns ---------- - Tensor - Batch of predicted durations in log domain (B, Tmax). + Tensor + Batch of predicted durations in log domain (B, Tmax). """ return self._forward(xs, x_masks, False) diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index bf595b24e..f1ecfb7c1 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -70,8 +70,8 @@ class LengthRegulator(nn.Layer): ---------- xs : Tensor Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : LongTensor - Batch of durations of each frame (B, T). + ds : Tensor(int64) + Batch of durations of each frame (B, T). alpha : float, optional Alpha value to control speed of speech. diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py index 19b07639f..1ca4e6d81 100644 --- a/paddlespeech/t2s/modules/tade_res_block.py +++ b/paddlespeech/t2s/modules/tade_res_block.py @@ -33,7 +33,11 @@ class TADELayer(nn.Layer): """Initilize TADE layer.""" super().__init__() self.norm = nn.InstanceNorm1D( - in_channels, momentum=0.1, data_format="NCL") + in_channels, + momentum=0.1, + data_format="NCL", + weight_attr=False, + bias_attr=False) self.aux_conv = nn.Sequential( nn.Conv1D( aux_channels, diff --git a/paddlespeech/text/training/__init__.py b/paddlespeech/text/exps/__init__.py similarity index 89% rename from paddlespeech/text/training/__init__.py rename to paddlespeech/text/exps/__init__.py index 185a92b8d..abf198b97 100644 --- a/paddlespeech/text/training/__init__.py +++ b/paddlespeech/text/exps/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/paddlespeech/text/utils/__init__.py b/paddlespeech/text/exps/ernie_linear/__init__.py similarity index 89% rename from paddlespeech/text/utils/__init__.py rename to paddlespeech/text/exps/ernie_linear/__init__.py index 185a92b8d..abf198b97 100644 --- a/paddlespeech/text/utils/__init__.py +++ b/paddlespeech/text/exps/ernie_linear/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/text/exps/ernie_linear/punc_restore.py b/paddlespeech/text/exps/ernie_linear/punc_restore.py new file mode 100644 index 000000000..2cb4d0719 --- /dev/null +++ b/paddlespeech/text/exps/ernie_linear/punc_restore.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import re + +import paddle +import yaml +from paddlenlp.transformers import ErnieTokenizer +from yacs.config import CfgNode + +from paddlespeech.text.models.ernie_linear import ErnieLinear + +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} + +tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + + +def _clean_text(text, punc_list): + text = text.lower() + text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text) + text = re.sub(f'[{"".join([p for p in punc_list][1:])}]', '', text) + return text + + +def preprocess(text, punc_list): + clean_text = _clean_text(text, punc_list) + assert len(clean_text) > 0, f'Invalid input string: {text}' + tokenized_input = tokenizer( + list(clean_text), return_length=True, is_split_into_words=True) + _inputs = dict() + _inputs['input_ids'] = tokenized_input['input_ids'] + _inputs['seg_ids'] = tokenized_input['token_type_ids'] + _inputs['seq_len'] = tokenized_input['seq_len'] + return _inputs + + +def test(args): + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + punc_list = [] + with open(config["data_params"]["punc_path"], 'r') as f: + for line in f: + punc_list.append(line.strip()) + + model = DefinedClassifier[config["model_type"]](**config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + model.eval() + _inputs = preprocess(args.text, punc_list) + seq_len = _inputs['seq_len'] + input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0) + seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0) + logits, _ = model(input_ids, seg_ids) + preds = paddle.argmax(logits, axis=-1).squeeze(0) + tokens = tokenizer.convert_ids_to_tokens( + _inputs['input_ids'][1:seq_len - 1]) + labels = preds[1:seq_len - 1].tolist() + assert len(tokens) == len(labels) + # add 0 for non punc + punc_list = [0] + punc_list + text = '' + for t, l in zip(tokens, labels): + text 
+= t + if l != 0: # Non punc. + text += punc_list[l] + print("Punctuation Restoration Result:", text) + return text + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Run Punctuation Restoration.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--text", type=str, help="raw text to be restored.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + + args = parser.parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + test(args) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/text/exps/ernie_linear/test.py b/paddlespeech/text/exps/ernie_linear/test.py index 3cd507fbb..4302a1a3b 100644 --- a/paddlespeech/text/exps/ernie_linear/test.py +++ b/paddlespeech/text/exps/ernie_linear/test.py @@ -11,36 +11,110 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Evaluation for model.""" +import argparse + +import numpy as np +import paddle +import pandas as pd import yaml +from paddle import nn +from paddle.io import DataLoader +from sklearn.metrics import classification_report +from sklearn.metrics import precision_recall_fscore_support +from yacs.config import CfgNode -from paddlespeech.s2t.utils.utility import print_arguments -from paddlespeech.text.training.trainer import Tester -from paddlespeech.text.utils.default_parser import default_argument_parser +from paddlespeech.text.models.ernie_linear import ErnieLinear +from paddlespeech.text.models.ernie_linear import PuncDataset +from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} -def main_sp(config, args): - exp = Tester(config, args) - exp.setup() - exp.run_test() +DefinedLoss = { + "ce": nn.CrossEntropyLoss, +} +DefinedDataset = { + 'Punc': PuncDataset, + 'Ernie': PuncDatasetFromErnieTokenizer, +} -def main(config, args): - main_sp(config, args) +def evaluation(y_pred, y_test): + precision, recall, f1, _ = precision_recall_fscore_support( + y_test, y_pred, average=None, labels=[1, 2, 3]) + overall = precision_recall_fscore_support( + y_test, y_pred, average='macro', labels=[1, 2, 3]) + result = pd.DataFrame( + np.array([precision, recall, f1]), + columns=list(['O', 'COMMA', 'PERIOD', 'QUESTION'])[1:], + index=['Precision', 'Recall', 'F1']) + result['OVERALL'] = overall[:3] + return result + + +def test(args): + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(config) + + test_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["test_path"], **config["data_params"]) + test_loader = DataLoader( + test_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + model = DefinedClassifier[config["model_type"]](**config["model"]) + state_dict = paddle.load(args.checkpoint) + model.set_state_dict(state_dict["main_params"]) + model.eval() + + punc_list = [] + for i in range(len(test_loader.dataset.id2punc)): + punc_list.append(test_loader.dataset.id2punc[i]) + + test_total_label = [] + test_total_predict = [] + + for i, batch in 
enumerate(test_loader): + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = model(input) + pred = paddle.argmax(logit, axis=1) + test_total_label.extend(label.numpy().tolist()) + test_total_predict.extend(pred.numpy().tolist()) + t = classification_report( + test_total_label, test_total_predict, target_names=punc_list) + print(t) + t2 = evaluation(test_total_label, test_total_predict) + print('=========================================================') + print(t2) + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Test a ErnieLinear model.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") -if __name__ == "__main__": - parser = default_argument_parser() args = parser.parse_args() - print_arguments(args, globals()) - # https://yaml.org/type/float.html - with open(args.config, "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + test(args) - print(config) - if args.dump_config: - with open(args.dump_config, 'w') as f: - print(config, file=f) - main(config, args) +if __name__ == "__main__": + main() diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py index 090714381..0d730d666 100644 --- a/paddlespeech/text/exps/ernie_linear/train.py +++ b/paddlespeech/text/exps/ernie_linear/train.py @@ -11,40 +11,163 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Trainer for punctuation_restoration task.""" +import argparse +import logging +import os +import shutil +from pathlib import Path + +import paddle import yaml +from paddle import DataParallel from paddle import distributed as dist +from paddle import nn +from paddle.io import DataLoader +from paddle.optimizer import Adam +from paddle.optimizer.lr import ExponentialDecay +from yacs.config import CfgNode -from paddlespeech.s2t.utils.utility import print_arguments -from paddlespeech.text.training.trainer import Trainer -from paddlespeech.text.utils.default_parser import default_argument_parser +from paddlespeech.t2s.training.extensions.snapshot import Snapshot +from paddlespeech.t2s.training.extensions.visualizer import VisualDL +from paddlespeech.t2s.training.seeding import seed_everything +from paddlespeech.t2s.training.trainer import Trainer +from paddlespeech.text.models.ernie_linear import ErnieLinear +from paddlespeech.text.models.ernie_linear import ErnieLinearEvaluator +from paddlespeech.text.models.ernie_linear import ErnieLinearUpdater +from paddlespeech.text.models.ernie_linear import PuncDataset +from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer +DefinedClassifier = { + 'ErnieLinear': ErnieLinear, +} -def main_sp(config, args): - exp = Trainer(config, args) - exp.setup() - exp.run() +DefinedLoss = { + "ce": nn.CrossEntropyLoss, +} +DefinedDataset = { + 'Punc': PuncDataset, + 'Ernie': PuncDatasetFromErnieTokenizer, +} -def main(config, args): - if args.ngpu > 1: - dist.spawn(main_sp, args=(config, args), nprocs=args.ngpu) + +def train_sp(args, config): + # decides device type and whether to run in parallel + # setup running environment correctly + if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: + paddle.set_device("cpu") else: - main_sp(config, args) + paddle.set_device("gpu") + world_size = paddle.distributed.get_world_size() + if world_size > 1: + paddle.distributed.init_parallel_env() + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + print( + f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}", + ) + # dataloader has been too verbose + logging.getLogger("DataLoader").disabled = True + train_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["train_path"], **config["data_params"]) + dev_dataset = DefinedDataset[config["dataset_type"]]( + train_path=config["dev_path"], **config["data_params"]) + train_dataloader = DataLoader( + train_dataset, + shuffle=True, + num_workers=config.num_workers, + batch_size=config.batch_size) + + dev_dataloader = DataLoader( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False, + num_workers=config.num_workers) + + print("dataloaders done!") + + model = DefinedClassifier[config["model_type"]](**config["model"]) + + if world_size > 1: + model = DataParallel(model) + print("model done!") + + criterion = DefinedLoss[config["loss_type"]]( + **config["loss"]) if "loss_type" in config else DefinedLoss["ce"]() + + print("criterions done!") + + lr_schedule = ExponentialDecay(**config["scheduler_params"]) + optimizer = Adam( + learning_rate=lr_schedule, + parameters=model.parameters(), + weight_decay=paddle.regularizer.L2Decay( + config["optimizer_params"]["weight_decay"])) + + print("optimizer done!") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + if dist.get_rank() == 0: + config_name = args.config.split("/")[-1] + # copy conf to output_dir + 
shutil.copyfile(args.config, output_dir / config_name) + + updater = ErnieLinearUpdater( + model=model, + criterion=criterion, + scheduler=lr_schedule, + optimizer=optimizer, + dataloader=train_dataloader, + output_dir=output_dir) + + trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) + + evaluator = ErnieLinearEvaluator( + model=model, + criterion=criterion, + dataloader=dev_dataloader, + output_dir=output_dir) + + if dist.get_rank() == 0: + trainer.extend(evaluator, trigger=(1, "epoch")) + trainer.extend(VisualDL(output_dir), trigger=(1, "iteration")) + trainer.extend( + Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch')) + # print(trainer.extensions) + trainer.run() + + +def main(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser(description="Train a ErnieLinear model.") + parser.add_argument("--config", type=str, help="ErnieLinear config file.") + parser.add_argument("--output-dir", type=str, help="output dir.") + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") -if __name__ == "__main__": - parser = default_argument_parser() args = parser.parse_args() - print_arguments(args, globals()) - # https://yaml.org/type/float.html - with open(args.config, "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) + with open(args.config) as f: + config = CfgNode(yaml.safe_load(f)) + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") print(config) - if args.dump_config: - with open(args.dump_config, 'w') as f: - print(config, file=f) + print( + f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}" + ) + + # dispatch + if args.ngpu > 1: + dist.spawn(train_sp, (args, config), nprocs=args.ngpu) + else: + train_sp(args, config) + - main(config, args) +if __name__ == "__main__": + main() diff --git a/paddlespeech/text/models/ernie_linear/__init__.py b/paddlespeech/text/models/ernie_linear/__init__.py index 93453ce74..0a10a6eb2 100644 --- a/paddlespeech/text/models/ernie_linear/__init__.py +++ b/paddlespeech/text/models/ernie_linear/__init__.py @@ -11,4 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from .model import ErnieLinear +from .dataset import * +from .ernie_linear import * +from .ernie_linear_updater import * diff --git a/paddlespeech/text/models/ernie_linear/dataset.py b/paddlespeech/text/models/ernie_linear/dataset.py index 086e91bb8..64c8d0bdf 100644 --- a/paddlespeech/text/models/ernie_linear/dataset.py +++ b/paddlespeech/text/models/ernie_linear/dataset.py @@ -99,10 +99,8 @@ class PuncDatasetFromErnieTokenizer(Dataset): self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token) self.paddingID = self.tokenizer.pad_token_id self.seq_len = seq_len - self.punc2id = self.load_vocab(punc_path, extra_word_list=[" "]) self.id2punc = {k: v for (v, k) in self.punc2id.items()} - tmp_seqs = open(train_path, encoding='utf-8').readlines() self.txt_seqs = [i for seq in tmp_seqs for i in seq.split()] self.preprocess(self.txt_seqs) @@ -125,6 +123,7 @@ class PuncDatasetFromErnieTokenizer(Dataset): input_data = [] label = [] count = 0 + print("Preprocessing in PuncDatasetFromErnieTokenizer...") for i in range(len(txt_seqs) - 1): word = txt_seqs[i] punc = txt_seqs[i + 1] diff --git a/paddlespeech/text/models/ernie_linear/model.py b/paddlespeech/text/models/ernie_linear/ernie_linear.py similarity index 100% rename from paddlespeech/text/models/ernie_linear/model.py rename to paddlespeech/text/models/ernie_linear/ernie_linear.py diff --git a/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py b/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py new file mode 100644 index 000000000..8b3d7410e --- /dev/null +++ b/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py @@ -0,0 +1,123 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging + +import paddle +from paddle import distributed as dist +from paddle.io import DataLoader +from paddle.nn import Layer +from paddle.optimizer import Optimizer +from paddle.optimizer.lr import LRScheduler +from sklearn.metrics import f1_score + +from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator +from paddlespeech.t2s.training.reporter import report +from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater +logging.basicConfig( + format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', + datefmt='[%Y-%m-%d %H:%M:%S]') +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +class ErnieLinearUpdater(StandardUpdater): + def __init__(self, + model: Layer, + criterion: Layer, + scheduler: LRScheduler, + optimizer: Optimizer, + dataloader: DataLoader, + output_dir=None): + super().__init__(model, optimizer, dataloader, init_state=None) + self.model = model + self.dataloader = dataloader + + self.criterion = criterion + self.scheduler = scheduler + self.optimizer = optimizer + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def update_core(self, batch): + self.msg = "Rank: {}, ".format(dist.get_rank()) + losses_dict = {} + + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = self.model(input) + pred = paddle.argmax(logit, axis=1) + + loss = self.criterion(y, label) + + self.optimizer.clear_grad() + loss.backward() + + self.optimizer.step() + self.scheduler.step() + + F1_score = f1_score( + label.numpy().tolist(), pred.numpy().tolist(), average="macro") + + report("train/loss", float(loss)) + losses_dict["loss"] = float(loss) + report("train/F1_score", float(F1_score)) + losses_dict["F1_score"] = float(F1_score) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + + +class ErnieLinearEvaluator(StandardEvaluator): + def __init__(self, + model: Layer, + criterion: Layer, + dataloader: DataLoader, + output_dir=None): + super().__init__(model, dataloader) + self.model = model + self.criterion = criterion + self.dataloader = dataloader + + log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) + self.filehandler = logging.FileHandler(str(log_file)) + logger.addHandler(self.filehandler) + self.logger = logger + self.msg = "" + + def evaluate_core(self, batch): + self.msg = "Evaluate: " + losses_dict = {} + + input, label = batch + label = paddle.reshape(label, shape=[-1]) + y, logit = self.model(input) + pred = paddle.argmax(logit, axis=1) + + loss = self.criterion(y, label) + + F1_score = f1_score( + label.numpy().tolist(), pred.numpy().tolist(), average="macro") + + report("eval/loss", float(loss)) + losses_dict["loss"] = float(loss) + report("eval/F1_score", float(F1_score)) + losses_dict["F1_score"] = float(F1_score) + + self.msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in losses_dict.items()) + self.logger.info(self.msg) diff --git a/paddlespeech/text/training/trainer.py b/paddlespeech/text/training/trainer.py deleted file mode 100644 index b5e6a563c..000000000 --- a/paddlespeech/text/training/trainer.py +++ /dev/null @@ -1,524 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import time -from collections import defaultdict -from pathlib import Path - -import numpy as np -import paddle -import paddle.nn as nn -import pandas as pd -from paddle import distributed as dist -from paddle.io import DataLoader -from sklearn.metrics import classification_report -from sklearn.metrics import f1_score -from sklearn.metrics import precision_recall_fscore_support - -from ...s2t.utils import layer_tools -from ...s2t.utils import mp_tools -from ...s2t.utils.checkpoint import Checkpoint -from ...text.models import ErnieLinear -from ...text.models.ernie_linear.dataset import PuncDataset -from ...text.models.ernie_linear.dataset import PuncDatasetFromErnieTokenizer - -__all__ = ["Trainer", "Tester"] - -DefinedClassifier = { - 'ErnieLinear': ErnieLinear, -} - -DefinedLoss = { - "ce": nn.CrossEntropyLoss, -} - -DefinedDataset = { - 'Punc': PuncDataset, - 'Ernie': PuncDatasetFromErnieTokenizer, -} - - -class Trainer(): - def __init__(self, config, args): - self.config = config - self.args = args - self.optimizer = None - self.output_dir = None - self.log_dir = None - self.checkpoint_dir = None - self.iteration = 0 - self.epoch = 0 - - def setup(self): - """Setup the experiment. - """ - self.setup_log_dir() - self.setup_logger() - if self.args.ngpu > 0: - paddle.set_device('gpu') - else: - paddle.set_device('cpu') - if self.parallel: - self.init_parallel() - - self.setup_output_dir() - self.dump_config() - self.setup_checkpointer() - - self.setup_model() - - self.setup_dataloader() - - self.iteration = 0 - self.epoch = 1 - - @property - def parallel(self): - """A flag indicating whether the experiment should run with - multiprocessing. - """ - return self.args.ngpu > 1 - - def init_parallel(self): - """Init environment for multiprocess training. - """ - dist.init_parallel_env() - - @mp_tools.rank_zero_only - def save(self, tag=None, infos: dict=None): - """Save checkpoint (model parameters and optimizer states). - - Args: - tag (int or str, optional): None for step, else using tag, e.g epoch. Defaults to None. - infos (dict, optional): meta data to save. Defaults to None. - """ - - infos = infos if infos else dict() - infos.update({ - "step": self.iteration, - "epoch": self.epoch, - "lr": self.optimizer.get_lr() - }) - self.checkpointer.save_parameters(self.checkpoint_dir, self.iteration - if tag is None else tag, self.model, - self.optimizer, infos) - - def resume_or_scratch(self): - """Resume from latest checkpoint at checkpoints in the output - directory or load a specified checkpoint. - - If ``args.checkpoint_path`` is not None, load the checkpoint, else - resume training. - """ - scratch = None - infos = self.checkpointer.load_parameters( - self.model, - self.optimizer, - checkpoint_dir=self.checkpoint_dir, - checkpoint_path=self.args.checkpoint_path) - if infos: - # restore from ckpt - self.iteration = infos["step"] - self.epoch = infos["epoch"] - scratch = False - else: - self.iteration = 0 - self.epoch = 0 - scratch = True - - return scratch - - def new_epoch(self): - """Reset the train loader seed and increment `epoch`. 
- """ - self.epoch += 1 - if self.parallel: - self.train_loader.batch_sampler.set_epoch(self.epoch) - - def train(self): - """The training process control by epoch.""" - from_scratch = self.resume_or_scratch() - - if from_scratch: - # save init model, i.e. 0 epoch - self.save(tag="init") - - self.lr_scheduler.step(self.iteration) - if self.parallel: - self.train_loader.batch_sampler.set_epoch(self.epoch) - - self.logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") - self.punc_list = [] - for i in range(len(self.train_loader.dataset.id2punc)): - self.punc_list.append(self.train_loader.dataset.id2punc[i]) - while self.epoch < self.config["training"]["n_epoch"]: - self.model.train() - self.total_label_train = [] - self.total_predict_train = [] - try: - data_start_time = time.time() - for batch_index, batch in enumerate(self.train_loader): - dataload_time = time.time() - data_start_time - msg = "Train: Rank: {}, ".format(dist.get_rank()) - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(batch_index + 1, - len(self.train_loader)) - msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) - msg += "data time: {:>.3f}s, ".format(dataload_time) - self.train_batch(batch_index, batch, msg) - data_start_time = time.time() - # t = classification_report( - # self.total_label_train, - # self.total_predict_train, - # target_names=self.punc_list) - # self.logger.info(t) - except Exception as e: - self.logger.error(e) - raise e - - total_loss, F1_score = self.valid() - self.logger.info("Epoch {} Val info val_loss {}, F1_score {}". - format(self.epoch, total_loss, F1_score)) - - self.save( - tag=self.epoch, infos={"val_loss": total_loss, - "F1": F1_score}) - # step lr every epoch - self.lr_scheduler.step() - self.new_epoch() - - def run(self): - """The routine of the experiment after setup. This method is intended - to be used by the user. - """ - try: - self.train() - except KeyboardInterrupt: - self.logger.info("Training was aborted by keybord interrupt.") - self.save() - exit(-1) - finally: - self.destory() - self.logger.info("Training Done.") - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - output_dir = Path(self.args.output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir - - def setup_log_dir(self): - """Create a directory used for logging. - """ - # log dir - log_dir = Path(self.args.log_dir).expanduser() - log_dir.mkdir(parents=True, exist_ok=True) - - self.log_dir = log_dir - - def setup_checkpointer(self): - """Create a directory used to save checkpoints into. - - It is "checkpoints" inside the output directory. 
- """ - # checkpoint dir - self.checkpointer = Checkpoint(self.config["checkpoint"]["kbest_n"], - self.config["checkpoint"]["latest_n"]) - - checkpoint_dir = self.output_dir / "checkpoints" - checkpoint_dir.mkdir(exist_ok=True) - - self.checkpoint_dir = checkpoint_dir - - def setup_logger(self): - LOG_FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s" - format_str = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - logging.basicConfig( - filename=self.config["training"]["log_path"], - level=logging.INFO, - format=LOG_FORMAT) - self.logger = logging.getLogger(__name__) - - self.logger.setLevel(logging.INFO) - sh = logging.StreamHandler() - sh.setFormatter(format_str) - self.logger.addHandler(sh) - - self.logger.info('info') - - @mp_tools.rank_zero_only - def destory(self): - pass - - @mp_tools.rank_zero_only - def dump_config(self): - """Save the configuration used for this experiment. - - It is saved in to ``config.yaml`` in the output directory at the - beginning of the experiment. - """ - with open(self.output_dir / "config.yaml", "wt") as f: - print(self.config, file=f) - - def train_batch(self, batch_index, batch_data, msg): - start = time.time() - - input, label = batch_data - label = paddle.reshape(label, shape=[-1]) - y, logit = self.model(input) - pred = paddle.argmax(logit, axis=1) - self.total_label_train.extend(label.numpy().tolist()) - self.total_predict_train.extend(pred.numpy().tolist()) - loss = self.crit(y, label) - - loss.backward() - layer_tools.print_grads(self.model, print_func=None) - self.optimizer.step() - self.optimizer.clear_grad() - iteration_time = time.time() - start - - losses_np = { - "train_loss": float(loss), - } - msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config["data"]["batch_size"]) - msg += ", ".join("{}: {:>.6f}".format(k, v) - for k, v in losses_np.items()) - self.logger.info(msg) - self.iteration += 1 - - @paddle.no_grad() - def valid(self): - self.logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") - self.model.eval() - valid_losses = defaultdict(list) - num_seen_utts = 1 - total_loss = 0.0 - valid_total_label = [] - valid_total_predict = [] - for i, batch in enumerate(self.valid_loader): - input, label = batch - label = paddle.reshape(label, shape=[-1]) - y, logit = self.model(input) - pred = paddle.argmax(logit, axis=1) - valid_total_label.extend(label.numpy().tolist()) - valid_total_predict.extend(pred.numpy().tolist()) - loss = self.crit(y, label) - - if paddle.isfinite(loss): - num_utts = batch[1].shape[0] - num_seen_utts += num_utts - total_loss += float(loss) * num_utts - valid_losses["val_loss"].append(float(loss)) - - if (i + 1) % self.config["training"]["log_interval"] == 0: - valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} - valid_dump["val_history_loss"] = total_loss / num_seen_utts - - # logging - msg = f"Valid: Rank: {dist.get_rank()}, " - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - msg += "batch : {}/{}, ".format(i + 1, len(self.valid_loader)) - msg += ", ".join("{}: {:>.6f}".format(k, v) - for k, v in valid_dump.items()) - self.logger.info(msg) - - self.logger.info("Rank {} Val info val_loss {}".format( - dist.get_rank(), total_loss / num_seen_utts)) - F1_score = f1_score( - valid_total_label, valid_total_predict, average="macro") - return total_loss / num_seen_utts, F1_score - - def setup_model(self): - config = 
self.config - - model = DefinedClassifier[self.config["model_type"]]( - **self.config["model_params"]) - self.crit = DefinedLoss[self.config["loss_type"]](**self.config[ - "loss"]) if "loss_type" in self.config else DefinedLoss["ce"]() - - if self.parallel: - model = paddle.DataParallel(model) - - # self.logger.info(f"{model}") - # layer_tools.print_params(model, self.logger.info) - - lr_scheduler = paddle.optimizer.lr.ExponentialDecay( - learning_rate=config["training"]["lr"], - gamma=config["training"]["lr_decay"], - verbose=True) - optimizer = paddle.optimizer.Adam( - learning_rate=lr_scheduler, - parameters=model.parameters(), - weight_decay=paddle.regularizer.L2Decay( - config["training"]["weight_decay"])) - - self.model = model - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - self.logger.info("Setup model/criterion/optimizer/lr_scheduler!") - - def setup_dataloader(self): - config = self.config["data"].copy() - train_dataset = DefinedDataset[config["dataset_type"]]( - train_path=config["train_path"], **config["data_params"]) - dev_dataset = DefinedDataset[config["dataset_type"]]( - train_path=config["dev_path"], **config["data_params"]) - - self.train_loader = DataLoader( - train_dataset, - num_workers=config["num_workers"], - batch_size=config["batch_size"]) - self.valid_loader = DataLoader( - dev_dataset, - batch_size=config["batch_size"], - shuffle=False, - drop_last=False, - num_workers=config["num_workers"]) - self.logger.info("Setup train/valid Dataloader!") - - -class Tester(Trainer): - def __init__(self, config, args): - super().__init__(config, args) - - @mp_tools.rank_zero_only - @paddle.no_grad() - def test(self): - self.logger.info( - f"Test Total Examples: {len(self.test_loader.dataset)}") - self.punc_list = [] - for i in range(len(self.test_loader.dataset.id2punc)): - self.punc_list.append(self.test_loader.dataset.id2punc[i]) - self.model.eval() - test_total_label = [] - test_total_predict = [] - with open(self.args.result_file, 'w') as fout: - for i, batch in enumerate(self.test_loader): - input, label = batch - label = paddle.reshape(label, shape=[-1]) - y, logit = self.model(input) - pred = paddle.argmax(logit, axis=1) - test_total_label.extend(label.numpy().tolist()) - test_total_predict.extend(pred.numpy().tolist()) - - # logging - msg = "Test: " - msg += "epoch: {}, ".format(self.epoch) - msg += "step: {}, ".format(self.iteration) - self.logger.info(msg) - t = classification_report( - test_total_label, test_total_predict, target_names=self.punc_list) - print(t) - t2 = self.evaluation(test_total_label, test_total_predict) - print(t2) - - def evaluation(self, y_pred, y_test): - precision, recall, f1, _ = precision_recall_fscore_support( - y_test, y_pred, average=None, labels=[1, 2, 3]) - overall = precision_recall_fscore_support( - y_test, y_pred, average='macro', labels=[1, 2, 3]) - result = pd.DataFrame( - np.array([precision, recall, f1]), - columns=list(['O', 'COMMA', 'PERIOD', 'QUESTION'])[1:], - index=['Precision', 'Recall', 'F1']) - result['OVERALL'] = overall[:3] - return result - - def run_test(self): - self.resume_or_scratch() - try: - self.test() - except KeyboardInterrupt: - self.logger.info("Testing was aborted by keybord interrupt.") - exit(-1) - - def setup(self): - """Setup the experiment. 
- """ - if self.args.ngpu > 0: - paddle.set_device('gpu') - else: - paddle.set_device('cpu') - self.setup_logger() - self.setup_output_dir() - self.setup_checkpointer() - - self.setup_dataloader() - self.setup_model() - - self.iteration = 0 - self.epoch = 0 - - def setup_model(self): - config = self.config - model = DefinedClassifier[self.config["model_type"]]( - **self.config["model_params"]) - - self.model = model - self.logger.info("Setup model!") - - def setup_dataloader(self): - config = self.config["data"].copy() - - test_dataset = DefinedDataset[config["dataset_type"]]( - train_path=config["test_path"], **config["data_params"]) - - self.test_loader = DataLoader( - test_dataset, - batch_size=config["batch_size"], - shuffle=False, - drop_last=False) - self.logger.info("Setup test Dataloader!") - - def setup_output_dir(self): - """Create a directory used for output. - """ - # output dir - if self.args.output_dir: - output_dir = Path(self.args.output_dir).expanduser() - output_dir.mkdir(parents=True, exist_ok=True) - else: - output_dir = Path( - self.args.checkpoint_path).expanduser().parent.parent - output_dir.mkdir(parents=True, exist_ok=True) - - self.output_dir = output_dir - - def setup_logger(self): - LOG_FORMAT = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s" - format_str = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - logging.basicConfig( - filename=self.config["testing"]["log_path"], - level=logging.INFO, - format=LOG_FORMAT) - self.logger = logging.getLogger(__name__) - - self.logger.setLevel(logging.INFO) - sh = logging.StreamHandler() - sh.setFormatter(format_str) - self.logger.addHandler(sh) - - self.logger.info('info') diff --git a/paddlespeech/text/utils/default_parser.py b/paddlespeech/text/utils/default_parser.py deleted file mode 100644 index 469157a69..000000000 --- a/paddlespeech/text/utils/default_parser.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - - -def default_argument_parser(): - r"""A simple yet genral argument parser for experiments with t2s. - - This is used in examples with t2s. And it is intended to be used by - other experiments with t2s. It requires a minimal set of command line - arguments to start a training script. - - The ``--config`` and ``--opts`` are used for overwrite the deault - configuration. - - The ``--data`` and ``--output`` specifies the data path and output path. - Resuming training from existing progress at the output directory is the - intended default behavior. - - The ``--checkpoint_path`` specifies the checkpoint to load from. - - The ``--ngpu`` specifies how to run the training. 
- - - See Also - -------- - paddlespeech.t2s.training.experiment - Returns - ------- - argparse.ArgumentParser - the parser - """ - parser = argparse.ArgumentParser() - - # yapf: disable - # data and output - parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.") - parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.") - # parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.") - parser.add_argument("--output_dir", metavar="OUTPUT_DIR", help="path to save checkpoint.") - parser.add_argument("--log_dir", metavar="LOG_DIR", help="path to save logs.") - - # load from saved checkpoint - parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load") - - # save jit model to - parser.add_argument("--export_path", type=str, help="path of the jit model to save") - - # save asr result to - parser.add_argument("--result_file", type=str, help="path of save the asr result") - - # running - parser.add_argument("--ngpu", type=int, default=1, help="number of parallel processes to use. if ngpu=0, using cpu.") - - # overwrite extra config and default config - # parser.add_argument("--opts", nargs=argparse.REMAINDER, - # help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - parser.add_argument("--opts", type=str, default=[], nargs='+', - help="options to overwrite --config file and the default config, passing in KEY VALUE pairs") - # yapd: enable - - return parser diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py new file mode 100644 index 000000000..e493b8004 --- /dev/null +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -0,0 +1,409 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def length_to_mask(length, max_len=None, dtype=None): + assert len(length.shape) == 1 + + if max_len is None: + max_len = length.max().astype( + 'int').item() # using arange to generate mask + mask = paddle.arange( + max_len, dtype=length.dtype).expand( + (len(length), max_len)) < length.unsqueeze(1) + + if dtype is None: + dtype = length.dtype + + mask = paddle.to_tensor(mask, dtype=dtype) + return mask + + +class Conv1d(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding="same", + dilation=1, + groups=1, + bias=True, + padding_mode="reflect", ): + super().__init__() + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + + self.conv = nn.Conv1D( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + padding=0, + dilation=self.dilation, + groups=groups, + bias_attr=bias, ) + + def forward(self, x): + if self.padding == "same": + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + else: + raise ValueError("Padding must be 'same'. Got {self.padding}") + + return self.conv(x) + + def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + L_in = x.shape[-1] # Detecting input shape + padding = self._get_padding_elem(L_in, stride, kernel_size, + dilation) # Time padding + x = F.pad( + x, padding, mode=self.padding_mode, + data_format="NCL") # Applying padding + return x + + def _get_padding_elem(self, + L_in: int, + stride: int, + kernel_size: int, + dilation: int): + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + + return padding + + +class BatchNorm1d(nn.Layer): + def __init__( + self, + input_size, + eps=1e-05, + momentum=0.9, + weight_attr=None, + bias_attr=None, + data_format='NCL', + use_global_stats=None, ): + super().__init__() + + self.norm = nn.BatchNorm1D( + input_size, + epsilon=eps, + momentum=momentum, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + use_global_stats=use_global_stats, ) + + def forward(self, x): + x_n = self.norm(x) + return x_n + + +class TDNNBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation, + activation=nn.ReLU, ): + super().__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, ) + self.activation = activation() + self.norm = BatchNorm1d(input_size=out_channels) + + def forward(self, x): + return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(nn.Layer): + def __init__(self, in_channels, out_channels, scale=8, dilation=1): + super().__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + + self.blocks = nn.LayerList([ + TDNNBlock( + in_channel, hidden_channel, kernel_size=3, dilation=dilation) + for i in range(scale - 1) + ]) + self.scale = scale + + def forward(self, x): + y = [] + for i, x_i in enumerate(paddle.chunk(x, self.scale, axis=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + 
else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + y = paddle.concat(y, axis=1) + return y + + +class SEBlock(nn.Layer): + def __init__(self, in_channels, se_channels, out_channels): + super().__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1) + self.relu = paddle.nn.ReLU() + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1) + self.sigmoid = paddle.nn.Sigmoid() + + def forward(self, x, lengths=None): + L = x.shape[-1] + if lengths is not None: + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + total = mask.sum(axis=2, keepdim=True) + s = (x * mask).sum(axis=2, keepdim=True) / total + else: + s = x.mean(axis=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(nn.Layer): + def __init__(self, channels, attention_channels=128, global_context=True): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + self.tanh = nn.Tanh() + self.conv = Conv1d( + in_channels=attention_channels, + out_channels=channels, + kernel_size=1) + + def forward(self, x, lengths=None): + C, L = x.shape[1], x.shape[2] # KP: (N, C, L) + + def _compute_statistics(x, m, axis=2, eps=self.eps): + mean = (m * x).sum(axis) + std = paddle.sqrt( + (m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps)) + return mean, std + + if lengths is None: + lengths = paddle.ones([x.shape[0]]) + + # Make binary mask of shape [N, 1, L] + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. 
+ if self.global_context: + total = mask.sum(axis=2, keepdim=True).astype('float32') + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).tile((1, 1, L)) + std = std.unsqueeze(2).tile((1, 1, L)) + attn = paddle.concat([x, mean, std], axis=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = paddle.where( + mask.tile((1, C, 1)) == 0, + paddle.ones_like(attn) * float("-inf"), attn) + + attn = F.softmax(attn, axis=2) + mean, std = _compute_statistics(x, attn) + + # Append mean and std of the batch + pooled_stats = paddle.concat((mean, std), axis=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + res2net_scale=8, + se_channels=128, + kernel_size=1, + dilation=1, + activation=nn.ReLU, ): + super().__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, ) + self.res2net_block = Res2NetBlock(out_channels, out_channels, + res2net_scale, dilation) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, ) + + def forward(self, x, lengths=None): + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, lengths) + + return x + residual + + +class EcapaTdnn(nn.Layer): + def __init__( + self, + input_size, + lin_neurons=192, + activation=nn.ReLU, + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, ): + + super().__init__() + assert len(channels) == len(kernel_sizes) + assert len(channels) == len(dilations) + self.channels = channels + self.blocks = nn.LayerList() + self.emb_size = lin_neurons + + # The initial TDNN layer + self.blocks.append( + TDNNBlock( + input_size, + channels[0], + kernel_sizes[0], + dilations[0], + activation, )) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + activation=activation, )) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + activation, ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, ) + self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=self.emb_size, + kernel_size=1, ) + + def forward(self, x, lengths=None): + """ + Compute embeddings. + + Args: + x (paddle.Tensor): Input log-fbanks with shape (N, n_mels, T). + lengths (paddle.Tensor, optional): Length proportions of batch length with shape (N). Defaults to None. 
+ + Returns: + paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1) + """ + xl = [] + for layer in self.blocks: + try: + x = layer(x, lengths=lengths) + except TypeError: + x = layer(x) + xl.append(x) + + # Multi-layer feature aggregation + x = paddle.concat(xl[1:], axis=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, lengths=lengths) + x = self.asp_bn(x) + + # Final linear transformation + x = self.fc(x) + + return x diff --git a/requirements.txt b/requirements.txt index 42cb33f67..760821662 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,6 +15,7 @@ matplotlib nara_wpe nltk paddleaudio +paddlenlp paddlespeech_ctcdecoders paddlespeech_feat pandas @@ -42,5 +43,6 @@ typeguard unidecode visualdl webrtcvad -yacs +yacs~=0.1.8 yq +zhon diff --git a/setup.py b/setup.py index 73f392c6c..a6b18f979 100644 --- a/setup.py +++ b/setup.py @@ -41,9 +41,9 @@ requirements = { "loguru", "matplotlib", "nara_wpe", - "nltk", "pandas", "paddleaudio", + "paddlenlp", "paddlespeech_feat", "praatio==5.0.0", "pypinyin", @@ -60,7 +60,7 @@ requirements = { "typeguard", "visualdl", "webrtcvad", - "yacs", + "yacs~=0.1.8", ], "develop": [ "ConfigArgParse", @@ -77,6 +77,7 @@ requirements = { "unidecode", "yq", "pre-commit", + "zhon", ] } @@ -126,7 +127,7 @@ def _post_install(install_lib_dir): print("tools install.") # ctcdecoder - ctcdecoder_dir = HERE / 'paddlespeech/s2t/decoders/ctcdecoder/swig' + ctcdecoder_dir = HERE / 'third_party/ctc_decoders' with pushd(ctcdecoder_dir): check_call("bash -e setup.sh") print("ctcdecoder install.") @@ -171,7 +172,7 @@ class UploadCommand(Command): setup_info = dict( # Metadata name='paddlespeech', - version='0.1.0', + version='0.1.1', author='PaddlePaddle Speech and Language Team', author_email='paddlesl@baidu.com', url='https://github.com/PaddlePaddle/PaddleSpeech', diff --git a/setup_audio.py b/setup_audio.py index 24c9bb9b9..5f0140656 100644 --- a/setup_audio.py +++ b/setup_audio.py @@ -13,8 +13,10 @@ # limitations under the License. 
import setuptools +import paddleaudio + # set the version here -version = '0.1.0a' +version = paddleaudio.__version__ setuptools.setup( name="paddleaudio", diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh index fcd0c2359..c9d640ed2 100644 --- a/tests/benchmark/conformer/run.sh +++ b/tests/benchmark/conformer/run.sh @@ -22,6 +22,7 @@ sed -i "s/ accum_grad: 2/ accum_grad: 1/g" conf/benchmark/conformer.yaml fp_item_list=(fp32) bs_item=(16) config_path=conf/benchmark/conformer.yaml +decode_config_path=conf/tuning/decode.yaml seed=0 output=exp/conformer profiler_options=None @@ -34,13 +35,13 @@ for fp_item in ${fp_item_list[@]}; do echo "index is speed, 8gpus, run_mode is multi_process, begin, conformer" run_mode=mp ngpu=8 - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1 sleep 60 log_name=speech_${model_item}_bs${bs_item}_${fp_item} # 如:clas_MobileNetv1_mp_bs32_fp32_8 echo "index is speed, 1gpus, begin, ${log_name}" run_mode=sp ngpu=1 - CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) + CUDA_VISIBLE_DEVICES=0 bash ${CUR_DIR}/run_benchmark.sh ${run_mode} ${config_path} ${decode_config_path} ${output} ${seed} ${ngpu} ${profiler_options} ${bs_item} ${fp_item} ${model_item} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min) sleep 60 done done diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh index 5b83b15ce..16cd410e2 100644 --- a/tests/benchmark/conformer/run_benchmark.sh +++ b/tests/benchmark/conformer/run_benchmark.sh @@ -5,13 +5,14 @@ function _set_params(){ run_mode=${1:-"sp"} # 单卡sp|多卡mp config_path=${2:-"conf/conformer.yaml"} - output=${3:-"exp/conformer"} - seed=${4:-"0"} - ngpu=${5:-"1"} - profiler_options=${6:-"None"} - batch_size=${7:-"32"} - fp_item=${8:-"fp32"} - model_item=${9:-"conformer"} + decode_config_path=${3:-"conf/tuning/decode.yaml"} + output=${4:-"exp/conformer"} + seed=${5:-"0"} + ngpu=${6:-"1"} + profiler_options=${7:-"None"} + batch_size=${8:-"32"} + fp_item=${9:-"fp32"} + model_item=${10:-"conformer"} benchmark_max_step=0 run_log_path=${TRAIN_LOG_DIR:-$(pwd)} # TRAIN_LOG_DIR 后续QA设置该参数 # 添加日志解析需要的参数 @@ -35,6 +36,7 @@ function _train(){ echo "Train on ${num_gpu_devices} GPUs" echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size" train_cmd="--config=${config_path} \ + --decode_cfg=${decode_config_path} \ --output=${output} \ --seed=${seed} \ --ngpu=${ngpu} \ @@ -69,6 +71,6 @@ function _train(){ source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开 _set_params $@ -# _train # 如果只想产出训练log,不解析,可取消注释 +#_train # 如果只想产出训练log,不解析,可取消注释 _run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开 diff --git a/tests/chains/ds2/ds2_params_lite_train_infer.txt 
b/tests/chains/ds2/ds2_params_lite_train_infer.txt index b11872bd0..cad8efa3c 100644 --- a/tests/chains/ds2/ds2_params_lite_train_infer.txt +++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt @@ -21,13 +21,13 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --result_file tests/9.rsl --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --result_file tests/4.rsl --model_type offline null:null ## ===========================infer_params=========================== null:null null:null -norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/9 --export_path exp/deepspeech_tiny/checkpoints/9.jit +norm_export: ../../../paddlespeech/s2t/exps/deepspeech2/bin/export.py --ngpu 1 --config conf/deepspeech2.yaml --model_type offline --checkpoint_path exp/deepspeech_tiny/checkpoints/4 --export_path exp/deepspeech_tiny/checkpoints/4.jit quant_export:null fpgm_export:null distill_export:null diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt index 875e3ccf9..5c6195069 100644 --- a/tests/chains/ds2/ds2_params_whole_train_infer.txt +++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt @@ -21,7 +21,7 @@ null:null null:null ## ===========================eval_params=========================== -eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline +eval: ../../../paddlespeech/s2t/exps/deepspeech2/bin/test.py --ngpu 1 --config conf/deepspeech2.yaml --decode_cfg conf/tuning/decode.yaml --result_file tests/49.rsl --checkpoint_path exp/deepspeech_whole/checkpoints/49 --model_type offline null:null ## ===========================infer_params=========================== diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh index 76b22a38c..1dce1b291 100644 --- a/tests/chains/ds2/lite_train_infer.sh +++ b/tests/chains/ds2/lite_train_infer.sh @@ -1,5 +1,5 @@ bash prepare.sh ds2_params_lite_train_infer.txt lite_train_infer -cd ../../examples/tiny/s0 +cd ../../../examples/tiny/asr0 source path.sh -bash ../../../tests/chains/test.sh ../../../tests/chains/ds2_params_lite_train_infer.txt lite_train_infer +bash ../../../tests/chains/ds2/test.sh ../../../tests/chains/ds2/ds2_params_lite_train_infer.txt lite_train_infer cd ../../../tests/chains diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh index 73a302836..4913ce42e 100644 --- a/tests/chains/ds2/prepare.sh +++ b/tests/chains/ds2/prepare.sh @@ -34,7 +34,7 @@ MODE=$2 if [ ${MODE} = "lite_train_infer" ];then # pretrain lite train data curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/tiny/s0 + cd ${curPath}/../../../examples/tiny/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -47,7 +47,7 @@ if [ ${MODE} = "lite_train_infer" ];then elif [ ${MODE} = "whole_train_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd 
${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -59,7 +59,7 @@ elif [ ${MODE} = "whole_train_infer" ];then cd ${curPath} elif [ ${MODE} = "whole_infer" ];then curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 @@ -71,7 +71,7 @@ elif [ ${MODE} = "whole_infer" ];then cd ${curPath} else curPath=$(readlink -f "$(dirname "$0")") - cd ${curPath}/../../examples/aishell/s0 + cd ${curPath}/../../../examples/aishell/asr0 source path.sh # download audio data bash ./local/data.sh || exit -1 diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh index c93078205..26917c672 100644 --- a/tests/chains/ds2/test.sh +++ b/tests/chains/ds2/test.sh @@ -324,6 +324,7 @@ else gsu=${gpu//,/ } nump=`echo $gsu | wc -w` cmd="${python} ${run_train} --ngpu=$nump" + export CUDA_VISIBLE_DEVICES=${gpu} else # train with multi-machine cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_autocast} ${set_batchsize} ${set_train_params1}" fi diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh new file mode 100755 index 000000000..845c5d6a2 --- /dev/null +++ b/tests/unit/cli/test_cli.sh @@ -0,0 +1,26 @@ +#!/bin/bash +set -e +# Audio classification +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +paddlespeech cls --input ./cat.wav --topk 10 + +# Punctuation_restoration +paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + +# Speech_recognition +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +paddlespeech asr --input ./zh.wav +paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + +# Text To Speech +paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 +paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" +paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 + +# Speech Translation (only support linux) +paddlespeech st --input ./en.wav diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/.gitignore b/third_party/ctc_decoders/.gitignore similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/.gitignore rename to third_party/ctc_decoders/.gitignore diff --git a/third_party/ctc_decoders/COPYING.APACHE2.0 b/third_party/ctc_decoders/COPYING.APACHE2.0 new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/third_party/ctc_decoders/COPYING.APACHE2.0 @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
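Returning to the CLI smoke test added in tests/unit/cli/test_cli.sh above: the same models can be driven from Python through the executors in paddlespeech.cli. The sketch below follows the project's documented usage of that period; treat the exact class and keyword names (ASRExecutor, TTSExecutor, audio_file=, output=) as assumptions to verify against the cli package.

    from paddlespeech.cli import ASRExecutor, TTSExecutor

    asr = ASRExecutor()
    # Assumed keyword: audio_file; same wav as used by `paddlespeech asr` above.
    transcript = asr(audio_file="./zh.wav")
    print(transcript)

    tts = TTSExecutor()
    # Assumed keywords: text and output; counterpart of `paddlespeech tts --input ...`.
    tts(text="你好,欢迎使用百度飞桨深度学习框架!", output="output.wav")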
diff --git a/third_party/ctc_decoders/COPYING.LESSER.3 b/third_party/ctc_decoders/COPYING.LESSER.3 new file mode 100644 index 000000000..cca7fc278 --- /dev/null +++ b/third_party/ctc_decoders/COPYING.LESSER.3 @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. 
+ + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. 
If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE new file mode 100644 index 000000000..eeef74b30 --- /dev/null +++ b/third_party/ctc_decoders/LICENSE @@ -0,0 +1,8 @@ +Most of the code here is licensed under the Apache License 2.0. +There are exceptions that have their own licenses, listed below. + +score.h and score.cpp is under the LGPL license. +The two files include the header files from KenLM project. + +For the rest: +The default licence of paddlespeech-ctcdecoders is Apache License 2.0. diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/__init__.py b/third_party/ctc_decoders/__init__.py similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/__init__.py rename to third_party/ctc_decoders/__init__.py diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp rename to third_party/ctc_decoders/ctc_beam_search_decoder.cpp index 8469a194d..db742fbbe 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // @@ -183,7 +183,7 @@ std::vector> ctc_beam_search_decoder( std::sort( prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare); - // compute aproximate ctc score as the return score, without affecting the + // compute approximate ctc score as the return score, without affecting the // return order of decoding result. To delete when decoder gets stable. for (size_t i = 0; i < beam_size && i < prefixes.size(); ++i) { double approx_ctc = prefixes[i]->score; diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h b/third_party/ctc_decoders/ctc_beam_search_decoder.h similarity index 97% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h rename to third_party/ctc_decoders/ctc_beam_search_decoder.h index eaba9da8c..584226574 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp b/third_party/ctc_decoders/ctc_greedy_decoder.cpp similarity index 96% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp rename to third_party/ctc_decoders/ctc_greedy_decoder.cpp index 53a04fba0..a178c6734 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp +++ b/third_party/ctc_decoders/ctc_greedy_decoder.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h b/third_party/ctc_decoders/ctc_greedy_decoder.h similarity index 93% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h rename to third_party/ctc_decoders/ctc_greedy_decoder.h index dd1b33315..4d60beaf1 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h +++ b/third_party/ctc_decoders/ctc_greedy_decoder.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp b/third_party/ctc_decoders/decoder_utils.cpp similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp rename to third_party/ctc_decoders/decoder_utils.cpp index 5d69ad032..c7ef65428 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp +++ b/third_party/ctc_decoders/decoder_utils.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // @@ -26,7 +26,7 @@ std::vector> get_pruned_log_probs( for (size_t i = 0; i < prob_step.size(); ++i) { prob_idx.push_back(std::pair(i, prob_step[i])); } - // pruning of vacobulary + // pruning of vocabulary size_t cutoff_len = prob_step.size(); if (cutoff_prob < 1.0 || cutoff_top_n < cutoff_len) { std::sort(prob_idx.begin(), diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h b/third_party/ctc_decoders/decoder_utils.h similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h rename to third_party/ctc_decoders/decoder_utils.h index 1d75d03db..098741552 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h +++ b/third_party/ctc_decoders/decoder_utils.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoders.i b/third_party/ctc_decoders/decoders.i similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoders.i rename to third_party/ctc_decoders/decoders.i diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp b/third_party/ctc_decoders/path_trie.cpp similarity index 98% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp rename to third_party/ctc_decoders/path_trie.cpp index f52d11573..a5e7dd3da 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp +++ b/third_party/ctc_decoders/path_trie.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h b/third_party/ctc_decoders/path_trie.h similarity index 96% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h rename to third_party/ctc_decoders/path_trie.h index 717d4b004..5193e0a47 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h +++ b/third_party/ctc_decoders/path_trie.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp b/third_party/ctc_decoders/scorer.cpp similarity index 90% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp rename to third_party/ctc_decoders/scorer.cpp index 7bd6542df..977112d17 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp +++ b/third_party/ctc_decoders/scorer.cpp @@ -1,16 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); #include "scorer.h" @@ -20,8 +8,6 @@ #include "lm/config.hh" #include "lm/model.hh" #include "lm/state.hh" -#include "util/string_piece.hh" -#include "util/tokenize_piece.hh" #include "decoder_utils.h" @@ -223,7 +209,7 @@ void Scorer::fill_dictionary(bool add_space) { * This gets rid of "epsilon" transitions in the FST. * These are transitions that don't require a string input to be taken. 
- * Getting rid of them is necessary to make the FST determinisitc, but + * Getting rid of them is necessary to make the FST deterministic, but * can greatly increase the size of the FST */ fst::RmEpsilon(&dictionary); diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h b/third_party/ctc_decoders/scorer.h similarity index 82% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h rename to third_party/ctc_decoders/scorer.h index 3f3001e77..5739339df 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h +++ b/third_party/ctc_decoders/scorer.h @@ -1,16 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); #ifndef SCORER_H_ #define SCORER_H_ @@ -23,7 +11,6 @@ #include "lm/enumerate_vocab.hh" #include "lm/virtual_interface.hh" #include "lm/word_index.hh" -#include "util/string_piece.hh" #include "path_trie.h" diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/third_party/ctc_decoders/setup.py similarity index 97% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py rename to third_party/ctc_decoders/setup.py index 8a2086d6b..6484b87c5 100644 --- a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py +++ b/third_party/ctc_decoders/setup.py @@ -127,11 +127,11 @@ decoders_module = [ setup( name='paddlespeech_ctcdecoders', - version='0.1.0', + version='0.1.1', description="CTC decoders in paddlespeech", author="PaddlePaddle Speech and Language Team", author_email="paddlesl@baidu.com", url="https://github.com/PaddlePaddle/PaddleSpeech", - license='Apache 2.0', + license='Apache 2.0, GNU Lesser General Public License v3 (LGPLv3) (LGPL-3)', ext_modules=decoders_module, py_modules=['swig_decoders']) diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.sh b/third_party/ctc_decoders/setup.sh similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/setup.sh rename to third_party/ctc_decoders/setup.sh diff --git a/tools/extras/install_kaldi.sh b/tools/extras/install_kaldi.sh index b93e7ecf6..f8cd961fc 100755 --- a/tools/extras/install_kaldi.sh +++ b/tools/extras/install_kaldi.sh @@ -34,7 +34,7 @@ make -j4 pushd ../src OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS mkdir -p ${OPENBLAS_DIR}/install -if [ $SHARED == true ]; +if [ $SHARED == true ]; then ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install else ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install diff --git a/tools/release_note.py b/tools/release_note.py index 07a057697..2016c1a90 100755 --- a/tools/release_note.py +++ b/tools/release_note.py @@ -14,191 +14,180 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== - """ Create release notes with the issues from a milestone. python3 release_notes.py -c didi delta v.xxxxx """ - -import sys -import json import argparse -import urllib.request import collections +import json +import sys +import urllib.request github_url = 'https://api.github.com/repos' if __name__ == '__main__': - # usage: - # 1. close milestone on github - # 2. python3 tools/release_notes.py -c didi delta v0.3.3 - - # Parse arguments - parser = argparse.ArgumentParser( - description='Create a draft release with the issues from a milestone.', - ) - - parser.add_argument( - 'user', - metavar='user', - type=str, - default='paddlepaddle', - help='github user: paddlepaddle' - ) - - parser.add_argument( - 'repository', - metavar='repository', - type=str, - default='paddlespeech', - help='github repository: paddlespeech' - ) - - parser.add_argument( - 'milestone', - metavar='milestone', - type=str, - help='name of used milestone: v0.3.3' - ) - - parser.add_argument( - '-c', '--closed', - help='Fetch closed milestones/issues', - action='store_true' - ) - - parser.print_help() - args = parser.parse_args() - - # Fetch milestone infos - url = "%s/%s/%s/milestones" % ( - github_url, - args.user, - args.repository - ) - - headers = { - 'Origin': 'https://github.com', - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) ' - 'AppleWebKit/537.11 (KHTML, like Gecko) ' - 'Chrome/23.0.1271.64 Safari/537.11', - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', - 'Accept-Encoding': 'none', - 'Accept-Language': 'en-US,en;q=0.8', - 'Connection': 'keep-alive'} - - if args.closed: - url += "?state=closed" - - req = urllib.request.Request(url, headers=headers) - github_request = urllib.request.urlopen(req) - if not github_request: - parser.error('Cannot read milestone list.') - - decoder = json.JSONDecoder() - milestones = decoder.decode(github_request.read().decode('utf-8')) - github_request.close() - - print('parse milestones', file=sys.stderr) - milestone_id = None - for milestone in milestones: - if milestone['title'] == args.milestone: - milestone_id = milestone['number'] - if not milestone_id: - parser.error('Cannot find milestone') - - - # Get milestone related issue info - url = '%s/%s/%s/issues?milestone=%d' % ( - github_url, - args.user, - args.repository, - milestone_id - ) - if args.closed: - url += "&state=closed" - - req = urllib.request.Request(url, headers=headers) - github_request = urllib.request.urlopen(req) - if not github_request: - parser.error('Cannot read issue list.') - - issues = decoder.decode(github_request.read().decode('utf-8')) - github_request.close() - - #print('parse issues', file=sys.stderr) - #final_data = [] - #labels = [] - #thanks_to = [] - #for issue in issues: - - # for label in issue['labels']: - # labels.append(label['name']) - - # thanks_to.append('@%s' % (issue['user']['login'])) - # final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % ( - # label['name'], - # issue['title'], - # issue['number'], - # issue['user']['login'] - # )) - - #dic = collections.defaultdict(set) - #for l_release in list(set(labels)): - - # for f_data in final_data: - # if '[%s]' % l_release in f_data: - # dic[l_release].add(f_data) - - #with open(f"release_note_issues_{args.milestone}.md", 'w') as f: - # for key, value in dic.items(): - # print('# %s\n%s' % (key, ''.join(value)), file=f) - # print('# %s\n%s' % ('Acknowledgements', 
'Special thanks to %s ' % (' '.join(list(set(thanks_to))))), file=f) - - - # Get milestone related PR info - url = '%s/%s/%s/pulls?milestone=%d' % ( - github_url, - args.user, - args.repository, - milestone_id - ) - if args.closed: - url += "&state=closed" - - req = urllib.request.Request(url, headers=headers) - github_request = urllib.request.urlopen(req) - if not github_request: - parser.error('Cannot read issue list.') - - issues = decoder.decode(github_request.read().decode('utf-8')) - github_request.close() - - print('parse pulls', file=sys.stderr) - final_data = [] - labels = [] - thanks_to = [] - for issue in issues: - - for label in issue['labels']: - labels.append(label['name']) - - thanks_to.append('@%s' % (issue['user']['login'])) - final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % ( - label['name'], - issue['title'], - issue['number'], - issue['user']['login'] - )) - - dic = collections.defaultdict(set) - for l_release in list(set(labels)): - - for f_data in final_data: - if '[%s]' % l_release in f_data: - dic[l_release].add(f_data) - - with open(f"release_note_pulls_{args.milestone}.md", 'w') as f: - for key, value in dic.items(): - print('# %s\n%s' % (key, ''.join(value)), file=f) - print('# %s\n%s' % ('Acknowledgements', 'Special thanks to %s ' % (' '.join(list(set(thanks_to))))), file=f) + # usage: + # 1. close milestone on github + # 2. python3 tools/release_notes.py -c didi delta v0.3.3 + + # Parse arguments + parser = argparse.ArgumentParser( + description='Create a draft release with the issues from a milestone.', + ) + + parser.add_argument( + 'user', + metavar='user', + type=str, + default='paddlepaddle', + help='github user: paddlepaddle') + + parser.add_argument( + 'repository', + metavar='repository', + type=str, + default='paddlespeech', + help='github repository: paddlespeech') + + parser.add_argument( + 'milestone', + metavar='milestone', + type=str, + help='name of used milestone: v0.3.3') + + parser.add_argument( + '-c', + '--closed', + help='Fetch closed milestones/issues', + action='store_true') + + parser.print_help() + args = parser.parse_args() + + # Fetch milestone infos + url = "%s/%s/%s/milestones" % (github_url, args.user, args.repository) + + headers = { + 'Origin': + 'https://github.com', + 'User-Agent': + 'Mozilla/5.0 (X11; Linux x86_64) ' + 'AppleWebKit/537.11 (KHTML, like Gecko) ' + 'Chrome/23.0.1271.64 Safari/537.11', + 'Accept': + 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Charset': + 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', + 'Accept-Encoding': + 'none', + 'Accept-Language': + 'en-US,en;q=0.8', + 'Connection': + 'keep-alive' + } + + if args.closed: + url += "?state=closed" + + req = urllib.request.Request(url, headers=headers) + github_request = urllib.request.urlopen(req) + if not github_request: + parser.error('Cannot read milestone list.') + + decoder = json.JSONDecoder() + milestones = decoder.decode(github_request.read().decode('utf-8')) + github_request.close() + + print('parse milestones', file=sys.stderr) + milestone_id = None + for milestone in milestones: + if milestone['title'] == args.milestone: + milestone_id = milestone['number'] + if not milestone_id: + parser.error('Cannot find milestone') + + # Get milestone related issue info + url = '%s/%s/%s/issues?milestone=%d' % (github_url, args.user, + args.repository, milestone_id) + if args.closed: + url += "&state=closed" + + req = urllib.request.Request(url, headers=headers) + github_request = urllib.request.urlopen(req) + if not github_request: + 
parser.error('Cannot read issue list.') + + issues = decoder.decode(github_request.read().decode('utf-8')) + github_request.close() + + #print('parse issues', file=sys.stderr) + #final_data = [] + #labels = [] + #thanks_to = [] + #for issue in issues: + + # for label in issue['labels']: + # labels.append(label['name']) + + # thanks_to.append('@%s' % (issue['user']['login'])) + # final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % ( + # label['name'], + # issue['title'], + # issue['number'], + # issue['user']['login'] + # )) + + #dic = collections.defaultdict(set) + #for l_release in list(set(labels)): + + # for f_data in final_data: + # if '[%s]' % l_release in f_data: + # dic[l_release].add(f_data) + + #with open(f"release_note_issues_{args.milestone}.md", 'w') as f: + # for key, value in dic.items(): + # print('# %s\n%s' % (key, ''.join(value)), file=f) + # print('# %s\n%s' % ('Acknowledgements', 'Special thanks to %s ' % (' '.join(list(set(thanks_to))))), file=f) + + # Get milestone related PR info + url = '%s/%s/%s/pulls?milestone=%d' % (github_url, args.user, + args.repository, milestone_id) + if args.closed: + url += "&state=closed" + + req = urllib.request.Request(url, headers=headers) + github_request = urllib.request.urlopen(req) + if not github_request: + parser.error('Cannot read issue list.') + + issues = decoder.decode(github_request.read().decode('utf-8')) + github_request.close() + + print('parse pulls', file=sys.stderr) + final_data = [] + labels = [] + thanks_to = [] + for issue in issues: + + for label in issue['labels']: + labels.append(label['name']) + + thanks_to.append('@%s' % (issue['user']['login'])) + final_data.append(' * **[%s]** - %s #%d by **@%s**\n' % + (label['name'], issue['title'], issue['number'], + issue['user']['login'])) + + dic = collections.defaultdict(set) + for l_release in list(set(labels)): + + for f_data in final_data: + if '[%s]' % l_release in f_data: + dic[l_release].add(f_data) + + with open(f"release_note_pulls_{args.milestone}.md", 'w') as f: + for key, value in dic.items(): + print('# %s\n%s' % (key, ''.join(value)), file=f) + print( + '# %s\n%s' % ('Acknowledgements', 'Special thanks to %s ' % + (' '.join(list(set(thanks_to))))), + file=f) diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py new file mode 100644 index 000000000..a2eb28c76 --- /dev/null +++ b/utils/generate_infer_yaml.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +''' + Merge training configs into a single inference config. + The single inference config is for CLI, which only takes a single config to do inferencing. + The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file. 
+''' + +import yaml +import json +import os +import argparse +import math +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.utility import load_dict +from contextlib import redirect_stdout + + +def save(save_path, config): + with open(save_path, 'w') as fp: + with redirect_stdout(fp): + print(config.dump()) + + +def load(save_path): + config = CfgNode(new_allowed=True) + config.merge_from_file(save_path) + return config + +def load_json(json_path): + with open(json_path) as f: + json_content = json.load(f) + return json_content + +def remove_config_part(config, key_list): + if len(key_list) == 0: + return + for i in range(len(key_list) -1): + config = config[key_list[i]] + config.pop(key_list[-1]) + +def load_cmvn_from_json(cmvn_stats): + means = cmvn_stats['mean_stat'] + variance = cmvn_stats['var_stat'] + count = cmvn_stats['frame_num'] + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn_stats = {"mean":means, "istd":variance} + return cmvn_stats + +def merge_configs( + conf_path = "conf/conformer.yaml", + preprocess_path = "conf/preprocess.yaml", + decode_path = "conf/tuning/decode.yaml", + vocab_path = "data/vocab.txt", + cmvn_path = "data/mean_std.json", + save_path = "conf/conformer_infer.yaml", + ): + + # Load the configs + config = load(conf_path) + decode_config = load(decode_path) + vocab_list = load_dict(vocab_path) + + # If use the kaldi feature, do not load the cmvn file + if cmvn_path.split(".")[-1] == 'json': + cmvn_stats = load_json(cmvn_path) + if os.path.exists(preprocess_path): + preprocess_config = load(preprocess_path) + for idx, process in enumerate(preprocess_config["process"]): + if process['type'] == "cmvn_json": + preprocess_config["process"][idx][ + "cmvn_path"] = cmvn_stats + break + + config.preprocess_config = preprocess_config + else: + cmvn_stats = load_cmvn_from_json(cmvn_stats) + config.mean_std_filepath = [{"cmvn_stats":cmvn_stats}] + config.augmentation_config = '' + # the cmvn file is end with .ark + else: + config.cmvn_path = cmvn_path + # Updata the config + config.vocab_filepath = vocab_list + config.input_dim = config.feat_dim + config.output_dim = len(config.vocab_filepath) + config.decode = decode_config + # Remove some parts of the config + + if os.path.exists(preprocess_path): + remove_train_list = ["train_manifest", + "dev_manifest", + "test_manifest", + "n_epoch", + "accum_grad", + "global_grad_clip", + "optim", + "optim_conf", + "scheduler", + "scheduler_conf", + "log_interval", + "checkpoint", + "shuffle_method", + "weight_decay", + "ctc_grad_norm_type", + "minibatches", + "subsampling_factor", + "batch_bins", + "batch_count", + "batch_frames_in", + "batch_frames_inout", + "batch_frames_out", + "sortagrad", + "feat_dim", + "stride_ms", + "window_ms", + "batch_size", + "maxlen_in", + "maxlen_out", + ] + else: + remove_train_list = ["train_manifest", + "dev_manifest", + "test_manifest", + "n_epoch", + "accum_grad", + "global_grad_clip", + "log_interval", + "checkpoint", + "lr", + "lr_decay", + "batch_size", + "shuffle_method", + "weight_decay", + "sortagrad", + "num_workers", + ] + + for item in remove_train_list: + try: + remove_config_part(config, [item]) + except: + print ( item + " " +"can not be removed") + + # Save the config + save(save_path, config) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='Config merge', add_help=True) + 
parser.add_argument( + '--cfg_pth', type=str, default = 'conf/transformer.yaml', help='origin config file') + parser.add_argument( + '--pre_pth', type=str, default= "conf/preprocess.yaml", help='') + parser.add_argument( + '--dcd_pth', type=str, default= "conf/tuninig/decode.yaml", help='') + parser.add_argument( + '--vb_pth', type=str, default= "data/lang_char/vocab.txt", help='') + parser.add_argument( + '--cmvn_pth', type=str, default= "data/mean_std.json", help='') + parser.add_argument( + '--save_pth', type=str, default= "conf/transformer_infer.yaml", help='') + parser_args = parser.parse_args() + + merge_configs( + conf_path = parser_args.cfg_pth, + decode_path = parser_args.dcd_pth, + preprocess_path = parser_args.pre_pth, + vocab_path = parser_args.vb_pth, + cmvn_path = parser_args.cmvn_pth, + save_path = parser_args.save_pth, + ) + + diff --git a/examples/csmsc/voc3/local/link_wav.py b/utils/link_wav.py similarity index 77% rename from examples/csmsc/voc3/local/link_wav.py rename to utils/link_wav.py index c81e0d4b8..8fe2156b2 100644 --- a/examples/csmsc/voc3/local/link_wav.py +++ b/utils/link_wav.py @@ -18,6 +18,7 @@ from pathlib import Path import jsonlines import numpy as np +from tqdm import tqdm def main(): @@ -52,15 +53,24 @@ def main(): output_dir = dump_dir / sub output_dir.mkdir(parents=True, exist_ok=True) results = [] - for name in os.listdir(output_dir / "raw"): - # 003918_feats.npy - utt_id = name.split("_")[0] + files = os.listdir(output_dir / "raw") + for name in tqdm(files): + utt_id = name.split("_feats.npy")[0] mel_path = output_dir / ("raw/" + name) gen_mel = np.load(mel_path) wave_name = utt_id + "_wave.npy" - wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) - os.symlink(old_dump_dir / sub / ("raw/" + wave_name), - output_dir / ("raw/" + wave_name)) + try: + wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) + os.symlink(old_dump_dir / sub / ("raw/" + wave_name), + output_dir / ("raw/" + wave_name)) + except FileNotFoundError: + print("delete " + name + + " because it cannot be found in the dump folder") + os.remove(output_dir / "raw" / name) + continue + except FileExistsError: + print("file " + name + " exists, skip.") + continue num_sample = wav.shape[0] num_frames = gen_mel.shape[0] wav_path = output_dir / ("raw/" + wave_name)
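The new utils/generate_infer_yaml.py folds the JSON CMVN statistics into the merged inference config by converting accumulated sums into a per-dimension mean and inverse standard deviation (see load_cmvn_from_json above). A small standalone sketch of that conversion with a made-up, hand-checkable statistic:

    import math

    def cmvn_json_to_mean_istd(cmvn_stats):
        # Same arithmetic as load_cmvn_from_json() above.
        means = list(cmvn_stats["mean_stat"])
        var = list(cmvn_stats["var_stat"])
        count = cmvn_stats["frame_num"]
        istd = []
        for i in range(len(means)):
            means[i] /= count
            v = var[i] / count - means[i] * means[i]
            v = max(v, 1.0e-20)            # floor tiny variances before inverting
            istd.append(1.0 / math.sqrt(v))
        return {"mean": means, "istd": istd}

    # Toy check: three frames of a 1-dim feature [1.0, 2.0, 3.0]
    # -> mean_stat = 6.0, var_stat = 1 + 4 + 9 = 14.0, frame_num = 3
    stats = {"mean_stat": [6.0], "var_stat": [14.0], "frame_num": 3}
    print(cmvn_json_to_mean_istd(stats))   # mean 2.0, istd = 1/sqrt(14/3 - 4) ≈ 1.2247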
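One detail in the relocated utils/link_wav.py worth spelling out: the utterance id is now recovered with name.split("_feats.npy")[0] rather than name.split("_")[0], which matters once utterance ids themselves contain underscores. A quick illustration with a hypothetical VCTK-style file name:

    name = "p225_001_feats.npy"           # hypothetical id containing an underscore
    print(name.split("_")[0])             # 'p225'      -> truncated id (old behaviour)
    print(name.split("_feats.npy")[0])    # 'p225_001'  -> full utterance id (new behaviour)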