diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6e7ae1fbf..0435cfbe1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -50,13 +50,20 @@ repos:
        entry: bash .pre-commit-hooks/clang-format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
-       exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
+       exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$
    #- id: copyright_checker
    #  name: copyright_checker
    #  entry: python .pre-commit-hooks/copyright-check.hook
    #  language: system
    #  files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
    #  exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin).*(\.cpp|\.cc|\.h|\.py)$
+   - id: cpplint
+     name: cpplint
+     description: Static code analysis of C/C++ files
+     language: python
+     files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
+     exclude: (?=speechx/speechx/kaldi|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.py)$
+     entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
 - repo: https://github.com/asottile/reorder_python_imports
   rev: v2.4.0
   hooks:
diff --git a/LICENSE b/LICENSE
old mode 100644
new mode 100755
diff --git a/MANIFEST.in b/MANIFEST.in
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 49e40624d..2321920de
--- a/README.md
+++ b/README.md
@@ -157,11 +157,18 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).

### Recent Update
-- 👑 2022.10.11: Add [Wav2vec2ASR](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech.
-- 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and ERNIE-SAT in [PaddleSpeech Web Demo](./demos/speech_web).
+- 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), supporting multilingual speech recognition and translation.
+- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), supporting ASR and feature extraction.
+- 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
+- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech).
+- 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
+- 🔥 2022.10.26: Add [Prosody Prediction](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy) for TTS.
+- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
+- 👑 2022.10.11: Add [Wav2vec2ASR-en](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech.
+- 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and [ERNIE-SAT](https://arxiv.org/abs/2211.03545) in [PaddleSpeech Web Demo](./demos/speech_web).
- ⚡ 2022.09.09: Add AISHELL-3 Voice Cloning [example](./examples/aishell3/vc2) with ECAPA-TDNN speaker encoder.
- ⚡ 2022.08.25: Release TTS [finetune](./examples/other/tts_finetune/tts3) example.
-- 🔥 2022.08.22: Add ERNIE-SAT models: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat).
+- 🔥 2022.08.22: Add [ERNIE-SAT](https://arxiv.org/abs/2211.03545) models: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat).
- 🔥 2022.08.15: Add [g2pW](https://github.com/GitYCC/g2pW) into TTS Chinese Text Frontend.
- 🔥 2022.08.09: Release [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
- ⚡ 2022.08.03: Add ONNXRuntime infer for TTS CLI.
@@ -576,7 +583,7 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
-      <td> ERNIE-SAT </td>
+      <td> <a href="https://arxiv.org/abs/2211.03545"> ERNIE-SAT </a> </td>
       <td> VCTK / AISHELL-3 / ZH_EN </td>
       <td> ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en </td>
@@ -697,6 +704,31 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
+
+**Keyword Spotting**
+
+<table style="width:100%">
+  <thead>
+    <tr>
+      <th> Task </th>
+      <th> Dataset </th>
+      <th> Model Type </th>
+      <th> Example </th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td> Keyword Spotting </td>
+      <td> hey-snips </td>
+      <td> MDTC </td>
+      <td> mdtc-hey-snips </td>
+    </tr>
+  </tbody>
+</table>
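As a quick way to exercise the model in the row above, here is a minimal, hedged sketch from the command line. It assumes the `paddlespeech kws` CLI entry point documented in the keyword spotting demo; `./hey.wav` is a placeholder for a local 16 kHz mono clip containing the keyword.

```bash
# Hedged sketch: run keyword spotting with the default MDTC hey-snips model.
# `./hey.wav` is a placeholder clip; the CLI prints whether the keyword was detected.
paddlespeech kws --input ./hey.wav
```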
**Speaker Verification**

@@ -826,7 +858,21 @@ The Text-to-Speech module is originally called [Parakeet](https://github.com/Pad
## Citation

To cite PaddleSpeech for research, please use the following format.
-```tex
+```text
+@InProceedings{pmlr-v162-bai22d,
+  title = {{A}$^3${T}: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing},
+  author = {Bai, He and Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Li, Xintong and Huang, Liang},
+  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
+  pages = {1399--1411},
+  year = {2022},
+  volume = {162},
+  series = {Proceedings of Machine Learning Research},
+  month = {17--23 Jul},
+  publisher = {PMLR},
+  pdf = {https://proceedings.mlr.press/v162/bai22d/bai22d.pdf},
+  url = {https://proceedings.mlr.press/v162/bai22d.html},
+}
+
@inproceedings{zhang2022paddlespeech,
  title = {PaddleSpeech: An Easy-to-Use All-in-One Speech Toolkit},
  author = {Hui Zhang, Tian Yuan, Junkun Chen, Xintong Li, Renjie Zheng, Yuxin Huang, Xiaojie Chen, Enlei Gong, Zeyu Chen, Xiaoguang Hu, dianhai yu, Yanjun Ma, Liang Huang},
@@ -923,8 +969,8 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
## Acknowledgement
- Many thanks to [HighCWu](https://github.com/HighCWu) for adding [VITS-aishell3](./examples/aishell3/vits) and [VITS-VC](./examples/aishell3/vits-vc) examples.
-- Many thanks to [david-95](https://github.com/david-95) improved TTS, fixed multi-punctuation bug, and contributed to multiple program and data.
-- Many thanks to [BarryKCL](https://github.com/BarryKCL) improved TTS Chinses frontend based on [G2PW](https://github.com/GitYCC/g2pW).
+- Many thanks to [david-95](https://github.com/david-95) for fixing the multi-punctuation bug, contributing multiple programs and data, and adding [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
+- Many thanks to [BarryKCL](https://github.com/BarryKCL) for improving TTS Chinese Frontend based on [G2PW](https://github.com/GitYCC/g2pW).
- Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help.
- Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files.
- Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function.
diff --git a/README_cn.md b/README_cn.md
old mode 100644
new mode 100755
index bf3ff4dfd..8127c5570
--- a/README_cn.md
+++ b/README_cn.md
@@ -164,11 +164,18 @@

### 近期更新
-- 👑 2022.10.11: 新增 [Wav2vec2ASR](./examples/librispeech/asr3), 在 LibriSpeech 上针对ASR任务对wav2vec2.0 的fine-tuning.
-- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 ERNIE-SAT 到 [PaddleSpeech 网页应用](./demos/speech_web)。
+- 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640),支持多种语言的识别与翻译。
+- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl),支持 ASR 和特征提取。
+- 🎉 2022.11.17: TTS 新增[高质量男性音色](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660)。
+- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech)。
+- 👑 2022.11.01: [中英文混合 TTS](./examples/zh_en_tts/tts3) 新增 [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) 模块。
+- 🔥 2022.10.26: TTS 新增[韵律预测](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy)功能。
+- 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。
+- 👑 2022.10.11: 新增 [Wav2vec2ASR-en](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。
+- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 到 [PaddleSpeech 网页应用](./demos/speech_web)。
- ⚡ 2022.09.09: 新增基于 ECAPA-TDNN 声纹模型的 AISHELL-3 Voice Cloning [示例](./examples/aishell3/vc2)。
- ⚡ 2022.08.25: 发布 TTS [finetune](./examples/other/tts_finetune/tts3) 示例。
-- 🔥 2022.08.22: 新增 ERNIE-SAT 模型: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat)。
+- 🔥 2022.08.22: 新增 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 模型: [ERNIE-SAT-vctk](./examples/vctk/ernie_sat)、[ERNIE-SAT-aishell3](./examples/aishell3/ernie_sat)、[ERNIE-SAT-zh_en](./examples/aishell3_vctk/ernie_sat)。
- 🔥 2022.08.15: 将 [g2pW](https://github.com/GitYCC/g2pW) 引入 TTS 中文文本前端。
- 🔥 2022.08.09: 发布[中英文混合 TTS](./examples/zh_en_tts/tts3)。
- ⚡ 2022.08.03: TTS CLI 新增 ONNXRuntime 推理方式。
@@ -573,7 +580,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
-      <td> ERNIE-SAT </td>
+      <td> <a href="https://arxiv.org/abs/2211.03545"> ERNIE-SAT </a> </td>
       <td> VCTK / AISHELL-3 / ZH_EN </td>
       <td> ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en </td>
@@ -694,6 +701,31 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
+
+**语音唤醒**
+
+<table style="width:100%">
+  <thead>
+    <tr>
+      <th> 任务 </th>
+      <th> 数据集 </th>
+      <th> 模型类型 </th>
+      <th> 脚本 </th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td> 语音唤醒 </td>
+      <td> hey-snips </td>
+      <td> MDTC </td>
+      <td> mdtc-hey-snips </td>
+    </tr>
+  </tbody>
+</table>
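下面给出一个体验上表模型的最小示例(假设环境中已安装 PaddleSpeech,并使用语音唤醒 demo 文档中的 `paddlespeech kws` 命令行入口;`./hey.wav` 为占位的 16 kHz 单声道音频,实际用法以 demos/keyword_spotting 为准):

```bash
# 示例(假设):使用默认的 MDTC hey-snips 模型进行语音唤醒检测。
# `./hey.wav` 仅为占位音频路径。
paddlespeech kws --input ./hey.wav
```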
+ **声纹识别** @@ -832,6 +864,20 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 要引用 PaddleSpeech 进行研究,请使用以下格式进行引用。 ```text +@InProceedings{pmlr-v162-bai22d, + title = {{A}$^3${T}: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing}, + author = {Bai, He and Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Li, Xintong and Huang, Liang}, + booktitle = {Proceedings of the 39th International Conference on Machine Learning}, + pages = {1399--1411}, + year = {2022}, + volume = {162}, + series = {Proceedings of Machine Learning Research}, + month = {17--23 Jul}, + publisher = {PMLR}, + pdf = {https://proceedings.mlr.press/v162/bai22d/bai22d.pdf}, + url = {https://proceedings.mlr.press/v162/bai22d.html}, +} + @inproceedings{zhang2022paddlespeech, title = {PaddleSpeech: An Easy-to-Use All-in-One Speech Toolkit}, author = {Hui Zhang, Tian Yuan, Junkun Chen, Xintong Li, Renjie Zheng, Yuxin Huang, Xiaojie Chen, Enlei Gong, Zeyu Chen, Xiaoguang Hu, dianhai yu, Yanjun Ma, Liang Huang}, @@ -928,7 +974,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 ## 致谢 - 非常感谢 [HighCWu](https://github.com/HighCWu) 新增 [VITS-aishell3](./examples/aishell3/vits) 和 [VITS-VC](./examples/aishell3/vits-vc) 代码示例。 -- 非常感谢 [david-95](https://github.com/david-95) 修复句尾多标点符号出错的问题,贡献补充多条程序和数据。 +- 非常感谢 [david-95](https://github.com/david-95) 修复 TTS 句尾多标点符号出错的问题,贡献补充多条程序和数据。为 TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。 - 非常感谢 [BarryKCL](https://github.com/BarryKCL) 基于 [G2PW](https://github.com/GitYCC/g2pW) 对 TTS 中文文本前端的优化。 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 diff --git a/dataset/aidatatang_200zh/.gitignore b/dataset/aidatatang_200zh/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/aidatatang_200zh/README.md b/dataset/aidatatang_200zh/README.md old mode 100644 new mode 100755 diff --git a/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py old mode 100644 new mode 100755 diff --git a/dataset/aishell/.gitignore b/dataset/aishell/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/aishell/README.md b/dataset/aishell/README.md old mode 100644 new mode 100755 diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py old mode 100644 new mode 100755 diff --git a/dataset/aishell3/README.md b/dataset/aishell3/README.md old mode 100644 new mode 100755 diff --git a/dataset/chime3_background/chime3_background.py b/dataset/chime3_background/chime3_background.py old mode 100644 new mode 100755 diff --git a/dataset/gigaspeech/.gitignore b/dataset/gigaspeech/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/gigaspeech/README.md b/dataset/gigaspeech/README.md old mode 100644 new mode 100755 diff --git a/dataset/gigaspeech/gigaspeech.py b/dataset/gigaspeech/gigaspeech.py old mode 100644 new mode 100755 diff --git a/dataset/librispeech/.gitignore b/dataset/librispeech/.gitignore old mode 
100644 new mode 100755 diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py old mode 100644 new mode 100755 diff --git a/dataset/magicdata/README.md b/dataset/magicdata/README.md old mode 100644 new mode 100755 diff --git a/dataset/mini_librispeech/.gitignore b/dataset/mini_librispeech/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py old mode 100644 new mode 100755 diff --git a/dataset/multi_cn/README.md b/dataset/multi_cn/README.md old mode 100644 new mode 100755 diff --git a/dataset/musan/.gitignore b/dataset/musan/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/musan/musan.py b/dataset/musan/musan.py old mode 100644 new mode 100755 diff --git a/dataset/primewords/README.md b/dataset/primewords/README.md old mode 100644 new mode 100755 diff --git a/dataset/rir_noise/.gitignore b/dataset/rir_noise/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py old mode 100644 new mode 100755 diff --git a/dataset/st-cmds/README.md b/dataset/st-cmds/README.md old mode 100644 new mode 100755 diff --git a/dataset/ted_en_zh/.gitignore b/dataset/ted_en_zh/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py old mode 100644 new mode 100755 diff --git a/dataset/thchs30/.gitignore b/dataset/thchs30/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/thchs30/README.md b/dataset/thchs30/README.md old mode 100644 new mode 100755 diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py old mode 100644 new mode 100755 diff --git a/dataset/timit/.gitignore b/dataset/timit/.gitignore old mode 100644 new mode 100755 diff --git a/dataset/timit/timit.py b/dataset/timit/timit.py old mode 100644 new mode 100755 diff --git a/dataset/timit/timit_kaldi_standard_split.py b/dataset/timit/timit_kaldi_standard_split.py old mode 100644 new mode 100755 diff --git a/dataset/voxceleb/README.md b/dataset/voxceleb/README.md old mode 100644 new mode 100755 diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py old mode 100644 new mode 100755 diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py old mode 100644 new mode 100755 diff --git a/dataset/voxforge/run_data.sh b/dataset/voxforge/run_data.sh old mode 100644 new mode 100755 diff --git a/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py old mode 100644 new mode 100755 diff --git a/demos/README.md b/demos/README.md old mode 100644 new mode 100755 index 72b70b237..a41967864 --- a/demos/README.md +++ b/demos/README.md @@ -17,3 +17,5 @@ This directory contains many speech applications in multiple scenarios. 
* story talker - book reader based on OCR and TTS
* style_fs2 - multi style control for FastSpeech2 model
* text_to_speech - convert text into speech
+* self-supervised pretraining - speech feature extraction and speech recognition based on wav2vec2
+* Whisper - speech recognition and translation based on the Whisper model
diff --git a/demos/README_cn.md b/demos/README_cn.md
old mode 100644
new mode 100755
index 04fc1fa7d..ffb028f0e
--- a/demos/README_cn.md
+++ b/demos/README_cn.md
@@ -17,3 +17,5 @@
* 会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书。
* 个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成。
* 语音合成 - 基于给定的文本生成语音音频。
+* 自监督预训练模型 - 基于 wav2vec2 的语音特征提取和语音识别。
+* Whisper - 基于 Whisper 模型的语音识别与翻译。
diff --git a/demos/asr_deployment/README.md b/demos/asr_deployment/README.md
new file mode 100755
index 000000000..9d36f19f2
--- /dev/null
+++ b/demos/asr_deployment/README.md
@@ -0,0 +1,100 @@
+([简体中文](./README_cn.md)|English)
+# ASR Deployment by SpeechX
+
+## Introduction
+
+ASR deployment supports the U2/U2++/Deepspeech2 ASR models in C++, which is good practice for industrial deployment.
+
+For more info about SpeechX, please see [here](../../speechx/README.md).
+
+## Usage
+### 1. Environment
+
+* python - 3.7
+* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
+* os - Ubuntu 16.04.7 LTS
+* gcc/g++/gfortran - 8.2.0
+* cmake - 3.16.0
+
+For more info, please see [here](../../speechx/README.md).
+
+### 2. Compile SpeechX
+
+Please see [here](../../speechx/README.md).
+
+### 3. Usage
+
+For the U2++ ASR deployment example, please see [here](../../speechx/examples/u2pp_ol/wenetspeech/).
+
+First, go to the `speechx/speechx/examples/u2pp_ol/wenetspeech` directory.
+
+- Source path.sh
+  ```bash
+  source path.sh
+  ```
+
+- Download the model, prepare the test data and cmvn
+  ```bash
+  run.sh --stage 0 --stop_stage 1
+  ```
+
+- Decode with WAV
+
+  ```bash
+  # FP32
+  ./local/recognizer.sh
+
+  # INT8
+  ./local/recognizer_quant.sh
+  ```
+
+  Output:
+  ```bash
+  I1026 16:13:24.683531 48038 u2_recognizer_main.cc:55] utt: BAC009S0916W0495
+  I1026 16:13:24.683578 48038 u2_recognizer_main.cc:56] wav dur: 4.17119 sec.
+  I1026 16:13:24.683595 48038 u2_recognizer_main.cc:64] wav len (sample): 66739
+  I1026 16:13:25.037652 48038 u2_recognizer_main.cc:87] Pratial result: 3 这令
+  I1026 16:13:25.043697 48038 u2_recognizer_main.cc:87] Pratial result: 4 这令
+  I1026 16:13:25.222124 48038 u2_recognizer_main.cc:87] Pratial result: 5 这令被贷款
+  I1026 16:13:25.228385 48038 u2_recognizer_main.cc:87] Pratial result: 6 这令被贷款
+  I1026 16:13:25.414669 48038 u2_recognizer_main.cc:87] Pratial result: 7 这令被贷款的员工
+  I1026 16:13:25.420714 48038 u2_recognizer_main.cc:87] Pratial result: 8 这令被贷款的员工
+  I1026 16:13:25.608129 48038 u2_recognizer_main.cc:87] Pratial result: 9 这令被贷款的员工们请
+  I1026 16:13:25.801620 48038 u2_recognizer_main.cc:87] Pratial result: 10 这令被贷款的员工们请食难安
+  I1026 16:13:25.804101 48038 feature_cache.h:44] set finished
+  I1026 16:13:25.804128 48038 feature_cache.h:51] compute last feats done.
+  I1026 16:13:25.948771 48038 u2_recognizer_main.cc:87] Pratial result: 11 这令被贷款的员工们请食难安
+  I1026 16:13:26.246963 48038 u2_recognizer_main.cc:113] BAC009S0916W0495 这令被贷款的员工们请食难安
+  ```
+
+## Result
+
+> CER is computed on the aishell test set.
+> RTF (processing time divided by audio duration; lower is better) is computed over feature extraction and decoding, which is closer to end-to-end.
+> Machine: Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, avx512_vnni.
+
+### FP32
+
+```
+Overall -> 5.75 % N=104765 C=99035 S=5587 D=143 I=294
+Mandarin -> 5.75 % N=104762 C=99035 S=5584 D=143 I=294
+English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
+
+```
+RTF is: 0.315337
+```
+
+### INT8
+
+```
+Overall -> 5.83 % N=104765 C=98943 S=5675 D=147 I=286
+Mandarin -> 5.83 % N=104762 C=98943 S=5672 D=147 I=286
+English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
+
+```
+RTF is: 0.269674
+```
diff --git a/demos/asr_deployment/README_cn.md b/demos/asr_deployment/README_cn.md
new file mode 100755
index 000000000..ee4aa8489
--- /dev/null
+++ b/demos/asr_deployment/README_cn.md
@@ -0,0 +1,96 @@
+(简体中文|[English](./README.md))
+# 基于 SpeechX 的 ASR 部署
+
+## 简介
+
+支持 U2/U2++/Deepspeech2 模型的 C++ 部署,其在工业实践中经常被用到。
+
+更多 SpeechX 信息可以参看[文档](../../speechx/README.md)。
+
+## 使用
+### 1. 环境
+
+* python - 3.7
+* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
+* os - Ubuntu 16.04.7 LTS
+* gcc/g++/gfortran - 8.2.0
+* cmake - 3.16.0
+
+更多信息可以参看[文档](../../speechx/README.md)。
+
+### 2. 编译 SpeechX
+
+更多信息可以参看[文档](../../speechx/README.md)。
+
+### 3. 例子
+
+u2++ 识别部署参看[这里](../../speechx/examples/u2pp_ol/wenetspeech/)。
+
+以下操作在 `speechx/speechx/examples/u2pp_ol/wenetspeech` 目录下进行。
+
+- Source path.sh
+  ```bash
+  source path.sh
+  ```
+
+- 下载模型,准备测试数据和 cmvn 文件
+  ```bash
+  run.sh --stage 0 --stop_stage 1
+  ```
+
+- 解码
+
+  ```bash
+  # FP32
+  ./local/recognizer.sh
+
+  # INT8
+  ./local/recognizer_quant.sh
+  ```
+
+  输出:
+  ```bash
+  I1026 16:13:24.683531 48038 u2_recognizer_main.cc:55] utt: BAC009S0916W0495
+  I1026 16:13:24.683578 48038 u2_recognizer_main.cc:56] wav dur: 4.17119 sec.
+  I1026 16:13:24.683595 48038 u2_recognizer_main.cc:64] wav len (sample): 66739
+  I1026 16:13:25.037652 48038 u2_recognizer_main.cc:87] Pratial result: 3 这令
+  I1026 16:13:25.043697 48038 u2_recognizer_main.cc:87] Pratial result: 4 这令
+  I1026 16:13:25.222124 48038 u2_recognizer_main.cc:87] Pratial result: 5 这令被贷款
+  I1026 16:13:25.228385 48038 u2_recognizer_main.cc:87] Pratial result: 6 这令被贷款
+  I1026 16:13:25.414669 48038 u2_recognizer_main.cc:87] Pratial result: 7 这令被贷款的员工
+  I1026 16:13:25.420714 48038 u2_recognizer_main.cc:87] Pratial result: 8 这令被贷款的员工
+  I1026 16:13:25.608129 48038 u2_recognizer_main.cc:87] Pratial result: 9 这令被贷款的员工们请
+  I1026 16:13:25.801620 48038 u2_recognizer_main.cc:87] Pratial result: 10 这令被贷款的员工们请食难安
+  I1026 16:13:25.804101 48038 feature_cache.h:44] set finished
+  I1026 16:13:25.804128 48038 feature_cache.h:51] compute last feats done.
+ I1026 16:13:25.948771 48038 u2_recognizer_main.cc:87] Pratial result: 11 这令被贷款的员工们请食难安 + I1026 16:13:26.246963 48038 u2_recognizer_main.cc:113] BAC009S0916W0495 这令被贷款的员工们请食难安 + ``` + +## 结果 + +> CER 测试集为 aishell-test +> RTF 计算包含提特征和解码 +> 测试机器: Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz avx512_vnni + +### FP32 + +``` +Overall -> 5.75 % N=104765 C=99035 S=5587 D=143 I=294 +Mandarin -> 5.75 % N=104762 C=99035 S=5584 D=143 I=294 +English -> 0.00 % N=0 C=0 S=0 D=0 I=0 +Other -> 100.00 % N=3 C=0 S=3 D=0 I=0 +``` + +``` +RTF is: 0.315337 +``` + +### INT8 + +``` +Overall -> 5.87 % N=104765 C=98909 S=5711 D=145 I=289 +Mandarin -> 5.86 % N=104762 C=98909 S=5708 D=145 I=289 +English -> 0.00 % N=0 C=0 S=0 D=0 I=0 +Other -> 100.00 % N=3 C=0 S=3 D=0 I=0 +``` diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/README_cn.md b/demos/audio_content_search/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/acs_clinet.py b/demos/audio_content_search/acs_clinet.py old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/conf/acs_application.yaml b/demos/audio_content_search/conf/acs_application.yaml old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/conf/words.txt b/demos/audio_content_search/conf/words.txt old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/conf/ws_conformer_application.yaml b/demos/audio_content_search/conf/ws_conformer_application.yaml old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml b/demos/audio_content_search/conf/ws_conformer_wenetspeech_application.yaml old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/requirements.txt b/demos/audio_content_search/requirements.txt old mode 100644 new mode 100755 diff --git a/demos/audio_content_search/streaming_asr_server.py b/demos/audio_content_search/streaming_asr_server.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/README.md b/demos/audio_searching/README.md old mode 100644 new mode 100755 diff --git a/demos/audio_searching/README_cn.md b/demos/audio_searching/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/audio_searching/docker-compose.yaml b/demos/audio_searching/docker-compose.yaml old mode 100644 new mode 100755 diff --git a/demos/audio_searching/img/audio_searching.png b/demos/audio_searching/img/audio_searching.png old mode 100644 new mode 100755 diff --git a/demos/audio_searching/img/insert.png b/demos/audio_searching/img/insert.png old mode 100644 new mode 100755 diff --git a/demos/audio_searching/img/result.png b/demos/audio_searching/img/result.png old mode 100644 new mode 100755 diff --git a/demos/audio_searching/img/search.png b/demos/audio_searching/img/search.png old mode 100644 new mode 100755 diff --git a/demos/audio_searching/requirements.txt b/demos/audio_searching/requirements.txt old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/audio_search.py b/demos/audio_searching/src/audio_search.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/config.py b/demos/audio_searching/src/config.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/logs.py b/demos/audio_searching/src/logs.py old mode 100644 new mode 100755 diff --git 
a/demos/audio_searching/src/milvus_helpers.py b/demos/audio_searching/src/milvus_helpers.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/mysql_helpers.py b/demos/audio_searching/src/mysql_helpers.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/operations/__init__.py b/demos/audio_searching/src/operations/__init__.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/operations/count.py b/demos/audio_searching/src/operations/count.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/operations/drop.py b/demos/audio_searching/src/operations/drop.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/operations/load.py b/demos/audio_searching/src/operations/load.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/operations/search.py b/demos/audio_searching/src/operations/search.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/test_audio_search.py b/demos/audio_searching/src/test_audio_search.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/test_vpr_search.py b/demos/audio_searching/src/test_vpr_search.py old mode 100644 new mode 100755 diff --git a/demos/audio_searching/src/vpr_search.py b/demos/audio_searching/src/vpr_search.py old mode 100644 new mode 100755 diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md old mode 100644 new mode 100755 diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md old mode 100644 new mode 100755 diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py old mode 100644 new mode 100755 diff --git a/demos/custom_streaming_asr/README.md b/demos/custom_streaming_asr/README.md old mode 100644 new mode 100755 diff --git a/demos/custom_streaming_asr/README_cn.md b/demos/custom_streaming_asr/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/custom_streaming_asr/path.sh b/demos/custom_streaming_asr/path.sh old mode 100644 new mode 100755 diff --git a/demos/keyword_spotting/README.md b/demos/keyword_spotting/README.md old mode 100644 new mode 100755 diff --git a/demos/keyword_spotting/README_cn.md b/demos/keyword_spotting/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/metaverse/README.md b/demos/metaverse/README.md old mode 100644 new mode 100755 diff --git a/demos/metaverse/README_cn.md b/demos/metaverse/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/metaverse/sentences.txt b/demos/metaverse/sentences.txt old mode 100644 new mode 100755 diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md old mode 100644 new mode 100755 diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md old mode 100644 new mode 100755 diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/speech_recognition/.gitignore b/demos/speech_recognition/.gitignore old mode 100644 new mode 100755 diff --git 
a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
old mode 100644
new mode 100755
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
old mode 100644
new mode 100755
diff --git a/demos/speech_server/.gitignore b/demos/speech_server/.gitignore
old mode 100644
new mode 100755
diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md
old mode 100644
new mode 100755
diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md
old mode 100644
new mode 100755
diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml
old mode 100644
new mode 100755
diff --git a/demos/speech_server/start_multi_progress_server.py b/demos/speech_server/start_multi_progress_server.py
old mode 100644
new mode 100755
diff --git a/demos/speech_ssl/README.md b/demos/speech_ssl/README.md
index fdef37e7b..6de1fcb4e 100755
--- a/demos/speech_ssl/README.md
+++ b/demos/speech_ssl/README.md
@@ -82,7 +82,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
  Output:
  ```bash
  ASR Result:
-  我认为跑步最重要的就是给我带来了身体健康
+  i knocked at the door on the ancient side of the building

  Representation:
  Tensor(shape=[1, 164, 1024], dtype=float32, place=Place(gpu:0), stop_gradient=True,
diff --git a/demos/speech_ssl/README_cn.md b/demos/speech_ssl/README_cn.md
index 8e95f6a4d..d622117eb 100755
--- a/demos/speech_ssl/README_cn.md
+++ b/demos/speech_ssl/README_cn.md
@@ -36,9 +36,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
  ```
  参数:
  - `input`(必须输入):用于识别的音频文件。
-  - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。
-  - `task`:输出类别,默认值:`asr`。
-  - `lang`:模型语言,默认值:`zh`。
+  - `model`:ASR 任务的模型,默认值:`wav2vec2ASR_librispeech`。
+  - `task`:输出类别,默认值:`asr`。
+  - `lang`:模型语言,默认值:`en`。
  - `sample_rate`:音频采样率,默认值:`16000`。
  - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。
  - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。
@@ -83,8 +83,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
  输出:
  ```bash
  ASR Result:
-  我认为跑步最重要的就是给我带来了身体健康
+  i knocked at the door on the ancient side of the building

  Representation:
  Tensor(shape=[1, 164, 1024], dtype=float32, place=Place(gpu:0), stop_gradient=True,
  [[[ 0.02351918, -0.12980647,  0.17868176, ...,  0.10118122,
@@ -100,4 +100,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
  -0.04687393,  0.17864393], [ 0.05269200,  0.01297141, -0.23336855, ..., -0.11257174, -0.17227529,  0.20338398]]])
-  ``` \ No newline at end of file
+  ```
diff --git a/demos/speech_ssl/run.sh b/demos/speech_ssl/run.sh
index 204ccc826..704992cfd 100755
--- a/demos/speech_ssl/run.sh
+++ b/demos/speech_ssl/run.sh
@@ -1,11 +1,11 @@
#!/bin/bash

# audio download
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

# to recognize text
paddlespeech ssl --task asr --lang en --input ./en.wav

# to get acoustic representation
paddlespeech ssl --task vector --lang en --input ./en.wav
diff --git a/demos/speech_translation/README.md
b/demos/speech_translation/README.md old mode 100644 new mode 100755 diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/speech_web/.gitignore b/demos/speech_web/.gitignore old mode 100644 new mode 100755 diff --git a/demos/speech_web/API.md b/demos/speech_web/API.md old mode 100644 new mode 100755 diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/conf/tts3_finetune.yaml b/demos/speech_web/speech_server/conf/tts3_finetune.yaml old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/conf/tts_online_application.yaml b/demos/speech_web/speech_server/conf/tts_online_application.yaml old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/conf/ws_conformer_wenetspeech_application_faster.yaml b/demos/speech_web/speech_server/conf/ws_conformer_wenetspeech_application_faster.yaml old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/AudioManeger.py b/demos/speech_web/speech_server/src/AudioManeger.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/SpeechBase/asr.py b/demos/speech_web/speech_server/src/SpeechBase/asr.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/SpeechBase/nlp.py b/demos/speech_web/speech_server/src/SpeechBase/nlp.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/SpeechBase/sql_helper.py b/demos/speech_web/speech_server/src/SpeechBase/sql_helper.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/SpeechBase/tts.py b/demos/speech_web/speech_server/src/SpeechBase/tts.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/SpeechBase/vpr.py b/demos/speech_web/speech_server/src/SpeechBase/vpr.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/SpeechBase/vpr_encode.py b/demos/speech_web/speech_server/src/SpeechBase/vpr_encode.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/WebsocketManeger.py b/demos/speech_web/speech_server/src/WebsocketManeger.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/ernie_sat.py b/demos/speech_web/speech_server/src/ernie_sat.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/finetune.py b/demos/speech_web/speech_server/src/finetune.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/robot.py b/demos/speech_web/speech_server/src/robot.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/tdnn_clone.py b/demos/speech_web/speech_server/src/tdnn_clone.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/src/util.py b/demos/speech_web/speech_server/src/util.py old mode 100644 new mode 100755 diff --git a/demos/speech_web/speech_server/vc.py b/demos/speech_web/speech_server/vc.py old mode 100644 new mode 100755 diff --git 
a/demos/speech_web/web_client/.gitignore b/demos/speech_web/web_client/.gitignore old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/index.html b/demos/speech_web/web_client/index.html old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/package-lock.json b/demos/speech_web/web_client/package-lock.json old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/package.json b/demos/speech_web/web_client/package.json old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/public/favicon.ico b/demos/speech_web/web_client/public/favicon.ico old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/App.vue b/demos/speech_web/web_client/src/App.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/api/API.js b/demos/speech_web/web_client/src/api/API.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/api/ApiASR.js b/demos/speech_web/web_client/src/api/ApiASR.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/api/ApiNLP.js b/demos/speech_web/web_client/src/api/ApiNLP.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/api/ApiTTS.js b/demos/speech_web/web_client/src/api/ApiTTS.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/api/ApiVC.js b/demos/speech_web/web_client/src/api/ApiVC.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/api/ApiVPR.js b/demos/speech_web/web_client/src/api/ApiVPR.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_大-上传文件.svg b/demos/speech_web/web_client/src/assets/image/ic_大-上传文件.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_大-声音波浪.svg b/demos/speech_web/web_client/src/assets/image/ic_大-声音波浪.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_大-语音.svg b/demos/speech_web/web_client/src/assets/image/ic_大-语音.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_小-录制语音.svg b/demos/speech_web/web_client/src/assets/image/ic_小-录制语音.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_小-结束.svg b/demos/speech_web/web_client/src/assets/image/ic_小-结束.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_开始聊天.svg b/demos/speech_web/web_client/src/assets/image/ic_开始聊天.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_开始聊天_hover.svg b/demos/speech_web/web_client/src/assets/image/ic_开始聊天_hover.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_播放(按钮).svg b/demos/speech_web/web_client/src/assets/image/ic_播放(按钮).svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_暂停(按钮).svg b/demos/speech_web/web_client/src/assets/image/ic_暂停(按钮).svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/ic_更换示例.svg b/demos/speech_web/web_client/src/assets/image/ic_更换示例.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/icon_小-声音波浪.svg b/demos/speech_web/web_client/src/assets/image/icon_小-声音波浪.svg old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/icon_录制声音小语音1.svg b/demos/speech_web/web_client/src/assets/image/icon_录制声音小语音1.svg 
old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/在线体验-背景@2x.png b/demos/speech_web/web_client/src/assets/image/在线体验-背景@2x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/场景齐全@3x.png b/demos/speech_web/web_client/src/assets/image/场景齐全@3x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/教程丰富@3x.png b/demos/speech_web/web_client/src/assets/image/教程丰富@3x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/模型全面@3x.png b/demos/speech_web/web_client/src/assets/image/模型全面@3x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/步骤-箭头切图@2x.png b/demos/speech_web/web_client/src/assets/image/步骤-箭头切图@2x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/用户头像@2x.png b/demos/speech_web/web_client/src/assets/image/用户头像@2x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/image/飞桨头像@2x.png b/demos/speech_web/web_client/src/assets/image/飞桨头像@2x.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/assets/logo.png b/demos/speech_web/web_client/src/assets/logo.png old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/Content/Header/Header.vue b/demos/speech_web/web_client/src/components/Content/Header/Header.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/Content/Header/style.less b/demos/speech_web/web_client/src/components/Content/Header/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/Content/Tail/Tail.vue b/demos/speech_web/web_client/src/components/Content/Tail/Tail.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/Content/Tail/style.less b/demos/speech_web/web_client/src/components/Content/Tail/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/Experience.vue b/demos/speech_web/web_client/src/components/Experience.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/ASR.vue b/demos/speech_web/web_client/src/components/SubMenu/ASR/ASR.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/ASRT.vue b/demos/speech_web/web_client/src/components/SubMenu/ASR/ASRT.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/AudioFile/AudioFileIdentification.vue b/demos/speech_web/web_client/src/components/SubMenu/ASR/AudioFile/AudioFileIdentification.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/AudioFile/style.less b/demos/speech_web/web_client/src/components/SubMenu/ASR/AudioFile/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/EndToEnd/EndToEndIdentification.vue b/demos/speech_web/web_client/src/components/SubMenu/ASR/EndToEnd/EndToEndIdentification.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/EndToEnd/style.less b/demos/speech_web/web_client/src/components/SubMenu/ASR/EndToEnd/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/RealTime.vue 
b/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/RealTime.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/style.less b/demos/speech_web/web_client/src/components/SubMenu/ASR/RealTime/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ASR/style.less b/demos/speech_web/web_client/src/components/SubMenu/ASR/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ChatBot/ChatT.vue b/demos/speech_web/web_client/src/components/SubMenu/ChatBot/ChatT.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ChatBot/style.less b/demos/speech_web/web_client/src/components/SubMenu/ChatBot/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/ERNIE_SAT/ERNIE_SAT.vue b/demos/speech_web/web_client/src/components/SubMenu/ERNIE_SAT/ERNIE_SAT.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue b/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/IE/IET.vue b/demos/speech_web/web_client/src/components/SubMenu/IE/IET.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/IE/style.less b/demos/speech_web/web_client/src/components/SubMenu/IE/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/TTS/TTST.vue b/demos/speech_web/web_client/src/components/SubMenu/TTS/TTST.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/TTS/style.less b/demos/speech_web/web_client/src/components/SubMenu/TTS/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/VPR/VPRT.vue b/demos/speech_web/web_client/src/components/SubMenu/VPR/VPRT.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/VPR/style.less b/demos/speech_web/web_client/src/components/SubMenu/VPR/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue b/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/components/style.less b/demos/speech_web/web_client/src/components/style.less old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/src/main.js b/demos/speech_web/web_client/src/main.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/vite.config.js b/demos/speech_web/web_client/vite.config.js old mode 100644 new mode 100755 diff --git a/demos/speech_web/web_client/yarn.lock b/demos/speech_web/web_client/yarn.lock old mode 100644 new mode 100755 diff --git a/demos/story_talker/README.md b/demos/story_talker/README.md old mode 100644 new mode 100755 diff --git a/demos/story_talker/README_cn.md b/demos/story_talker/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/story_talker/ocr.py b/demos/story_talker/ocr.py old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/.gitignore b/demos/streaming_asr_server/.gitignore old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/README.md 
b/demos/streaming_asr_server/README.md old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/conf/application.yaml b/demos/streaming_asr_server/conf/application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/conf/punc_application.yaml b/demos/streaming_asr_server/conf/punc_application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/conf/ws_conformer_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application_faster.yaml b/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application_faster.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/conf/ws_ds2_application.yaml b/demos/streaming_asr_server/conf/ws_ds2_application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/local/punc_server.py b/demos/streaming_asr_server/local/punc_server.py old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/local/streaming_asr_server.py b/demos/streaming_asr_server/local/streaming_asr_server.py old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/local/websocket_client.py b/demos/streaming_asr_server/local/websocket_client.py old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/web/favicon.ico b/demos/streaming_asr_server/web/favicon.ico old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/web/index.html b/demos/streaming_asr_server/web/index.html old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/web/paddle_web_demo.png b/demos/streaming_asr_server/web/paddle_web_demo.png old mode 100644 new mode 100755 diff --git a/demos/streaming_asr_server/web/readme.md b/demos/streaming_asr_server/web/readme.md old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_server/README.md b/demos/streaming_tts_server/README.md old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_server/README_cn.md b/demos/streaming_tts_server/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_server/conf/tts_online_application.yaml b/demos/streaming_tts_server/conf/tts_online_application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_server/conf/tts_online_ws_application.yaml b/demos/streaming_tts_server/conf/tts_online_ws_application.yaml old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_serving_fastdeploy/README.md b/demos/streaming_tts_serving_fastdeploy/README.md old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_serving_fastdeploy/README_cn.md b/demos/streaming_tts_serving_fastdeploy/README_cn.md old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_serving_fastdeploy/streaming_tts_serving/1/model.py b/demos/streaming_tts_serving_fastdeploy/streaming_tts_serving/1/model.py old mode 100644 new mode 100755 diff --git a/demos/streaming_tts_serving_fastdeploy/streaming_tts_serving/config.pbtxt b/demos/streaming_tts_serving_fastdeploy/streaming_tts_serving/config.pbtxt old mode 100644 new mode 100755 diff --git 
a/demos/streaming_tts_serving_fastdeploy/streaming_tts_serving/stream_client.py b/demos/streaming_tts_serving_fastdeploy/streaming_tts_serving/stream_client.py
old mode 100644
new mode 100755
diff --git a/demos/streaming_tts_serving_fastdeploy/tree.png b/demos/streaming_tts_serving_fastdeploy/tree.png
old mode 100644
new mode 100755
diff --git a/demos/style_fs2/README.md b/demos/style_fs2/README.md
old mode 100644
new mode 100755
diff --git a/demos/style_fs2/README_cn.md b/demos/style_fs2/README_cn.md
old mode 100644
new mode 100755
diff --git a/demos/style_fs2/sentences.txt b/demos/style_fs2/sentences.txt
old mode 100644
new mode 100755
diff --git a/demos/style_fs2/style_syn.py b/demos/style_fs2/style_syn.py
old mode 100644
new mode 100755
diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md
old mode 100644
new mode 100755
diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md
old mode 100644
new mode 100755
diff --git a/demos/whisper/README.md b/demos/whisper/README.md
new file mode 100755
index 000000000..9b12554e6
--- /dev/null
+++ b/demos/whisper/README.md
@@ -0,0 +1,95 @@
+([简体中文](./README_cn.md)|English)
+
+# Whisper
+## Introduction
+Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.
+
+The Whisper model is trained by OpenAI; see https://github.com/openai/whisper
+
+## Usage
+ ### 1. Installation
+ See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+ You can choose one way from easy, medium and hard to install paddlespeech.
+
+ ### 2. Prepare Input File
+ The input of this demo should be a WAV file (`.wav`), and its sample rate must be the same as the model's.
+
+ Here are sample files for this demo that can be downloaded:
+ ```bash
+ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+ ```
+
+ ### 3. Usage
+ - Command Line (Recommended)
+   ```bash
+   # to recognize text
+   paddlespeech whisper --task transcribe --input ./zh.wav
+
+   # to switch to the English-only base-size model
+   paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
+
+   # to recognize text and translate to English
+   paddlespeech whisper --task translate --input ./zh.wav
+   ```
+
+   Usage:
+   ```bash
+   paddlespeech whisper --help
+   ```
+   Arguments:
+   - `input`(required): Audio file to recognize.
+   - `model`: Model type of the ASR task. Default: `whisper-large`.
+   - `task`: Output type. Default: `transcribe`.
+   - `lang`: Model language. Default: ``. Use `en` to choose an English-only model. Currently the [medium,base,small,tiny] sizes support English-only models.
+   - `size`: Model size for decoding. Default: `large`. Currently supports [large,medium,base,small,tiny].
+   - `language`: Set the decoding language. Default: `None`. Forcibly sets the recognized language; by default the model detects it itself.
+   - `sample_rate`: Sample rate of the model. Default: `16000`. Other sample rates are not supported yet.
+   - `config`: Config of the ASR task. Use a pretrained model when it is None. Default: `None`.
+   - `ckpt_path`: Model checkpoint. Use a pretrained model when it is None. Default: `None`.
+   - `yes`: No additional parameters required. Once set, it means accepting all requests of the program by default, including resampling the input audio. Default: `False`.
+   - `device`: Choose the device to execute model inference. Default: the default device of paddlepaddle in the current environment.
+   - `verbose`: Show the log information.
+
+
+ - Python API
+   ```python
+   import paddle
+   from paddlespeech.cli.whisper import WhisperExecutor
+
+   whisper_executor = WhisperExecutor()
+
+   # to recognize text
+   text = whisper_executor(
+       model='whisper',
+       task='transcribe',
+       sample_rate=16000,
+       config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
+       ckpt_path=None,
+       audio_file='./zh.wav',
+       device=paddle.get_device())
+   print('Transcribe Result: \n{}'.format(text))
+
+   # to recognize text and translate to English
+   feature = whisper_executor(
+       model='whisper',
+       task='translate',
+       sample_rate=16000,
+       config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
+       ckpt_path=None,
+       audio_file='./zh.wav',
+       device=paddle.get_device())
+   print('Translate Result: \n{}'.format(feature))
+   ```
+
+   Output:
+   ```bash
+   Transcribe Result:
+   Detected language: Chinese
+   [00:00.000 --> 00:05.000] 我认为跑步最重要的就是给我带来了身体健康
+   {'text': '我认为跑步最重要的就是给我带来了身体健康', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': '我认为跑步最重要的就是给我带来了身体健康', 'tokens': [50364, 1654, 7422, 97, 13992, 32585, 31429, 8661, 24928, 1546, 5620, 49076, 4845, 99, 34912, 19847, 29485, 44201, 6346, 115, 50614], 'temperature': 0.0, 'avg_logprob': -0.23577967557040128, 'compression_ratio': 0.28169014084507044, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
+
+   Translate Result:
+   Detected language: Chinese
+   [00:00.000 --> 00:05.000] I think the most important thing about running is that it brings me good health.
+   {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
diff --git a/demos/whisper/README_cn.md b/demos/whisper/README_cn.md
new file mode 100755
index 000000000..6f7c35f04
--- /dev/null
+++ b/demos/whisper/README_cn.md
@@ -0,0 +1,96 @@
+(简体中文|[English](./README.md))
+
+# Whisper模型
+## 介绍
+Whisper是一种通用的语音识别模型。它是在多种音频的大数据集上训练的,也是一个多任务模型,可以执行多语言语音识别以及语音翻译和语言识别。
+
+Whisper 模型由 OpenAI 训练,参见 https://github.com/openai/whisper
+
+## 使用方法
+### 1. 安装
+ 请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
+
+ 你可以从 easy,medium,hard 三种方式中选择一种方式安装。
+
+### 2. 准备输入
+ 这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
+
+ 可以下载此 demo 的示例音频:
+ ```bash
+ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+ ```
+
+### 3. 使用方法
diff --git a/demos/whisper/README_cn.md b/demos/whisper/README_cn.md new file mode 100755 index 000000000..6f7c35f04 --- /dev/null +++ b/demos/whisper/README_cn.md @@ -0,0 +1,96 @@

(简体中文|[English](./README.md))

# Whisper Model
## Introduction
Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification.

The Whisper model is trained and released by OpenAI: https://github.com/openai/whisper

## Usage
### 1. Installation
See the [installation docs](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md).

You can choose one of the easy, medium, and hard ways to install paddlespeech.

### 2. Prepare Input File
The input of this demo should be a WAV file (`.wav`), and its sample rate must match the model's.

Sample audio for this demo can be downloaded:
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```

### 3. Usage
- Command Line (Recommended)
  ```bash
  # to recognize text
  paddlespeech whisper --task transcribe --input ./zh.wav

  # to switch to the English-only model of a different size
  paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav

  # to translate the speech into English
  paddlespeech whisper --task translate --input ./zh.wav
  ```
  Usage:
  ```bash
  paddlespeech whisper --help
  ```
  Arguments:
  - `input` (required): Audio file to recognize.
  - `model`: Model of the ASR task. Default: `whisper-large`.
  - `task`: Output type. Default: `transcribe`.
  - `lang`: Model language. Default: ``. Use `en` to choose the English-only model; currently the [medium,base,small,tiny] sizes offer English-only models.
  - `size`: Model size. Default: `large`. Currently supported: [large,medium,base,small,tiny].
  - `language`: Set the decoding language. Default: `None`. Forces the recognized language; by default the model detects it automatically.
  - `sample_rate`: Audio sample rate. Default: `16000`. Whisper does not support other sample rates yet.
  - `config`: Config file of the ASR task. The default config of the pretrained model is used when unset. Default: `None`.
  - `ckpt_path`: Model checkpoint file. The pretrained model is downloaded and used when unset. Default: `None`.
  - `yes`: No additional parameter required. Once set, every prompt from the program is accepted by default, including converting the audio sample rate. Default: `False`.
  - `device`: Device to execute inference. Default: the default device of paddlepaddle in the current environment.
  - `verbose`: If set, show logger information.

- Python API
  ```python
  import paddle
  from paddlespeech.cli.whisper import WhisperExecutor

  whisper_executor = WhisperExecutor()

  # to recognize text
  text = whisper_executor(
      model='whisper',
      task='transcribe',
      sample_rate=16000,
      config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
      ckpt_path=None,
      audio_file='./zh.wav',
      device=paddle.get_device())
  print('ASR Result: \n{}'.format(text))

  # to translate the speech into English
  feature = whisper_executor(
      model='whisper',
      task='translate',
      sample_rate=16000,
      config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
      ckpt_path=None,
      audio_file='./zh.wav',
      device=paddle.get_device())
  print('Translation Result: \n{}'.format(feature))
  ```

  Output:
  ```bash
  Transcribe Result:
  Detected language: Chinese
  [00:00.000 --> 00:05.000] 我认为跑步最重要的就是给我带来了身体健康
  {'text': '我认为跑步最重要的就是给我带来了身体健康', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': '我认为跑步最重要的就是给我带来了身体健康', 'tokens': [50364, 1654, 7422, 97, 13992, 32585, 31429, 8661, 24928, 1546, 5620, 49076, 4845, 99, 34912, 19847, 29485, 44201, 6346, 115, 50614], 'temperature': 0.0, 'avg_logprob': -0.23577967557040128, 'compression_ratio': 0.28169014084507044, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}

  Translate Result:
  Detected language: Chinese
  [00:00.000 --> 00:05.000] I think the most important thing about running is that it brings me good health.
  {'text': ' I think the most important thing about running is that it brings me good health.', 'segments': [{'id': 0, 'seek': 0, 'start': 0.0, 'end': 5.0, 'text': ' I think the most important thing about running is that it brings me good health.', 'tokens': [50364, 286, 519, 264, 881, 1021, 551, 466, 2614, 307, 300, 309, 5607, 385, 665, 1585, 13, 50614], 'temperature': 0.0, 'avg_logprob': -0.47945233395225123, 'compression_ratio': 1.095890410958904, 'no_speech_prob': 0.028302080929279327}], 'language': 'zh'}
  ```
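  Because the executor object is reusable, several files can be processed in one session without reloading weights. A hedged batch sketch reusing the Python API above; the file list is illustrative, and indexing `result['text']` assumes the dict layout shown in the outputs:
  ```python
  import paddle
  from paddlespeech.cli.whisper import WhisperExecutor

  whisper_executor = WhisperExecutor()

  # Create the executor once, then loop over inputs; the model loads only once.
  for wav in ['./zh.wav', './en.wav']:
      result = whisper_executor(
          model='whisper',
          task='transcribe',
          sample_rate=16000,
          config=None,   # use the pretrained model
          ckpt_path=None,
          audio_file=wav,
          device=paddle.get_device())
      print(wav, '->', result['text'])
  ```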
diff --git a/demos/whisper/run.sh b/demos/whisper/run.sh new file mode 100755 index 000000000..b9595735f --- /dev/null +++ b/demos/whisper/run.sh @@ -0,0 +1,13 @@
+#!/bin/bash
+
+# audio download
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+
+# to recognize text
+paddlespeech whisper --task transcribe --input ./zh.wav
+
+# to recognize text and translate it to English
+paddlespeech whisper --task translate --input ./zh.wav
+
+# to switch to the English-only base model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
\ No newline at end of file
diff --git a/docker/ubuntu16-gpu/Dockerfile b/docker/ubuntu16-gpu/Dockerfile old mode 100644 new mode 100755 diff --git a/docker/ubuntu18-cpu/Dockerfile b/docker/ubuntu18-cpu/Dockerfile old mode 100644 new mode 100755 diff --git a/docs/Makefile b/docs/Makefile old mode 100644 new mode 100755 diff --git a/docs/images/PaddleSpeech_logo.png b/docs/images/PaddleSpeech_logo.png old mode 100644 new mode 100755 diff --git a/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png b/docs/images/arch/PaddleSpeech_Server_architecture_diagram.png old mode 100644 new mode 100755 diff --git a/docs/images/arch/PaddleSpeech_Server_class_diagram.png b/docs/images/arch/PaddleSpeech_Server_class_diagram.png old mode 100644 new mode 100755 diff --git a/docs/images/arch/paddlespeech_high_layout.jpg b/docs/images/arch/paddlespeech_high_layout.jpg old mode 100644 new mode 100755 diff --git a/docs/images/audio_icon.png b/docs/images/audio_icon.png old mode 100644 new mode 100755 diff --git a/docs/images/ds2offlineModel.png b/docs/images/ds2offlineModel.png old mode 100644 new mode 100755 diff --git a/docs/images/ds2onlineModel.png b/docs/images/ds2onlineModel.png old mode 100644 new mode 100755 diff --git a/docs/images/fastpitch.png b/docs/images/fastpitch.png old mode 100644 new mode 100755 diff --git a/docs/images/fastspeech.png b/docs/images/fastspeech.png old mode 100644 new mode 100755 diff --git a/docs/images/fastspeech2.png b/docs/images/fastspeech2.png old mode 100644 new mode 100755 diff --git a/docs/images/frame_level_am.png b/docs/images/frame_level_am.png old mode 100644 new mode 100755 diff --git a/docs/images/logo-small.png b/docs/images/logo-small.png old mode 100644 new mode 100755 diff --git a/docs/images/logo.png b/docs/images/logo.png old mode 100644 new mode 100755 diff --git a/docs/images/news_icon.png b/docs/images/news_icon.png old mode 100644 new mode 100755 diff --git a/docs/images/paddle.png b/docs/images/paddle.png old mode 100644 new mode 100755 diff --git a/docs/images/pwg.png b/docs/images/pwg.png old mode 100644 new mode 100755 diff --git a/docs/images/seq2seq_am.png b/docs/images/seq2seq_am.png old mode 100644 new mode 100755 diff --git a/docs/images/speedyspeech.png b/docs/images/speedyspeech.png old mode 100644 new mode 100755 diff --git a/docs/images/tacotron.png b/docs/images/tacotron.png old mode 100644 new mode 100755 diff --git
a/docs/images/tacotron2.png b/docs/images/tacotron2.png old mode 100644 new mode 100755 diff --git a/docs/images/transformer.png b/docs/images/transformer.png old mode 100644 new mode 100755 diff --git a/docs/images/transformer_tts.png b/docs/images/transformer_tts.png old mode 100644 new mode 100755 diff --git a/docs/images/tuning_error_surface.png b/docs/images/tuning_error_surface.png old mode 100644 new mode 100755 diff --git a/docs/images/wechat_group.png b/docs/images/wechat_group.png old mode 100644 new mode 100755 diff --git a/docs/paddlespeech.pdf b/docs/paddlespeech.pdf old mode 100644 new mode 100755 diff --git a/docs/requirements.txt b/docs/requirements.txt old mode 100644 new mode 100755 diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css old mode 100644 new mode 100755 diff --git a/docs/source/api/modules.rst b/docs/source/api/modules.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.backends.rst b/docs/source/api/paddlespeech.audio.backends.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst b/docs/source/api/paddlespeech.audio.backends.soundfile_backend.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.backends.sox_backend.rst b/docs/source/api/paddlespeech.audio.backends.sox_backend.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.compliance.kaldi.rst b/docs/source/api/paddlespeech.audio.compliance.kaldi.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.compliance.librosa.rst b/docs/source/api/paddlespeech.audio.compliance.librosa.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.compliance.rst b/docs/source/api/paddlespeech.audio.compliance.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.dataset.rst b/docs/source/api/paddlespeech.audio.datasets.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.esc50.rst b/docs/source/api/paddlespeech.audio.datasets.esc50.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.gtzan.rst b/docs/source/api/paddlespeech.audio.datasets.gtzan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst b/docs/source/api/paddlespeech.audio.datasets.hey_snips.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst b/docs/source/api/paddlespeech.audio.datasets.rirs_noises.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.rst b/docs/source/api/paddlespeech.audio.datasets.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.tess.rst b/docs/source/api/paddlespeech.audio.datasets.tess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst b/docs/source/api/paddlespeech.audio.datasets.urban_sound.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst b/docs/source/api/paddlespeech.audio.datasets.voxceleb.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.features.layers.rst b/docs/source/api/paddlespeech.audio.features.layers.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.features.rst 
b/docs/source/api/paddlespeech.audio.features.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.functional.functional.rst b/docs/source/api/paddlespeech.audio.functional.functional.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.functional.rst b/docs/source/api/paddlespeech.audio.functional.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.functional.window.rst b/docs/source/api/paddlespeech.audio.functional.window.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.io.rst b/docs/source/api/paddlespeech.audio.io.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.metric.eer.rst b/docs/source/api/paddlespeech.audio.metric.eer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.metric.rst b/docs/source/api/paddlespeech.audio.metric.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.rst b/docs/source/api/paddlespeech.audio.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.sox_effects.rst b/docs/source/api/paddlespeech.audio.sox_effects.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst b/docs/source/api/paddlespeech.audio.streamdata.autodecode.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.cache.rst b/docs/source/api/paddlespeech.audio.streamdata.cache.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.compat.rst b/docs/source/api/paddlespeech.audio.streamdata.compat.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst b/docs/source/api/paddlespeech.audio.streamdata.extradatasets.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.filters.rst b/docs/source/api/paddlespeech.audio.streamdata.filters.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.gopen.rst b/docs/source/api/paddlespeech.audio.streamdata.gopen.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.handlers.rst b/docs/source/api/paddlespeech.audio.streamdata.handlers.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.mix.rst b/docs/source/api/paddlespeech.audio.streamdata.mix.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst b/docs/source/api/paddlespeech.audio.streamdata.paddle_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst b/docs/source/api/paddlespeech.audio.streamdata.pipeline.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.rst b/docs/source/api/paddlespeech.audio.streamdata.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst b/docs/source/api/paddlespeech.audio.streamdata.shardlists.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst b/docs/source/api/paddlespeech.audio.streamdata.tariterators.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.streamdata.utils.rst b/docs/source/api/paddlespeech.audio.streamdata.utils.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.audio.streamdata.writer.rst b/docs/source/api/paddlespeech.audio.streamdata.writer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.text.rst b/docs/source/api/paddlespeech.audio.text.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.text.text_featurizer.rst b/docs/source/api/paddlespeech.audio.text.text_featurizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.text.utility.rst b/docs/source/api/paddlespeech.audio.text.utility.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.add_deltas.rst b/docs/source/api/paddlespeech.audio.transform.add_deltas.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.channel_selector.rst b/docs/source/api/paddlespeech.audio.transform.channel_selector.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.cmvn.rst b/docs/source/api/paddlespeech.audio.transform.cmvn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.functional.rst b/docs/source/api/paddlespeech.audio.transform.functional.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.perturb.rst b/docs/source/api/paddlespeech.audio.transform.perturb.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.rst b/docs/source/api/paddlespeech.audio.transform.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.spec_augment.rst b/docs/source/api/paddlespeech.audio.transform.spec_augment.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.spectrogram.rst b/docs/source/api/paddlespeech.audio.transform.spectrogram.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.transform_interface.rst b/docs/source/api/paddlespeech.audio.transform.transform_interface.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.transformation.rst b/docs/source/api/paddlespeech.audio.transform.transformation.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.transform.wpe.rst b/docs/source/api/paddlespeech.audio.transform.wpe.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst b/docs/source/api/paddlespeech.audio.utils.check_kwargs.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.download.rst b/docs/source/api/paddlespeech.audio.utils.download.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst b/docs/source/api/paddlespeech.audio.utils.dynamic_import.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.error.rst b/docs/source/api/paddlespeech.audio.utils.error.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.log.rst b/docs/source/api/paddlespeech.audio.utils.log.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.numeric.rst b/docs/source/api/paddlespeech.audio.utils.numeric.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.rst b/docs/source/api/paddlespeech.audio.utils.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst b/docs/source/api/paddlespeech.audio.utils.tensor_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.audio.utils.time.rst b/docs/source/api/paddlespeech.audio.utils.time.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.asr.infer.rst b/docs/source/api/paddlespeech.cli.asr.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.asr.rst b/docs/source/api/paddlespeech.cli.asr.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.base_commands.rst b/docs/source/api/paddlespeech.cli.base_commands.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.cls.infer.rst b/docs/source/api/paddlespeech.cli.cls.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.cls.rst b/docs/source/api/paddlespeech.cli.cls.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.download.rst b/docs/source/api/paddlespeech.cli.download.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.entry.rst b/docs/source/api/paddlespeech.cli.entry.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.executor.rst b/docs/source/api/paddlespeech.cli.executor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.kws.infer.rst b/docs/source/api/paddlespeech.cli.kws.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.kws.rst b/docs/source/api/paddlespeech.cli.kws.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.log.rst b/docs/source/api/paddlespeech.cli.log.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.rst b/docs/source/api/paddlespeech.cli.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.st.infer.rst b/docs/source/api/paddlespeech.cli.st.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.st.rst b/docs/source/api/paddlespeech.cli.st.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.text.infer.rst b/docs/source/api/paddlespeech.cli.text.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.text.rst b/docs/source/api/paddlespeech.cli.text.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.tts.infer.rst b/docs/source/api/paddlespeech.cli.tts.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.tts.rst b/docs/source/api/paddlespeech.cli.tts.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.utils.rst b/docs/source/api/paddlespeech.cli.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.vector.infer.rst b/docs/source/api/paddlespeech.cli.vector.infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cli.vector.rst b/docs/source/api/paddlespeech.cli.vector.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.exps.panns.rst b/docs/source/api/paddlespeech.cls.exps.panns.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.exps.rst b/docs/source/api/paddlespeech.cls.exps.rst old mode 
100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.models.panns.classifier.rst b/docs/source/api/paddlespeech.cls.models.panns.classifier.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.models.panns.panns.rst b/docs/source/api/paddlespeech.cls.models.panns.panns.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.models.panns.rst b/docs/source/api/paddlespeech.cls.models.panns.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.models.rst b/docs/source/api/paddlespeech.cls.models.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.cls.rst b/docs/source/api/paddlespeech.cls.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.collate.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.compute_det.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.score.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.exps.rst b/docs/source/api/paddlespeech.kws.exps.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.models.loss.rst b/docs/source/api/paddlespeech.kws.models.loss.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.models.mdtc.rst b/docs/source/api/paddlespeech.kws.models.mdtc.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.models.rst b/docs/source/api/paddlespeech.kws.models.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.kws.rst b/docs/source/api/paddlespeech.kws.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.resource.model_alias.rst b/docs/source/api/paddlespeech.resource.model_alias.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.resource.pretrained_models.rst b/docs/source/api/paddlespeech.resource.pretrained_models.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.resource.resource.rst b/docs/source/api/paddlespeech.resource.resource.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.resource.rst b/docs/source/api/paddlespeech.resource.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.rst b/docs/source/api/paddlespeech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.beam_search.batch_beam_search.rst b/docs/source/api/paddlespeech.s2t.decoders.beam_search.batch_beam_search.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.beam_search.beam_search.rst b/docs/source/api/paddlespeech.s2t.decoders.beam_search.beam_search.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.beam_search.rst b/docs/source/api/paddlespeech.s2t.decoders.beam_search.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.decoders_deprecated.rst 
b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.decoders_deprecated.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.recog.rst b/docs/source/api/paddlespeech.s2t.decoders.recog.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.rst b/docs/source/api/paddlespeech.s2t.decoders.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.ctc.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.ctc.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.ctc_prefix_score.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.ctc_prefix_score.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.length_bonus.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.length_bonus.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.scorer_interface.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.scorer_interface.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.decoders.utils.rst b/docs/source/api/paddlespeech.s2t.decoders.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.runtime.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.runtime.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.server.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.server.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.export.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.export.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.test.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.test.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.test_export.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.test_export.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.test_wav.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.test_wav.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.train.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.model.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.model.rst old mode 100644 new mode 
100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.rst b/docs/source/api/paddlespeech.s2t.exps.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.bin.alignment.rst b/docs/source/api/paddlespeech.s2t.exps.u2.bin.alignment.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.bin.export.rst b/docs/source/api/paddlespeech.s2t.exps.u2.bin.export.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.bin.rst b/docs/source/api/paddlespeech.s2t.exps.u2.bin.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.bin.test.rst b/docs/source/api/paddlespeech.s2t.exps.u2.bin.test.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.bin.test_wav.rst b/docs/source/api/paddlespeech.s2t.exps.u2.bin.test_wav.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.bin.train.rst b/docs/source/api/paddlespeech.s2t.exps.u2.bin.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.model.rst b/docs/source/api/paddlespeech.s2t.exps.u2.model.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.rst b/docs/source/api/paddlespeech.s2t.exps.u2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.test.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.test.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.train.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.model.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.model.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.export.rst b/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.export.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.rst b/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.test.rst b/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.test.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.train.rst b/docs/source/api/paddlespeech.s2t.exps.u2_st.bin.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_st.model.rst b/docs/source/api/paddlespeech.s2t.exps.u2_st.model.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_st.rst b/docs/source/api/paddlespeech.s2t.exps.u2_st.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.audio.rst b/docs/source/api/paddlespeech.s2t.frontend.audio.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.augmentation.rst 
b/docs/source/api/paddlespeech.s2t.frontend.augmentor.augmentation.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.base.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.base.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.impulse_response.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.impulse_response.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.noise_perturb.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.noise_perturb.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.online_bayesian_normalization.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.online_bayesian_normalization.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.resample.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.resample.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.shift_perturb.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.shift_perturb.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.spec_augment.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.spec_augment.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.speed_perturb.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.speed_perturb.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.augmentor.volume_perturb.rst b/docs/source/api/paddlespeech.s2t.frontend.augmentor.volume_perturb.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.featurizer.audio_featurizer.rst b/docs/source/api/paddlespeech.s2t.frontend.featurizer.audio_featurizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.featurizer.rst b/docs/source/api/paddlespeech.s2t.frontend.featurizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.featurizer.speech_featurizer.rst b/docs/source/api/paddlespeech.s2t.frontend.featurizer.speech_featurizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.featurizer.text_featurizer.rst b/docs/source/api/paddlespeech.s2t.frontend.featurizer.text_featurizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.normalizer.rst b/docs/source/api/paddlespeech.s2t.frontend.normalizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.rst b/docs/source/api/paddlespeech.s2t.frontend.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.speech.rst b/docs/source/api/paddlespeech.s2t.frontend.speech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.frontend.utility.rst b/docs/source/api/paddlespeech.s2t.frontend.utility.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.batchfy.rst b/docs/source/api/paddlespeech.s2t.io.batchfy.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.collator.rst 
b/docs/source/api/paddlespeech.s2t.io.collator.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.converter.rst b/docs/source/api/paddlespeech.s2t.io.converter.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.dataloader.rst b/docs/source/api/paddlespeech.s2t.io.dataloader.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.dataset.rst b/docs/source/api/paddlespeech.s2t.io.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.reader.rst b/docs/source/api/paddlespeech.s2t.io.reader.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.rst b/docs/source/api/paddlespeech.s2t.io.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.sampler.rst b/docs/source/api/paddlespeech.s2t.io.sampler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.io.utility.rst b/docs/source/api/paddlespeech.s2t.io.utility.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.asr_interface.rst b/docs/source/api/paddlespeech.s2t.models.asr_interface.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.ds2.conv.rst b/docs/source/api/paddlespeech.s2t.models.ds2.conv.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.ds2.deepspeech2.rst b/docs/source/api/paddlespeech.s2t.models.ds2.deepspeech2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.ds2.rst b/docs/source/api/paddlespeech.s2t.models.ds2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.lm.dataset.rst b/docs/source/api/paddlespeech.s2t.models.lm.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.lm.rst b/docs/source/api/paddlespeech.s2t.models.lm.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.lm.transformer.rst b/docs/source/api/paddlespeech.s2t.models.lm.transformer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.lm_interface.rst b/docs/source/api/paddlespeech.s2t.models.lm_interface.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.rst b/docs/source/api/paddlespeech.s2t.models.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.st_interface.rst b/docs/source/api/paddlespeech.s2t.models.st_interface.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.u2.rst b/docs/source/api/paddlespeech.s2t.models.u2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.u2.u2.rst b/docs/source/api/paddlespeech.s2t.models.u2.u2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.u2.updater.rst b/docs/source/api/paddlespeech.s2t.models.u2.updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.u2_st.rst b/docs/source/api/paddlespeech.s2t.models.u2_st.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.models.u2_st.u2_st.rst b/docs/source/api/paddlespeech.s2t.models.u2_st.u2_st.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.activation.rst b/docs/source/api/paddlespeech.s2t.modules.activation.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.s2t.modules.align.rst b/docs/source/api/paddlespeech.s2t.modules.align.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.attention.rst b/docs/source/api/paddlespeech.s2t.modules.attention.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.cmvn.rst b/docs/source/api/paddlespeech.s2t.modules.cmvn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.conformer_convolution.rst b/docs/source/api/paddlespeech.s2t.modules.conformer_convolution.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.crf.rst b/docs/source/api/paddlespeech.s2t.modules.crf.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.ctc.rst b/docs/source/api/paddlespeech.s2t.modules.ctc.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.decoder.rst b/docs/source/api/paddlespeech.s2t.modules.decoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.decoder_layer.rst b/docs/source/api/paddlespeech.s2t.modules.decoder_layer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.embedding.rst b/docs/source/api/paddlespeech.s2t.modules.embedding.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.encoder.rst b/docs/source/api/paddlespeech.s2t.modules.encoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.encoder_layer.rst b/docs/source/api/paddlespeech.s2t.modules.encoder_layer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.initializer.rst b/docs/source/api/paddlespeech.s2t.modules.initializer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.loss.rst b/docs/source/api/paddlespeech.s2t.modules.loss.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.mask.rst b/docs/source/api/paddlespeech.s2t.modules.mask.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.positionwise_feed_forward.rst b/docs/source/api/paddlespeech.s2t.modules.positionwise_feed_forward.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.rst b/docs/source/api/paddlespeech.s2t.modules.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.modules.subsampling.rst b/docs/source/api/paddlespeech.s2t.modules.subsampling.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.rst b/docs/source/api/paddlespeech.s2t.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.cli.rst b/docs/source/api/paddlespeech.s2t.training.cli.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.evaluator.rst b/docs/source/api/paddlespeech.s2t.training.extensions.evaluator.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.extension.rst b/docs/source/api/paddlespeech.s2t.training.extensions.extension.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.plot.rst b/docs/source/api/paddlespeech.s2t.training.extensions.plot.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.rst 
b/docs/source/api/paddlespeech.s2t.training.extensions.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.gradclip.rst b/docs/source/api/paddlespeech.s2t.training.gradclip.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.optimizer.rst b/docs/source/api/paddlespeech.s2t.training.optimizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.reporter.rst b/docs/source/api/paddlespeech.s2t.training.reporter.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.rst b/docs/source/api/paddlespeech.s2t.training.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.scheduler.rst b/docs/source/api/paddlespeech.s2t.training.scheduler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.timer.rst b/docs/source/api/paddlespeech.s2t.training.timer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.trainer.rst b/docs/source/api/paddlespeech.s2t.training.trainer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.triggers.compare_value_trigger.rst b/docs/source/api/paddlespeech.s2t.training.triggers.compare_value_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.triggers.interval_trigger.rst b/docs/source/api/paddlespeech.s2t.training.triggers.interval_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.triggers.limit_trigger.rst b/docs/source/api/paddlespeech.s2t.training.triggers.limit_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.triggers.rst b/docs/source/api/paddlespeech.s2t.training.triggers.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.triggers.time_trigger.rst b/docs/source/api/paddlespeech.s2t.training.triggers.time_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.triggers.utils.rst b/docs/source/api/paddlespeech.s2t.training.triggers.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.rst b/docs/source/api/paddlespeech.s2t.training.updaters.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.standard_updater.rst b/docs/source/api/paddlespeech.s2t.training.updaters.standard_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.updater.rst b/docs/source/api/paddlespeech.s2t.training.updaters.updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.asr_utils.rst b/docs/source/api/paddlespeech.s2t.utils.asr_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.bleu_score.rst b/docs/source/api/paddlespeech.s2t.utils.bleu_score.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.check_kwargs.rst b/docs/source/api/paddlespeech.s2t.utils.check_kwargs.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.checkpoint.rst b/docs/source/api/paddlespeech.s2t.utils.checkpoint.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.cli_readers.rst b/docs/source/api/paddlespeech.s2t.utils.cli_readers.rst old mode 100644 new mode 100755 
diff --git a/docs/source/api/paddlespeech.s2t.utils.cli_utils.rst b/docs/source/api/paddlespeech.s2t.utils.cli_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.cli_writers.rst b/docs/source/api/paddlespeech.s2t.utils.cli_writers.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.ctc_utils.rst b/docs/source/api/paddlespeech.s2t.utils.ctc_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.dynamic_import.rst b/docs/source/api/paddlespeech.s2t.utils.dynamic_import.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.dynamic_pip_install.rst b/docs/source/api/paddlespeech.s2t.utils.dynamic_pip_install.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.error_rate.rst b/docs/source/api/paddlespeech.s2t.utils.error_rate.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.layer_tools.rst b/docs/source/api/paddlespeech.s2t.utils.layer_tools.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.log.rst b/docs/source/api/paddlespeech.s2t.utils.log.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.mp_tools.rst b/docs/source/api/paddlespeech.s2t.utils.mp_tools.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.profiler.rst b/docs/source/api/paddlespeech.s2t.utils.profiler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.rst b/docs/source/api/paddlespeech.s2t.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.socket_server.rst b/docs/source/api/paddlespeech.s2t.utils.socket_server.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.spec_augment.rst b/docs/source/api/paddlespeech.s2t.utils.spec_augment.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.tensor_utils.rst b/docs/source/api/paddlespeech.s2t.utils.tensor_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.text_grid.rst b/docs/source/api/paddlespeech.s2t.utils.text_grid.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.s2t.utils.utility.rst b/docs/source/api/paddlespeech.s2t.utils.utility.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.base_commands.rst b/docs/source/api/paddlespeech.server.base_commands.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.bin.paddlespeech_client.rst b/docs/source/api/paddlespeech.server.bin.paddlespeech_client.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.bin.paddlespeech_server.rst b/docs/source/api/paddlespeech.server.bin.paddlespeech_server.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.bin.rst b/docs/source/api/paddlespeech.server.bin.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.rst b/docs/source/api/paddlespeech.server.engine.acs.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.acs.rst b/docs/source/api/paddlespeech.server.engine.acs.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.ctc_endpoint.rst 
b/docs/source/api/paddlespeech.server.engine.asr.online.ctc_endpoint.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.ctc_search.rst b/docs/source/api/paddlespeech.server.engine.asr.online.ctc_search.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.onnx.asr_engine.rst b/docs/source/api/paddlespeech.server.engine.asr.online.onnx.asr_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.onnx.rst b/docs/source/api/paddlespeech.server.engine.asr.online.onnx.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.paddleinference.asr_engine.rst b/docs/source/api/paddlespeech.server.engine.asr.online.paddleinference.asr_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.paddleinference.rst b/docs/source/api/paddlespeech.server.engine.asr.online.paddleinference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.python.asr_engine.rst b/docs/source/api/paddlespeech.server.engine.asr.online.python.asr_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.python.rst b/docs/source/api/paddlespeech.server.engine.asr.online.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.online.rst b/docs/source/api/paddlespeech.server.engine.asr.online.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.paddleinference.asr_engine.rst b/docs/source/api/paddlespeech.server.engine.asr.paddleinference.asr_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.paddleinference.rst b/docs/source/api/paddlespeech.server.engine.asr.paddleinference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.python.asr_engine.rst b/docs/source/api/paddlespeech.server.engine.asr.python.asr_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.python.rst b/docs/source/api/paddlespeech.server.engine.asr.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.asr.rst b/docs/source/api/paddlespeech.server.engine.asr.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.base_engine.rst b/docs/source/api/paddlespeech.server.engine.base_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.cls.paddleinference.cls_engine.rst b/docs/source/api/paddlespeech.server.engine.cls.paddleinference.cls_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.cls.paddleinference.rst b/docs/source/api/paddlespeech.server.engine.cls.paddleinference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.cls.python.cls_engine.rst b/docs/source/api/paddlespeech.server.engine.cls.python.cls_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.cls.python.rst b/docs/source/api/paddlespeech.server.engine.cls.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.cls.rst b/docs/source/api/paddlespeech.server.engine.cls.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.server.engine.engine_factory.rst b/docs/source/api/paddlespeech.server.engine.engine_factory.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.engine_pool.rst b/docs/source/api/paddlespeech.server.engine.engine_pool.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.engine_warmup.rst b/docs/source/api/paddlespeech.server.engine.engine_warmup.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.rst b/docs/source/api/paddlespeech.server.engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.text.python.rst b/docs/source/api/paddlespeech.server.engine.text.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.text.python.text_engine.rst b/docs/source/api/paddlespeech.server.engine.text.python.text_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.text.rst b/docs/source/api/paddlespeech.server.engine.text.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.online.onnx.rst b/docs/source/api/paddlespeech.server.engine.tts.online.onnx.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.online.onnx.tts_engine.rst b/docs/source/api/paddlespeech.server.engine.tts.online.onnx.tts_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.online.python.rst b/docs/source/api/paddlespeech.server.engine.tts.online.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.online.python.tts_engine.rst b/docs/source/api/paddlespeech.server.engine.tts.online.python.tts_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.online.rst b/docs/source/api/paddlespeech.server.engine.tts.online.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.paddleinference.rst b/docs/source/api/paddlespeech.server.engine.tts.paddleinference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.paddleinference.tts_engine.rst b/docs/source/api/paddlespeech.server.engine.tts.paddleinference.tts_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.python.rst b/docs/source/api/paddlespeech.server.engine.tts.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.python.tts_engine.rst b/docs/source/api/paddlespeech.server.engine.tts.python.tts_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.tts.rst b/docs/source/api/paddlespeech.server.engine.tts.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.vector.python.rst b/docs/source/api/paddlespeech.server.engine.vector.python.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.vector.python.vector_engine.rst b/docs/source/api/paddlespeech.server.engine.vector.python.vector_engine.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.engine.vector.rst b/docs/source/api/paddlespeech.server.engine.vector.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.entry.rst 
b/docs/source/api/paddlespeech.server.entry.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.executor.rst b/docs/source/api/paddlespeech.server.executor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.acs_api.rst b/docs/source/api/paddlespeech.server.restful.acs_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.api.rst b/docs/source/api/paddlespeech.server.restful.api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.asr_api.rst b/docs/source/api/paddlespeech.server.restful.asr_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.cls_api.rst b/docs/source/api/paddlespeech.server.restful.cls_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.request.rst b/docs/source/api/paddlespeech.server.restful.request.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.response.rst b/docs/source/api/paddlespeech.server.restful.response.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.rst b/docs/source/api/paddlespeech.server.restful.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.text_api.rst b/docs/source/api/paddlespeech.server.restful.text_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.tts_api.rst b/docs/source/api/paddlespeech.server.restful.tts_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.restful.vector_api.rst b/docs/source/api/paddlespeech.server.restful.vector_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.rst b/docs/source/api/paddlespeech.server.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.tests.asr.offline.http_client.rst b/docs/source/api/paddlespeech.server.tests.asr.offline.http_client.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.tests.asr.offline.rst b/docs/source/api/paddlespeech.server.tests.asr.offline.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.tests.asr.rst b/docs/source/api/paddlespeech.server.tests.asr.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.tests.rst b/docs/source/api/paddlespeech.server.tests.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.util.rst b/docs/source/api/paddlespeech.server.util.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.audio_handler.rst b/docs/source/api/paddlespeech.server.utils.audio_handler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.audio_process.rst b/docs/source/api/paddlespeech.server.utils.audio_process.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.buffer.rst b/docs/source/api/paddlespeech.server.utils.buffer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.config.rst b/docs/source/api/paddlespeech.server.utils.config.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.errors.rst b/docs/source/api/paddlespeech.server.utils.errors.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.server.utils.exception.rst b/docs/source/api/paddlespeech.server.utils.exception.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.onnx_infer.rst b/docs/source/api/paddlespeech.server.utils.onnx_infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.paddle_predictor.rst b/docs/source/api/paddlespeech.server.utils.paddle_predictor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.rst b/docs/source/api/paddlespeech.server.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.util.rst b/docs/source/api/paddlespeech.server.utils.util.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.utils.vad.rst b/docs/source/api/paddlespeech.server.utils.vad.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.ws.api.rst b/docs/source/api/paddlespeech.server.ws.api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.ws.asr_api.rst b/docs/source/api/paddlespeech.server.ws.asr_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.ws.rst b/docs/source/api/paddlespeech.server.ws.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.server.ws.tts_api.rst b/docs/source/api/paddlespeech.server.ws.tts_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.audio.audio.rst b/docs/source/api/paddlespeech.t2s.audio.audio.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.audio.codec.rst b/docs/source/api/paddlespeech.t2s.audio.codec.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.audio.rst b/docs/source/api/paddlespeech.t2s.audio.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.audio.spec_normalizer.rst b/docs/source/api/paddlespeech.t2s.audio.spec_normalizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.am_batch_fn.rst b/docs/source/api/paddlespeech.t2s.datasets.am_batch_fn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.batch.rst b/docs/source/api/paddlespeech.t2s.datasets.batch.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.data_table.rst b/docs/source/api/paddlespeech.t2s.datasets.data_table.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.dataset.rst b/docs/source/api/paddlespeech.t2s.datasets.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.get_feats.rst b/docs/source/api/paddlespeech.t2s.datasets.get_feats.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.ljspeech.rst b/docs/source/api/paddlespeech.t2s.datasets.ljspeech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.preprocess_utils.rst b/docs/source/api/paddlespeech.t2s.datasets.preprocess_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.rst b/docs/source/api/paddlespeech.t2s.datasets.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.datasets.sampler.rst b/docs/source/api/paddlespeech.t2s.datasets.sampler.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.t2s.datasets.vocoder_batch_fn.rst b/docs/source/api/paddlespeech.t2s.datasets.vocoder_batch_fn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.align.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.synthesize_e2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst b/docs/source/api/paddlespeech.t2s.exps.ernie_sat.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.gen_gta_mel.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.gen_gta_mel.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.train.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst b/docs/source/api/paddlespeech.t2s.exps.fastspeech2.vc2_infer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.hifigan.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.hifigan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.hifigan.train.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.hifigan.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.multi_band_melgan.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.multi_band_melgan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.multi_band_melgan.train.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.multi_band_melgan.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.rst 
b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.synthesize_from_wav.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.synthesize_from_wav.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.train.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.parallelwave_gan.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.style_melgan.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.style_melgan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.style_melgan.train.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.style_melgan.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.gan_vocoder.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.inference.rst b/docs/source/api/paddlespeech.t2s.exps.inference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.inference_streaming.rst b/docs/source/api/paddlespeech.t2s.exps.inference_streaming.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ort_predict.rst b/docs/source/api/paddlespeech.t2s.exps.ort_predict.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ort_predict_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.ort_predict_e2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.ort_predict_streaming.rst b/docs/source/api/paddlespeech.t2s.exps.ort_predict_streaming.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.gen_gta_mel.rst b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.gen_gta_mel.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.inference.rst b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.inference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.rst b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.synthesize_e2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.speedyspeech.train.rst 
b/docs/source/api/paddlespeech.t2s.exps.speedyspeech.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.syn_utils.rst b/docs/source/api/paddlespeech.t2s.exps.syn_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.synthesize_e2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.synthesize_streaming.rst b/docs/source/api/paddlespeech.t2s.exps.synthesize_streaming.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.tacotron2.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.tacotron2.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.tacotron2.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.tacotron2.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.tacotron2.rst b/docs/source/api/paddlespeech.t2s.exps.tacotron2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.tacotron2.train.rst b/docs/source/api/paddlespeech.t2s.exps.tacotron2.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.transformer_tts.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.transformer_tts.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.transformer_tts.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.transformer_tts.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.transformer_tts.rst b/docs/source/api/paddlespeech.t2s.exps.transformer_tts.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.transformer_tts.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.transformer_tts.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.transformer_tts.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.transformer_tts.synthesize_e2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.transformer_tts.train.rst b/docs/source/api/paddlespeech.t2s.exps.transformer_tts.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.normalize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.vits.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.rst b/docs/source/api/paddlespeech.t2s.exps.vits.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst b/docs/source/api/paddlespeech.t2s.exps.vits.synthesize_e2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.train.rst b/docs/source/api/paddlespeech.t2s.exps.vits.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst 
b/docs/source/api/paddlespeech.t2s.exps.vits.voice_cloning.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.voice_cloning.rst b/docs/source/api/paddlespeech.t2s.exps.voice_cloning.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.waveflow.config.rst b/docs/source/api/paddlespeech.t2s.exps.waveflow.config.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.waveflow.ljspeech.rst b/docs/source/api/paddlespeech.t2s.exps.waveflow.ljspeech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.waveflow.preprocess.rst b/docs/source/api/paddlespeech.t2s.exps.waveflow.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.waveflow.rst b/docs/source/api/paddlespeech.t2s.exps.waveflow.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.waveflow.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.waveflow.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.waveflow.train.rst b/docs/source/api/paddlespeech.t2s.exps.waveflow.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.wavernn.rst b/docs/source/api/paddlespeech.t2s.exps.wavernn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.wavernn.synthesize.rst b/docs/source/api/paddlespeech.t2s.exps.wavernn.synthesize.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.exps.wavernn.train.rst b/docs/source/api/paddlespeech.t2s.exps.wavernn.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.arpabet.rst b/docs/source/api/paddlespeech.t2s.frontend.arpabet.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.onnx_api.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst b/docs/source/api/paddlespeech.t2s.frontend.g2pw.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.generate_lexicon.rst b/docs/source/api/paddlespeech.t2s.frontend.generate_lexicon.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.mix_frontend.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.normalizer.abbrrviation.rst b/docs/source/api/paddlespeech.t2s.frontend.normalizer.abbrrviation.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.normalizer.acronyms.rst b/docs/source/api/paddlespeech.t2s.frontend.normalizer.acronyms.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.normalizer.normalizer.rst b/docs/source/api/paddlespeech.t2s.frontend.normalizer.normalizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.normalizer.numbers.rst b/docs/source/api/paddlespeech.t2s.frontend.normalizer.numbers.rst old mode 
100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.normalizer.rst b/docs/source/api/paddlespeech.t2s.frontend.normalizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.normalizer.width.rst b/docs/source/api/paddlespeech.t2s.frontend.normalizer.width.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.phonectic.rst b/docs/source/api/paddlespeech.t2s.frontend.phonectic.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.punctuation.rst b/docs/source/api/paddlespeech.t2s.frontend.punctuation.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.tone_sandhi.rst b/docs/source/api/paddlespeech.t2s.frontend.tone_sandhi.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.vocab.rst b/docs/source/api/paddlespeech.t2s.frontend.vocab.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_frontend.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_frontend.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.char_convert.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.char_convert.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.chronology.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.chronology.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.constants.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.constants.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.num.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.num.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.phonecode.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.phonecode.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.quantifier.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.quantifier.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.text_normlization.rst b/docs/source/api/paddlespeech.t2s.frontend.zh_normalization.text_normlization.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.ernie_sat_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.fastspeech2.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.models.fastspeech2.fastspeech2.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.t2s.models.fastspeech2.fastspeech2_updater.rst b/docs/source/api/paddlespeech.t2s.models.fastspeech2.fastspeech2_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.fastspeech2.rst b/docs/source/api/paddlespeech.t2s.models.fastspeech2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.hifigan.hifigan.rst b/docs/source/api/paddlespeech.t2s.models.hifigan.hifigan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.hifigan.hifigan_updater.rst b/docs/source/api/paddlespeech.t2s.models.hifigan.hifigan_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.hifigan.rst b/docs/source/api/paddlespeech.t2s.models.hifigan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.melgan.melgan.rst b/docs/source/api/paddlespeech.t2s.models.melgan.melgan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.melgan.multi_band_melgan_updater.rst b/docs/source/api/paddlespeech.t2s.models.melgan.multi_band_melgan_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.melgan.rst b/docs/source/api/paddlespeech.t2s.models.melgan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.melgan.style_melgan.rst b/docs/source/api/paddlespeech.t2s.models.melgan.style_melgan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.melgan.style_melgan_updater.rst b/docs/source/api/paddlespeech.t2s.models.melgan.style_melgan_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan.rst b/docs/source/api/paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan_updater.rst b/docs/source/api/paddlespeech.t2s.models.parallel_wavegan.parallel_wavegan_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.parallel_wavegan.rst b/docs/source/api/paddlespeech.t2s.models.parallel_wavegan.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.rst b/docs/source/api/paddlespeech.t2s.models.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.speedyspeech.rst b/docs/source/api/paddlespeech.t2s.models.speedyspeech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.speedyspeech.speedyspeech.rst b/docs/source/api/paddlespeech.t2s.models.speedyspeech.speedyspeech.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.speedyspeech.speedyspeech_updater.rst b/docs/source/api/paddlespeech.t2s.models.speedyspeech.speedyspeech_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.tacotron2.rst b/docs/source/api/paddlespeech.t2s.models.tacotron2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.tacotron2.tacotron2.rst b/docs/source/api/paddlespeech.t2s.models.tacotron2.tacotron2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.tacotron2.tacotron2_updater.rst b/docs/source/api/paddlespeech.t2s.models.tacotron2.tacotron2_updater.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.t2s.models.transformer_tts.rst b/docs/source/api/paddlespeech.t2s.models.transformer_tts.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.transformer_tts.transformer_tts.rst b/docs/source/api/paddlespeech.t2s.models.transformer_tts.transformer_tts.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.transformer_tts.transformer_tts_updater.rst b/docs/source/api/paddlespeech.t2s.models.transformer_tts.transformer_tts_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.duration_predictor.rst b/docs/source/api/paddlespeech.t2s.models.vits.duration_predictor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.flow.rst b/docs/source/api/paddlespeech.t2s.models.vits.flow.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.generator.rst b/docs/source/api/paddlespeech.t2s.models.vits.generator.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.posterior_encoder.rst b/docs/source/api/paddlespeech.t2s.models.vits.posterior_encoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.residual_coupling.rst b/docs/source/api/paddlespeech.t2s.models.vits.residual_coupling.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.rst b/docs/source/api/paddlespeech.t2s.models.vits.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.text_encoder.rst b/docs/source/api/paddlespeech.t2s.models.vits.text_encoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.transform.rst b/docs/source/api/paddlespeech.t2s.models.vits.transform.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.vits.rst b/docs/source/api/paddlespeech.t2s.models.vits.vits.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.vits_updater.rst b/docs/source/api/paddlespeech.t2s.models.vits.vits_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.wavenet.residual_block.rst b/docs/source/api/paddlespeech.t2s.models.vits.wavenet.residual_block.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.wavenet.rst b/docs/source/api/paddlespeech.t2s.models.vits.wavenet.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.vits.wavenet.wavenet.rst b/docs/source/api/paddlespeech.t2s.models.vits.wavenet.wavenet.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.waveflow.rst b/docs/source/api/paddlespeech.t2s.models.waveflow.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.wavernn.rst b/docs/source/api/paddlespeech.t2s.models.wavernn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.wavernn.wavernn.rst b/docs/source/api/paddlespeech.t2s.models.wavernn.wavernn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.models.wavernn.wavernn_updater.rst b/docs/source/api/paddlespeech.t2s.models.wavernn.wavernn_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.activation.rst b/docs/source/api/paddlespeech.t2s.modules.activation.rst 
old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.causal_conv.rst b/docs/source/api/paddlespeech.t2s.modules.causal_conv.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.conformer.convolution.rst b/docs/source/api/paddlespeech.t2s.modules.conformer.convolution.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.conformer.encoder_layer.rst b/docs/source/api/paddlespeech.t2s.modules.conformer.encoder_layer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.conformer.rst b/docs/source/api/paddlespeech.t2s.modules.conformer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.conv.rst b/docs/source/api/paddlespeech.t2s.modules.conv.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.geometry.rst b/docs/source/api/paddlespeech.t2s.modules.geometry.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.layer_norm.rst b/docs/source/api/paddlespeech.t2s.modules.layer_norm.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.losses.rst b/docs/source/api/paddlespeech.t2s.modules.losses.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.masked_fill.rst b/docs/source/api/paddlespeech.t2s.modules.masked_fill.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.nets_utils.rst b/docs/source/api/paddlespeech.t2s.modules.nets_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.normalizer.rst b/docs/source/api/paddlespeech.t2s.modules.normalizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.positional_encoding.rst b/docs/source/api/paddlespeech.t2s.modules.positional_encoding.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.pqmf.rst b/docs/source/api/paddlespeech.t2s.modules.pqmf.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.predictor.duration_predictor.rst b/docs/source/api/paddlespeech.t2s.modules.predictor.duration_predictor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.predictor.length_regulator.rst b/docs/source/api/paddlespeech.t2s.modules.predictor.length_regulator.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.predictor.rst b/docs/source/api/paddlespeech.t2s.modules.predictor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.predictor.variance_predictor.rst b/docs/source/api/paddlespeech.t2s.modules.predictor.variance_predictor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.residual_block.rst b/docs/source/api/paddlespeech.t2s.modules.residual_block.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.residual_stack.rst b/docs/source/api/paddlespeech.t2s.modules.residual_stack.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.rst b/docs/source/api/paddlespeech.t2s.modules.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.style_encoder.rst b/docs/source/api/paddlespeech.t2s.modules.style_encoder.rst old mode 100644 new mode 100755 diff --git 
a/docs/source/api/paddlespeech.t2s.modules.tacotron2.attentions.rst b/docs/source/api/paddlespeech.t2s.modules.tacotron2.attentions.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.tacotron2.decoder.rst b/docs/source/api/paddlespeech.t2s.modules.tacotron2.decoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.tacotron2.encoder.rst b/docs/source/api/paddlespeech.t2s.modules.tacotron2.encoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.tacotron2.rst b/docs/source/api/paddlespeech.t2s.modules.tacotron2.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.tade_res_block.rst b/docs/source/api/paddlespeech.t2s.modules.tade_res_block.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.attention.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.attention.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.decoder.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.decoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.decoder_layer.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.decoder_layer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.embedding.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.embedding.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.encoder.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.encoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.encoder_layer.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.encoder_layer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.lightconv.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.lightconv.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.mask.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.mask.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.multi_layer_conv.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.multi_layer_conv.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.positionwise_feed_forward.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.positionwise_feed_forward.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.repeat.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.repeat.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.transformer.subsampling.rst b/docs/source/api/paddlespeech.t2s.modules.transformer.subsampling.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.modules.upsample.rst b/docs/source/api/paddlespeech.t2s.modules.upsample.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.rst b/docs/source/api/paddlespeech.t2s.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.cli.rst 
b/docs/source/api/paddlespeech.t2s.training.cli.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.default_config.rst b/docs/source/api/paddlespeech.t2s.training.default_config.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.experiment.rst b/docs/source/api/paddlespeech.t2s.training.experiment.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.extension.rst b/docs/source/api/paddlespeech.t2s.training.extension.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.extensions.evaluator.rst b/docs/source/api/paddlespeech.t2s.training.extensions.evaluator.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.extensions.rst b/docs/source/api/paddlespeech.t2s.training.extensions.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.extensions.snapshot.rst b/docs/source/api/paddlespeech.t2s.training.extensions.snapshot.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.extensions.visualizer.rst b/docs/source/api/paddlespeech.t2s.training.extensions.visualizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.optimizer.rst b/docs/source/api/paddlespeech.t2s.training.optimizer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.reporter.rst b/docs/source/api/paddlespeech.t2s.training.reporter.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.rst b/docs/source/api/paddlespeech.t2s.training.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.seeding.rst b/docs/source/api/paddlespeech.t2s.training.seeding.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.trainer.rst b/docs/source/api/paddlespeech.t2s.training.trainer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.trigger.rst b/docs/source/api/paddlespeech.t2s.training.trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.triggers.interval_trigger.rst b/docs/source/api/paddlespeech.t2s.training.triggers.interval_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.triggers.limit_trigger.rst b/docs/source/api/paddlespeech.t2s.training.triggers.limit_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.triggers.rst b/docs/source/api/paddlespeech.t2s.training.triggers.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.triggers.time_trigger.rst b/docs/source/api/paddlespeech.t2s.training.triggers.time_trigger.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.updater.rst b/docs/source/api/paddlespeech.t2s.training.updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.updaters.rst b/docs/source/api/paddlespeech.t2s.training.updaters.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.training.updaters.standard_updater.rst b/docs/source/api/paddlespeech.t2s.training.updaters.standard_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.checkpoint.rst b/docs/source/api/paddlespeech.t2s.utils.checkpoint.rst old mode 100644 
new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.display.rst b/docs/source/api/paddlespeech.t2s.utils.display.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.error_rate.rst b/docs/source/api/paddlespeech.t2s.utils.error_rate.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.h5_utils.rst b/docs/source/api/paddlespeech.t2s.utils.h5_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.internals.rst b/docs/source/api/paddlespeech.t2s.utils.internals.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.layer_tools.rst b/docs/source/api/paddlespeech.t2s.utils.layer_tools.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.mp_tools.rst b/docs/source/api/paddlespeech.t2s.utils.mp_tools.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.profiler.rst b/docs/source/api/paddlespeech.t2s.utils.profiler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.rst b/docs/source/api/paddlespeech.t2s.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.t2s.utils.scheduler.rst b/docs/source/api/paddlespeech.t2s.utils.scheduler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.exps.ernie_linear.avg_model.rst b/docs/source/api/paddlespeech.text.exps.ernie_linear.avg_model.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.exps.ernie_linear.punc_restore.rst b/docs/source/api/paddlespeech.text.exps.ernie_linear.punc_restore.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.exps.ernie_linear.rst b/docs/source/api/paddlespeech.text.exps.ernie_linear.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.exps.ernie_linear.test.rst b/docs/source/api/paddlespeech.text.exps.ernie_linear.test.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.exps.ernie_linear.train.rst b/docs/source/api/paddlespeech.text.exps.ernie_linear.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.exps.rst b/docs/source/api/paddlespeech.text.exps.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.ernie_crf.model.rst b/docs/source/api/paddlespeech.text.models.ernie_crf.model.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.ernie_crf.rst b/docs/source/api/paddlespeech.text.models.ernie_crf.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.ernie_linear.dataset.rst b/docs/source/api/paddlespeech.text.models.ernie_linear.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.ernie_linear.ernie_linear.rst b/docs/source/api/paddlespeech.text.models.ernie_linear.ernie_linear.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.ernie_linear.ernie_linear_updater.rst b/docs/source/api/paddlespeech.text.models.ernie_linear.ernie_linear_updater.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.ernie_linear.rst b/docs/source/api/paddlespeech.text.models.ernie_linear.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.text.models.rst b/docs/source/api/paddlespeech.text.models.rst old mode 100644 new 
mode 100755 diff --git a/docs/source/api/paddlespeech.text.rst b/docs/source/api/paddlespeech.text.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.utils.dynamic_import.rst b/docs/source/api/paddlespeech.utils.dynamic_import.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.utils.env.rst b/docs/source/api/paddlespeech.utils.env.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.utils.rst b/docs/source/api/paddlespeech.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.cluster.diarization.rst b/docs/source/api/paddlespeech.vector.cluster.diarization.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.cluster.plda.rst b/docs/source/api/paddlespeech.vector.cluster.plda.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.cluster.rst b/docs/source/api/paddlespeech.vector.cluster.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.audio_processor.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.audio_processor.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.config.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.config.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.dataset_processors.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.dataset_processors.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.inference.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.inference.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.preprocess.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.preprocess.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.random_cycle.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.random_cycle.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.speaker_verification_dataset.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.speaker_verification_dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.ge2e.train.rst b/docs/source/api/paddlespeech.vector.exps.ge2e.train.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.exps.rst b/docs/source/api/paddlespeech.vector.exps.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.augment.rst b/docs/source/api/paddlespeech.vector.io.augment.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.batch.rst b/docs/source/api/paddlespeech.vector.io.batch.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.dataset.rst b/docs/source/api/paddlespeech.vector.io.dataset.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.dataset_from_json.rst b/docs/source/api/paddlespeech.vector.io.dataset_from_json.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.embedding_norm.rst b/docs/source/api/paddlespeech.vector.io.embedding_norm.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.rst 
b/docs/source/api/paddlespeech.vector.io.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.io.signal_processing.rst b/docs/source/api/paddlespeech.vector.io.signal_processing.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.models.ecapa_tdnn.rst b/docs/source/api/paddlespeech.vector.models.ecapa_tdnn.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.models.lstm_speaker_encoder.rst b/docs/source/api/paddlespeech.vector.models.lstm_speaker_encoder.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.models.rst b/docs/source/api/paddlespeech.vector.models.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.modules.layer.rst b/docs/source/api/paddlespeech.vector.modules.layer.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.modules.loss.rst b/docs/source/api/paddlespeech.vector.modules.loss.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.modules.rst b/docs/source/api/paddlespeech.vector.modules.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.modules.sid_model.rst b/docs/source/api/paddlespeech.vector.modules.sid_model.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.rst b/docs/source/api/paddlespeech.vector.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.training.rst b/docs/source/api/paddlespeech.vector.training.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.training.scheduler.rst b/docs/source/api/paddlespeech.vector.training.scheduler.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.training.seeding.rst b/docs/source/api/paddlespeech.vector.training.seeding.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.utils.rst b/docs/source/api/paddlespeech.vector.utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.utils.time.rst b/docs/source/api/paddlespeech.vector.utils.time.rst old mode 100644 new mode 100755 diff --git a/docs/source/api/paddlespeech.vector.utils.vector_utils.rst b/docs/source/api/paddlespeech.vector.utils.vector_utils.rst old mode 100644 new mode 100755 diff --git a/docs/source/asr/PPASR.md b/docs/source/asr/PPASR.md old mode 100644 new mode 100755 diff --git a/docs/source/asr/PPASR_cn.md b/docs/source/asr/PPASR_cn.md old mode 100644 new mode 100755 diff --git a/docs/source/asr/data_preparation.md b/docs/source/asr/data_preparation.md old mode 100644 new mode 100755 diff --git a/docs/source/asr/feature_list.md b/docs/source/asr/feature_list.md old mode 100644 new mode 100755 diff --git a/docs/source/asr/models_introduction.md b/docs/source/asr/models_introduction.md old mode 100644 new mode 100755 diff --git a/docs/source/asr/ngram_lm.md b/docs/source/asr/ngram_lm.md old mode 100644 new mode 100755 diff --git a/docs/source/asr/quick_start.md b/docs/source/asr/quick_start.md old mode 100644 new mode 100755 diff --git a/docs/source/audio/_static/custom.css b/docs/source/audio/_static/custom.css old mode 100644 new mode 100755 diff --git a/docs/source/audio/_templates/module.rst_t b/docs/source/audio/_templates/module.rst_t old mode 100644 new mode 100755 diff --git a/docs/source/audio/_templates/package.rst_t b/docs/source/audio/_templates/package.rst_t old mode 100644 new mode 100755 diff 
--git a/docs/source/audio/_templates/toc.rst_t b/docs/source/audio/_templates/toc.rst_t old mode 100644 new mode 100755 diff --git a/docs/source/audio/conf.py b/docs/source/audio/conf.py old mode 100644 new mode 100755 diff --git a/docs/source/audio/index.rst b/docs/source/audio/index.rst old mode 100644 new mode 100755 diff --git a/docs/source/cls/custom_dataset.md b/docs/source/cls/custom_dataset.md old mode 100644 new mode 100755 index e39dcf12d..b7c06cd7a
--- a/docs/source/cls/custom_dataset.md
+++ b/docs/source/cls/custom_dataset.md
@@ -108,7 +108,7 @@ for epoch in range(1, epochs + 1):
         optimizer.clear_grad()

         # Calculate loss
-        avg_loss = loss.numpy()[0]
+        avg_loss = float(loss)

         # Calculate metrics
         preds = paddle.argmax(logits, axis=1)
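The `custom_dataset.md` hunk above tracks Paddle's move to 0-D (scalar) tensors for reductions: `loss.numpy()[0]` assumes the loss is a shape-`[1]` tensor and raises an `IndexError` once it is 0-D, while `float(loss)` handles both. A minimal sketch of the difference, assuming a recent PaddlePaddle install; the toy logits and labels here are illustrative, not taken from the tutorial:

```python
import paddle

# Toy batch: 4 samples, 10 classes (illustrative values only).
logits = paddle.rand([4, 10])
labels = paddle.randint(0, 10, [4])

# cross_entropy averages over the batch; in recent Paddle releases the
# result is a 0-D tensor, so loss.numpy() has shape () and indexing it
# with [0] fails. float() extracts the scalar in either case.
loss = paddle.nn.functional.cross_entropy(logits, labels)
avg_loss = float(loss)
print(avg_loss)
```

The same `float(...)` pattern works for any scalar metric pulled out of a tensor for logging.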
diff --git a/docs/source/cls/quick_start.md b/docs/source/cls/quick_start.md old mode 100644 new mode 100755 diff --git a/docs/source/conf.py b/docs/source/conf.py old mode 100644 new mode 100755 diff --git a/docs/source/demo_video.rst b/docs/source/demo_video.rst old mode 100644 new mode 100755 diff --git a/docs/source/dependencies.md b/docs/source/dependencies.md old mode 100644 new mode 100755 diff --git a/docs/source/index.rst b/docs/source/index.rst old mode 100644 new mode 100755 diff --git a/docs/source/install.md b/docs/source/install.md old mode 100644 new mode 100755 index 1e6c1c48b..aa3d311c7
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -12,8 +12,8 @@ There are 3 ways to use `PaddleSpeech`. According to the degree of difficulty, t
 - Python >= 3.7
 - PaddlePaddle latest version (please refer to the [Installation Guide](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/index_en.html))
 - C++ compilation environment
-Hip: For Linux and Mac, do not use command `sh` instead of command `bash` in installation document.
-Hip: We recommand you to install `paddlepaddle` from https://mirror.baidu.com/pypi/simple and install `paddlespeech` from https://pypi.tuna.tsinghua.edu.cn/simple.
+- Tip: For Linux and Mac, do not use the command `sh` instead of `bash` in the installation document.
+- Tip: We recommend you to install `paddlepaddle` from https://mirror.baidu.com/pypi/simple and install `paddlespeech` from https://pypi.tuna.tsinghua.edu.cn/simple.

 ## Easy: Get the Basic Function (Support Linux, Mac, and Windows)
 - If you are newer to `PaddleSpeech` and want to experience it easily without your machine. We recommend you to use [AI Studio](https://aistudio.baidu.com/aistudio/index) to experience it. There is a step-by-step [tutorial](https://aistudio.baidu.com/aistudio/education/group/info/25130) for `PaddleSpeech`, and you can use the basic function of `PaddleSpeech` with a free machine.
@@ -188,10 +188,6 @@ conda activate tools/venv
 conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc
 ```
 ### Install PaddlePaddle
-Some users may fail to install `kaldiio` due to the default download source, you can install `pytest-runner` at first;
-```bash
-pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
-```
 Make sure you have GPU and the paddlepaddle version is right. For example, for CUDA 10.2, CuDNN7.6 install paddle 2.4rc:
 ```bash
 # Note, 2.4rc is just an example, please follow the minimum dependency of paddlepaddle for your selection
@@ -202,6 +198,11 @@ You can also install the develop version of paddlepaddle. For example, for CUDA
 python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
 ```
 ### Install PaddleSpeech in Developing Mode
+Some users may fail to install `kaldiio` due to the default download source, you can install `pytest-runner` at first:
+```bash
+pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
+```
+Then install PaddleSpeech:
 ```bash
 pip install -e .[develop] -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md old mode 100644 new mode 100755 index ebc0cf7a2..dd06946f3
--- a/docs/source/install_cn.md
+++ b/docs/source/install_cn.md
@@ -182,6 +182,7 @@ conda install -y -c conda-forge sox libsndfile swig bzip2 libflac bc
 ### 安装 PaddlePaddle
 请确认你系统是否有 GPU,并且使用了正确版本的 paddlepaddle。例如系统使用 CUDA 10.2, CuDNN7.6 ,你可以安装 paddlepaddle-gpu 2.4rc:
 ```bash
+# 注意:2.4rc 只是一个示例,请按照对paddlepaddle的最小依赖进行选择。
 python3 -m pip install paddlepaddle-gpu==2.4.0rc0 -i https://mirror.baidu.com/pypi/simple
 ```
 你也可以安装 develop 版本的PaddlePaddle. 例如系统使用 CUDA 10.2, CuDNN7.6 ,你可以安装 paddlepaddle-gpu develop:
@@ -191,7 +192,6 @@ python3 -m pip install paddlepaddle-gpu==0.0.0.post102 -f https://www.paddlepadd
 ### 用开发者模式安装 PaddleSpeech
 部分用户系统由于默认源的问题,安装中会出现 kaldiio 安转出错的问题,建议首先安装 pytest-runner:
 ```bash
-# 注意:2.4rc 只是一个示例,请按照对paddlepaddle的最小依赖进行选择。
 pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
 ```
 然后安装 PaddleSpeech:
diff --git a/docs/source/introduction.md b/docs/source/introduction.md old mode 100644 new mode 100755 diff --git a/docs/source/reference.md b/docs/source/reference.md old mode 100644 new mode 100755 diff --git a/docs/source/released_model.md b/docs/source/released_model.md old mode 100644 new mode 100755 index 4e76da033..9f0c2bea6
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -9,7 +9,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
 [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python |
 [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python |
 [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python |
-[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python |
+[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python |
 [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python |
 [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python |
 [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python |
@@ -22,7 +22,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
 Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions | CER | WER | Example Link |
 :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: |
 [Wav2vec2-large-960h-lv60-self Model](https://paddlespeech.bj.bcebos.com/wav2vec/wav2vec2-large-960h-lv60-self.pdparams) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | - | 1.18 GB |Pre-trained Wav2vec2.0 Model | - | - | - |
-[Wav2vec2ASR-large-960h-librispeech Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.0.model.tar.gz) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | Librispeech (960 h) | 1.18 GB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | - | 0.0189 | [Wav2vecASR Librispeech ASR3](../../examples/librispeech/asr3) |
+[Wav2vec2ASR-large-960h-librispeech Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.1.model.tar.gz) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | Librispeech (960 h) | 718 MB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | - | 0.0189 | [Wav2vecASR Librispeech ASR3](../../examples/librispeech/asr3) |

 ### Language Model based on NGram
 Language Model | Training Data | Token-based | Size | Descriptions
@@ -40,36 +40,37 @@ Language Model | Training Data | Token-based | Size | Descriptions

 ## Text-to-Speech Models
 ### Acoustic Models
-Model Type | Dataset| Example Link | Pretrained Models|Static/ONNX Models|Size (static)
+Model Type | Dataset| Example Link | Pretrained Models|Static / ONNX / Paddle-Lite Models|Size (static)
 :-------------:| :------------:| :-----: | :-----:| :-----:| :-----:
 Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB| TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)||| -SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)
[speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)|13MB| -FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
[fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)|157MB| +SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)
[speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)
[speedyspeech_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_pdlite_1.3.0.zip)|13MB| +FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
[fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
[fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip)|157MB| FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)||| -FastSpeech2-CNNDecoder| CSMSC| [fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)| [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip) | [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip) | 84MB| -FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip)|[fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)
[fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)|147MB| -FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)
[fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)|145MB| -FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)
[fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) | 145MB| +FastSpeech2-CNNDecoder| CSMSC| [fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)| [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip) | [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)
[fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)
[fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip)| 84MB| +FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip)|[fastspeech2_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_static_1.1.0.zip)
[fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)
[fastspeech2_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_pdlite_1.3.0.zip) |147MB| +FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)
[fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)
[fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip)|145MB| +FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)
[fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip)
[fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip)| 145MB| FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/zh_en_tts/tts3)|[fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)|[fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
[fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) | 145MB| - +FastSpeech2| Male ||[fastspeech2_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_ckpt_1.3.0.zip)| | | ### Vocoders -Model Type | Dataset| Example Link | Pretrained Models| Static/ONNX Models|Size (static) +Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static) :-----:| :-----:| :-----: | :-----:| :-----:| :-----: WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| -Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)
[pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)|4.8MB| -Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|[pwgan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_static_1.1.0.zip)
[pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip)|4.8MB| -Parallel WaveGAN| AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)| [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)
[pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)|4.8MB| -Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|[pwgan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_static_1.1.0.zip)
[pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip)|4.8MB| -|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)
[mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)|7.6MB| +Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)
[pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
[pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip)|4.8MB| +Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)|[pwgan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_static_1.1.0.zip)
[pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip)
[pwgan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_pdlite_1.3.0.zip)|4.8MB| +Parallel WaveGAN| AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)| [pwgan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_static_1.1.0.zip)
[pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)
[pwgan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_pdlite_1.3.0.zip)|4.8MB| +Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)|[pwgan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_static_1.1.0.zip)
[pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip)
[pwgan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_pdlite_1.3.0.zip)|4.8MB| +|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip)
[mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)
[mb_melgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_pdlite_1.3.0.zip)|7.6MB| Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | -HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)
[hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)|46MB| -HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)|[hifigan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_static_1.1.0.zip)
[hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip) |49MB| -HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
[hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)|46MB| -HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
[hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)|46MB| +HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)
[hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip)
[hifigan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_pdlite_1.3.0.zip)|46MB| +HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)|[hifigan_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_static_1.1.0.zip)
[hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip)
[hifigan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_pdlite_1.3.0.zip) |49MB| +HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
[hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)
[hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)|46MB| +HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
[hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)
[hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip)|46MB| WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| +Parallel WaveGAN| Male ||[pwg_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.3.0.zip)||| ### Voice Cloning diff --git a/docs/source/streaming_asr_demo_video.rst b/docs/source/streaming_asr_demo_video.rst old mode 100644 new mode 100755 diff --git a/docs/source/streaming_tts_demo_video.rst b/docs/source/streaming_tts_demo_video.rst old mode 100644 new mode 100755 diff --git a/docs/source/tts/PPTTS.md b/docs/source/tts/PPTTS.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/PPTTS_cn.md b/docs/source/tts/PPTTS_cn.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/README.md b/docs/source/tts/README.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/advanced_usage.md b/docs/source/tts/advanced_usage.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst old mode 100644 new mode 100755 diff --git a/docs/source/tts/demo_2.rst b/docs/source/tts/demo_2.rst old mode 100644 new mode 100755 diff --git a/docs/source/tts/gan_vocoder.md b/docs/source/tts/gan_vocoder.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/models_introduction.md b/docs/source/tts/models_introduction.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/test_sentence.txt b/docs/source/tts/test_sentence.txt old mode 100644 new mode 100755 diff --git a/docs/source/tts/tts_datasets.md b/docs/source/tts/tts_datasets.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md old mode 100644 new mode 100755 diff --git a/docs/source/tts/zh_text_frontend.md b/docs/source/tts/zh_text_frontend.md old mode 100644 new mode 100755 diff --git a/docs/source/tts_demo_video.rst b/docs/source/tts_demo_video.rst old mode 100644 new mode 100755 diff --git a/docs/source/vpr/PPVPR.md b/docs/source/vpr/PPVPR.md old mode 100644 new mode 100755 diff --git a/docs/source/vpr/PPVPR_cn.md b/docs/source/vpr/PPVPR_cn.md old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/ctc_loss.ipynb b/docs/topic/ctc/ctc_loss.ipynb old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/ctc_loss_compare.ipynb b/docs/topic/ctc/ctc_loss_compare.ipynb old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/ctc_loss_speed_compare.ipynb b/docs/topic/ctc/ctc_loss_speed_compare.ipynb old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_alpha_definition.png b/docs/topic/ctc/img/ctc_loss_alpha_definition.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_alpha_recurse.png b/docs/topic/ctc/img/ctc_loss_alpha_recurse.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png b/docs/topic/ctc/img/ctc_loss_alpha_recurse_2.png old mode 100644 new mode 
100755 diff --git a/docs/topic/ctc/img/ctc_loss_backward_1.png b/docs/topic/ctc/img/ctc_loss_backward_1.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_backward_2.png b/docs/topic/ctc/img/ctc_loss_backward_2.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_backward_3.png b/docs/topic/ctc/img/ctc_loss_backward_3.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_backward_recurse.png b/docs/topic/ctc/img/ctc_loss_backward_recurse.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_cat_lattice.png b/docs/topic/ctc/img/ctc_loss_cat_lattice.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_forward_backward.png b/docs/topic/ctc/img/ctc_loss_forward_backward.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png b/docs/topic/ctc/img/ctc_loss_forward_backward_to_loss.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_forward_loss.png b/docs/topic/ctc/img/ctc_loss_forward_loss.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png b/docs/topic/ctc/img/ctc_loss_gradient_of_y_hat.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_gradient_with_y.png b/docs/topic/ctc/img/ctc_loss_gradient_with_y.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_prob_l_x.png b/docs/topic/ctc/img/ctc_loss_prob_l_x.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_prob_pi_x.png b/docs/topic/ctc/img/ctc_loss_prob_pi_x.png old mode 100644 new mode 100755 diff --git a/docs/topic/ctc/img/ctc_loss_rescale_loss.png b/docs/topic/ctc/img/ctc_loss_rescale_loss.png old mode 100644 new mode 100755 diff --git a/docs/topic/frontend/g2p.md b/docs/topic/frontend/g2p.md old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/gan_vocoder.ipynb b/docs/topic/gan_vocoder/gan_vocoder.ipynb old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/hifigan_dis.png b/docs/topic/gan_vocoder/imgs/hifigan_dis.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/hifigan_gen.png b/docs/topic/gan_vocoder/imgs/hifigan_gen.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/mb_melgan.png b/docs/topic/gan_vocoder/imgs/mb_melgan.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/mel_loss.png b/docs/topic/gan_vocoder/imgs/mel_loss.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/melgan.png b/docs/topic/gan_vocoder/imgs/melgan.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/pwg.png b/docs/topic/gan_vocoder/imgs/pwg.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/stft_loss_0.png b/docs/topic/gan_vocoder/imgs/stft_loss_0.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/stft_loss_1.png b/docs/topic/gan_vocoder/imgs/stft_loss_1.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/stft_loss_2.png b/docs/topic/gan_vocoder/imgs/stft_loss_2.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/style_melgan_TADE.png b/docs/topic/gan_vocoder/imgs/style_melgan_TADE.png old mode 100644 new mode 100755 diff --git a/docs/topic/gan_vocoder/imgs/style_melgan_dis.png b/docs/topic/gan_vocoder/imgs/style_melgan_dis.png old mode 100644 new mode 100755 diff --git 
a/docs/topic/gan_vocoder/imgs/style_melgan_gen.png b/docs/topic/gan_vocoder/imgs/style_melgan_gen.png
old mode 100644
new mode 100755
diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md
old mode 100644
new mode 100755
diff --git a/docs/tutorial/.gitkeep b/docs/tutorial/.gitkeep
old mode 100644
new mode 100755
diff --git a/docs/tutorial/asr/tutorial_deepspeech2.ipynb b/docs/tutorial/asr/tutorial_deepspeech2.ipynb
old mode 100644
new mode 100755
diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb
old mode 100644
new mode 100755
diff --git a/docs/tutorial/cls/cls_tutorial.ipynb b/docs/tutorial/cls/cls_tutorial.ipynb
old mode 100644
new mode 100755
index 56b488adc..3cee64991
--- a/docs/tutorial/cls/cls_tutorial.ipynb
+++ b/docs/tutorial/cls/cls_tutorial.ipynb
@@ -509,7 +509,7 @@
 " optimizer.clear_grad()\n",
 "\n",
 " # Calculate loss\n",
-" avg_loss += loss.numpy()[0]\n",
+" avg_loss += float(loss)\n",
 "\n",
 " # Calculate metrics\n",
 " preds = paddle.argmax(logits, axis=1)\n",
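The `avg_loss += float(loss)` change above reflects Paddle's move toward zero-dimensional tensors: on newer Paddle versions the reduced loss comes back 0-D, so `loss.numpy()[0]` (which assumes shape `[1]`) breaks, while `float()` converts either layout. A minimal standalone sketch with illustrative shapes:

```python
import paddle

# cross_entropy with the default "mean" reduction returns a scalar loss;
# on newer Paddle versions this is a 0-D tensor rather than shape [1].
logits = paddle.randn([4, 10])
labels = paddle.randint(0, 10, [4])
loss = paddle.nn.functional.cross_entropy(logits, labels)

# float(loss) handles both layouts; loss.numpy()[0] fails on a 0-D
# tensor because there is no axis left to index.
avg_loss = 0.0
avg_loss += float(loss)
print(avg_loss)
```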
diff --git a/docs/tutorial/st/st_tutorial.ipynb b/docs/tutorial/st/st_tutorial.ipynb
old mode 100644
new mode 100755
diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb
old mode 100644
new mode 100755
diff --git a/examples/aishell/.gitignore b/examples/aishell/.gitignore
old mode 100644
new mode 100755
diff --git a/examples/aishell/README.md b/examples/aishell/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/.gitignore b/examples/aishell/asr0/.gitignore
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/README.md b/examples/aishell/asr0/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/RESULTS.md b/examples/aishell/asr0/RESULTS.md
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/conf/preprocess.yaml b/examples/aishell/asr0/conf/preprocess.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/conf/tuning/chunk_decode.yaml b/examples/aishell/asr0/conf/tuning/chunk_decode.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/conf/tuning/decode.yaml b/examples/aishell/asr0/conf/tuning/decode.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr0/path.sh b/examples/aishell/asr0/path.sh
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/.gitignore b/examples/aishell/asr1/.gitignore
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/README.md b/examples/aishell/asr1/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/augmentation.json b/examples/aishell/asr1/conf/augmentation.json
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/conf/tuning/decode.yaml b/examples/aishell/asr1/conf/tuning/decode.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/path.sh b/examples/aishell/asr1/path.sh
old mode 100644
new mode 100755
diff --git a/examples/aishell/asr1/run.sh b/examples/aishell/asr1/run.sh
old mode 100644
new mode 100755
diff --git a/examples/aishell3/README.md b/examples/aishell3/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3/ernie_sat/README.md b/examples/aishell3/ernie_sat/README.md
old mode 100644
new mode 100755
index 9b7768985..bd5964c3a
--- a/examples/aishell3/ernie_sat/README.md
+++ b/examples/aishell3/ernie_sat/README.md
@@ -1,5 +1,5 @@
 # ERNIE-SAT with AISHELL-3 dataset
-ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
+[ERNIE-SAT](https://arxiv.org/abs/2211.03545) is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
 ## Model Framework
 In ERNIE-SAT, we propose two innovations:
diff --git a/examples/aishell3/ernie_sat/conf/default.yaml b/examples/aishell3/ernie_sat/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
old mode 100644
new mode 100755
index 3e1dee2fb..49801c4c3
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -226,6 +226,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [fastspeech2_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_onnx_1.1.0.zip)
+The Paddle-Lite model can be downloaded here:
+- [fastspeech2_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_pdlite_1.3.0.zip)
+
 FastSpeech2 checkpoint contains files listed below.
```text
diff --git a/examples/aishell3/tts3/conf/conformer.yaml b/examples/aishell3/tts3/conf/conformer.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/tts3/conf/default.yaml b/examples/aishell3/tts3/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/tts3/local/export2lite.sh b/examples/aishell3/tts3/local/export2lite.sh
new file mode 120000
index 000000000..f7719914a
--- /dev/null
+++ b/examples/aishell3/tts3/local/export2lite.sh
@@ -0,0 +1 @@
+../../../csmsc/tts3/local/export2lite.sh
\ No newline at end of file
diff --git a/examples/aishell3/tts3/local/lite_predict.sh b/examples/aishell3/tts3/local/lite_predict.sh
new file mode 100755
index 000000000..e77e8b6c2
--- /dev/null
+++ b/examples/aishell3/tts3/local/lite_predict.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_aishell3 \
+        --voc=pwgan_aishell3 \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_aishell3 \
+        --voc=hifigan_aishell3 \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0
+fi
diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh
index f730f3761..b5da076b2 100755
--- a/examples/aishell3/tts3/run.sh
+++ b/examples/aishell3/tts3/run.sh
@@ -58,3 +58,13 @@ fi
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     ./local/ort_predict.sh ${train_output_path}
 fi
+
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_aishell3 x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_aishell3 x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_aishell3 x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
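With run.sh extended this way, the two new stages can be exercised on their own once the static models exist. A hedged usage example, assuming the script follows the repo's usual `parse_options.sh` convention for overriding `stage`/`stop_stage` from the command line:

```bash
# Run only the Paddle-Lite export (stage 7) and Paddle-Lite inference
# (stage 8) of examples/aishell3/tts3, skipping training and earlier exports.
cd examples/aishell3/tts3
bash run.sh --stage 7 --stop-stage 8
```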
diff --git a/examples/aishell3/vc0/README.md b/examples/aishell3/vc0/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vc0/conf/default.yaml b/examples/aishell3/vc0/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vc1/README.md b/examples/aishell3/vc1/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vc1/conf/default.yaml b/examples/aishell3/vc1/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vc2/README.md b/examples/aishell3/vc2/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vc2/conf/default.yaml b/examples/aishell3/vc2/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vits-vc/README.md b/examples/aishell3/vits-vc/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vits-vc/conf/default.yaml b/examples/aishell3/vits-vc/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vits/README.md b/examples/aishell3/vits/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3/vits/conf/default.yaml b/examples/aishell3/vits/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md
old mode 100644
new mode 100755
index bc25f43cf..467653cbe
--- a/examples/aishell3/voc1/README.md
+++ b/examples/aishell3/voc1/README.md
@@ -139,6 +139,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [pwgan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_onnx_1.1.0.zip)
+The Paddle-Lite model can be downloaded here:
+- [pwgan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_aishell3_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 400000|1.968762|0.759008|0.218524
diff --git a/examples/aishell3/voc1/conf/default.yaml b/examples/aishell3/voc1/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md
old mode 100644
new mode 100755
index 7f99a52e3..7f62ed0d0
--- a/examples/aishell3/voc5/README.md
+++ b/examples/aishell3/voc5/README.md
@@ -122,6 +122,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)
+The Paddle-Lite model can be downloaded here:
+- [hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 2500000|24.060|0.1068|7.499
diff --git a/examples/aishell3/voc5/conf/default.yaml b/examples/aishell3/voc5/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/aishell3_vctk/README.md b/examples/aishell3_vctk/README.md
old mode 100644
new mode 100755
diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md
old mode 100644
new mode 100755
index 321957835..fbf9244d1
--- a/examples/aishell3_vctk/ernie_sat/README.md
+++ b/examples/aishell3_vctk/ernie_sat/README.md
@@ -1,5 +1,5 @@
 # ERNIE-SAT with AISHELL-3 and VCTK dataset
-ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
+[ERNIE-SAT](https://arxiv.org/abs/2211.03545) is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
## Model Framework In ERNIE-SAT, we propose two innovations: diff --git a/examples/aishell3_vctk/ernie_sat/conf/default.yaml b/examples/aishell3_vctk/ernie_sat/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/ami/README.md b/examples/ami/README.md old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/.gitignore b/examples/ami/sd0/.gitignore old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/README.md b/examples/ami/sd0/README.md old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/local/ami_splits.py b/examples/ami/sd0/local/ami_splits.py old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/local/compute_embdding.py b/examples/ami/sd0/local/compute_embdding.py old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/local/dataio.py b/examples/ami/sd0/local/dataio.py old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/path.sh b/examples/ami/sd0/path.sh old mode 100644 new mode 100755 diff --git a/examples/ami/sd0/run.sh b/examples/ami/sd0/run.sh old mode 100644 new mode 100755 diff --git a/examples/callcenter/README.md b/examples/callcenter/README.md old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/.gitignore b/examples/callcenter/asr1/.gitignore old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/RESULTS.md b/examples/callcenter/asr1/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/conf/augmentation.json b/examples/callcenter/asr1/conf/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/conf/chunk_conformer.yaml b/examples/callcenter/asr1/conf/chunk_conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/conf/conformer.yaml b/examples/callcenter/asr1/conf/conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/conf/preprocess.yaml b/examples/callcenter/asr1/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml b/examples/callcenter/asr1/conf/tuning/chunk_decode.yaml old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/conf/tuning/decode.yaml b/examples/callcenter/asr1/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/path.sh b/examples/callcenter/asr1/path.sh old mode 100644 new mode 100755 diff --git a/examples/callcenter/asr1/run.sh b/examples/callcenter/asr1/run.sh old mode 100644 new mode 100755 diff --git a/examples/csmsc/README.md b/examples/csmsc/README.md old mode 100644 new mode 100755 diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md old mode 100644 new mode 100755 diff --git a/examples/csmsc/tts0/conf/default.yaml b/examples/csmsc/tts0/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md old mode 100644 new mode 100755 index f45561719..ec88959d1 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -230,6 +230,9 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip) +The Paddle-Lite model can be downloaded here: +- 
[speedyspeech_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_pdlite_1.3.0.zip)
+
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:|:--------:
diff --git a/examples/csmsc/tts2/conf/default.yaml b/examples/csmsc/tts2/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/csmsc/tts2/local/export2lite.sh b/examples/csmsc/tts2/local/export2lite.sh
new file mode 120000
index 000000000..402fd8334
--- /dev/null
+++ b/examples/csmsc/tts2/local/export2lite.sh
@@ -0,0 +1 @@
+../../tts3/local/export2lite.sh
\ No newline at end of file
diff --git a/examples/csmsc/tts2/local/lite_predict.sh b/examples/csmsc/tts2/local/lite_predict.sh
new file mode 100755
index 000000000..d0c6c0584
--- /dev/null
+++ b/examples/csmsc/tts2/local/lite_predict.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=speedyspeech_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=speedyspeech_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt
+fi
diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh
index 557dd4ff3..1b608992f 100755
--- a/examples/csmsc/tts2/run.sh
+++ b/examples/csmsc/tts2/run.sh
@@ -60,3 +60,15 @@ fi
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     ./local/ort_predict.sh ${train_output_path}
 fi
+
+# must run after stage 3 (the stage that generates the static models)
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    ./local/export2lite.sh ${train_output_path} inference pdlite speedyspeech_csmsc x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
old mode 100644
new mode 100755
index 371034e77..39926259d
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -238,6 +238,12 @@ The ONNX model can be downloaded here:
 - [fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
 - [fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)
+The Paddle-Lite model can be downloaded here:
+> Please compile the develop version of Paddle-Lite to export and run TTS models, because TTS model support was added by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587 and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706
+- [fastspeech2_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_pdlite_1.3.0.zip)
+- [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_pdlite_1.3.0.zip)
+
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
 :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
 default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
old mode 100644
new mode 100755
diff --git a/examples/csmsc/tts3/conf/cnndecoder.yaml b/examples/csmsc/tts3/conf/cnndecoder.yaml
old mode 100644
new mode 100755
diff --git a/examples/csmsc/tts3/conf/conformer.yaml b/examples/csmsc/tts3/conf/conformer.yaml
old mode 100644
new mode 100755
diff --git a/examples/csmsc/tts3/conf/default.yaml b/examples/csmsc/tts3/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/csmsc/tts3/local/export2lite.sh b/examples/csmsc/tts3/local/export2lite.sh
new file mode 100755
index 000000000..c2687ec73
--- /dev/null
+++ b/examples/csmsc/tts3/local/export2lite.sh
@@ -0,0 +1,18 @@
+train_output_path=$1
+model_dir=$2
+output_dir=$3
+model=$4
+valid_targets=$5
+
+model_name=${model%_*}
+echo model_name: ${model_name}
+
+suffix=${valid_targets%,*}
+
+mkdir -p ${train_output_path}/${output_dir}
+
+paddle_lite_opt \
+    --model_file ${train_output_path}/${model_dir}/${model}.pdmodel \
+    --param_file ${train_output_path}/${model_dir}/${model}.pdiparams \
+    --optimize_out ${train_output_path}/${output_dir}/${model}_${suffix} \
+    --valid_targets ${valid_targets}
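For context on what `export2lite.sh` produces: `paddle_lite_opt` fuses the static `*.pdmodel`/`*.pdiparams` pair into a single optimized file named `${model}_${suffix}.nb` under `${train_output_path}/${output_dir}`. Below is a hypothetical sketch of loading such a file with Paddle-Lite's Python API (following its standard demo usage); the file path and tensor shapes are illustrative assumptions, not the actual I/O contract of the repo's `lite_predict.py`:

```python
import numpy as np
# Paddle-Lite's Python inference API (pip install paddlelite).
from paddlelite.lite import MobileConfig, create_paddle_predictor

# Assumed output of export2lite.sh: <train_output_path>/pdlite/<model>_x86.nb
config = MobileConfig()
config.set_model_from_file("exp/default/pdlite/pwgan_csmsc_x86.nb")
predictor = create_paddle_predictor(config)

# Feed a fake mel spectrogram; in the real pipeline the acoustic model
# produces this tensor (the shape here is a placeholder).
mel = np.random.randn(100, 80).astype("float32")
input_tensor = predictor.get_input(0)
input_tensor.from_numpy(mel)
predictor.run()

wav = predictor.get_output(0).numpy()
print(wav.shape)
```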
diff --git a/examples/csmsc/tts3/local/lite_predict.sh b/examples/csmsc/tts3/local/lite_predict.sh
new file mode 100755
index 000000000..1ed2f108d
--- /dev/null
+++ b/examples/csmsc/tts3/local/lite_predict.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_csmsc \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_csmsc \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../lite_predict.py \
+        --inference_dir=${train_output_path}/pdlite \
+        --am=fastspeech2_csmsc \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out \
+        --phones_dict=dump/phone_id_map.txt
+fi
diff --git a/examples/csmsc/tts3/local/lite_predict_streaming.sh b/examples/csmsc/tts3/local/lite_predict_streaming.sh
new file mode 100755
index 000000000..4570cb4eb
--- /dev/null
+++ b/examples/csmsc/tts3/local/lite_predict_streaming.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../lite_predict_streaming.py \
+        --inference_dir=${train_output_path}/pdlite_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../lite_predict_streaming.py \
+        --inference_dir=${train_output_path}/pdlite_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../lite_predict_streaming.py \
+        --inference_dir=${train_output_path}/pdlite_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/lite_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
diff --git a/examples/csmsc/tts3/local/simple.lexicon b/examples/csmsc/tts3/local/simple.lexicon
old mode 100644
new mode 100755
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
index 80acf8200..14308af4e 100755
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@@ -61,3 +61,18 @@ fi
 if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     ./local/ort_predict.sh ${train_output_path}
 fi
+
+# must run after stage 3 (the stage that generates the static models)
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    # NOTE by yuantian 2022.11.21: please compile the develop version of Paddle-Lite to export and run TTS models,
+    # because TTS model support was added by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587
+    # and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706
+    ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_csmsc x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh
index bae833157..8cc9c5da2 100755
--- a/examples/csmsc/tts3/run_cnndecoder.sh
+++ b/examples/csmsc/tts3/run_cnndecoder.sh
@@ -75,7 +75,6 @@ if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
 fi
 
 # paddle2onnx streaming
-
 if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
     # install paddle2onnx
     version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
@@ -97,3 +96,29 @@ if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
     ./local/ort_predict_streaming.sh ${train_output_path}
 fi
 
+# must run after stage 3 (the stage that generates the static models)
+if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ]; then
+    ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_csmsc x86
+    ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite mb_melgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_csmsc x86
+fi
+
+if [ ${stage} -le 12 ] && [ ${stop_stage} -ge 12 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
+
+# must run after stage 5 (the stage that generates the streaming static models)
+if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
+    # streaming acoustic model
+    ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming fastspeech2_csmsc_am_encoder_infer x86
+    ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming fastspeech2_csmsc_am_decoder x86
+    ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming fastspeech2_csmsc_am_postnet x86
+    ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming pwgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming mb_melgan_csmsc x86
+    # ./local/export2lite.sh ${train_output_path} inference_streaming pdlite_streaming hifigan_csmsc x86
+fi
+
+if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict_streaming.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
old mode 100644
new mode 100755
diff --git a/examples/csmsc/vits/conf/default.yaml b/examples/csmsc/vits/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md
old mode 100644
new mode 100755
index 4646a0345..252c2b920
--- a/examples/csmsc/voc1/README.md
+++ b/examples/csmsc/voc1/README.md
@@ -136,6 +136,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [pwgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_onnx_0.2.0.zip)
+The Paddle-Lite model can be downloaded here:
+- [pwgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_csmsc_pdlite_1.3.0.zip)
+
 Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss| eval/spectral_convergence_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:
 default| 1(gpu) x 400000|1.948763|0.670098|0.248882
diff --git a/examples/csmsc/voc1/conf/default.yaml b/examples/csmsc/voc1/conf/default.yaml
old mode 100644
new mode 100755
diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md
old mode 100644
new mode 100755
index 09fb8836c..f2a1eef7f
--- a/examples/csmsc/voc3/README.md
+++ b/examples/csmsc/voc3/README.md
@@ -164,6 +164,9 @@ The static model can be downloaded here:
 The ONNX model can be downloaded here:
 - [mb_melgan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip)
+The Paddle-Lite model can be downloaded here:
[mb_melgan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_pdlite_1.3.0.zip) + Model | Step | eval/generator_loss | eval/log_stft_magnitude_loss|eval/spectral_convergence_loss |eval/sub_log_stft_magnitude_loss|eval/sub_spectral_convergence_loss :-------------:| :------------:| :-----: | :-----: | :--------:| :--------:| :--------: default| 1(gpu) x 1000000| 2.4851|0.71778 |0.2761 |0.66334 |0.2777| diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc4/conf/default.yaml b/examples/csmsc/voc4/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md old mode 100644 new mode 100755 index ef552fd30..3347c6473 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -121,6 +121,9 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [hifigan_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip) +The Paddle-Lite model can be downloaded here: +- [hifigan_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_pdlite_1.3.0.zip) + Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss :-------------:| :------------:| :-----: | :-----: | :--------: default| 1(gpu) x 2500000|24.927|0.1262|7.554 diff --git a/examples/csmsc/voc5/conf/default.yaml b/examples/csmsc/voc5/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc5/conf/finetune.yaml b/examples/csmsc/voc5/conf/finetune.yaml old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md old mode 100644 new mode 100755 diff --git a/examples/csmsc/voc6/conf/default.yaml b/examples/csmsc/voc6/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/esc50/README.md b/examples/esc50/README.md old mode 100644 new mode 100755 diff --git a/examples/esc50/RESULTS.md b/examples/esc50/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/esc50/cls0/conf/panns.yaml b/examples/esc50/cls0/conf/panns.yaml old mode 100644 new mode 100755 diff --git a/examples/esc50/cls0/path.sh b/examples/esc50/cls0/path.sh old mode 100644 new mode 100755 diff --git a/examples/hey_snips/README.md b/examples/hey_snips/README.md old mode 100644 new mode 100755 index ba263906a..6311ad928 --- a/examples/hey_snips/README.md +++ b/examples/hey_snips/README.md @@ -2,7 +2,7 @@ ## Metrics We mesure FRRs with fixing false alarms in one hour: - +The released model can be downloaded here: https://paddlespeech.bj.bcebos.com/kws/heysnips/kws0_mdtc_heysnips_ckpt.tar.gz |Model|False Alarm| False Reject Rate| |--|--|--| |MDTC| 1| 0.003559 | diff --git a/examples/hey_snips/kws0/README.md b/examples/hey_snips/kws0/README.md old mode 100644 new mode 100755 diff --git a/examples/hey_snips/kws0/conf/mdtc.yaml b/examples/hey_snips/kws0/conf/mdtc.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/README.md b/examples/iwslt2012/punc0/README.md old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/RESULTS.md b/examples/iwslt2012/punc0/RESULTS.md old mode
100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/conf/default.yaml b/examples/iwslt2012/punc0/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-base.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-medium.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-mini.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml b/examples/iwslt2012/punc0/conf/ernie-3.0-nano-zh.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/conf/ernie-tiny.yaml b/examples/iwslt2012/punc0/conf/ernie-tiny.yaml old mode 100644 new mode 100755 diff --git a/examples/iwslt2012/punc0/local/preprocess.py b/examples/iwslt2012/punc0/local/preprocess.py old mode 100644 new mode 100755 diff --git a/examples/librispeech/.gitignore b/examples/librispeech/.gitignore old mode 100644 new mode 100755 diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/README.md b/examples/librispeech/asr0/README.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/RESULTS.md b/examples/librispeech/asr0/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/conf/preprocess.yaml b/examples/librispeech/asr0/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr0/conf/tuning/chunk_decode.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/conf/tuning/decode.yaml b/examples/librispeech/asr0/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr0/path.sh b/examples/librispeech/asr0/path.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/.gitignore b/examples/librispeech/asr1/.gitignore old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/README.md b/examples/librispeech/asr1/README.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/RESULTS.md b/examples/librispeech/asr1/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/cmd.sh b/examples/librispeech/asr1/cmd.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/augmentation.json b/examples/librispeech/asr1/conf/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/chunk_conformer.yaml b/examples/librispeech/asr1/conf/chunk_conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/chunk_transformer.yaml b/examples/librispeech/asr1/conf/chunk_transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/conformer.yaml b/examples/librispeech/asr1/conf/conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/preprocess.yaml b/examples/librispeech/asr1/conf/preprocess.yaml old mode 100644 new mode 
100755 diff --git a/examples/librispeech/asr1/conf/transformer.yaml b/examples/librispeech/asr1/conf/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml b/examples/librispeech/asr1/conf/tuning/chunk_decode.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/conf/tuning/decode.yaml b/examples/librispeech/asr1/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr1/path.sh b/examples/librispeech/asr1/path.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/.gitignore b/examples/librispeech/asr2/.gitignore old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/README.md b/examples/librispeech/asr2/README.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/RESULTS.md b/examples/librispeech/asr2/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/cmd.sh b/examples/librispeech/asr2/cmd.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/augmentation.json b/examples/librispeech/asr2/conf/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/decode/decode.yaml b/examples/librispeech/asr2/conf/decode/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/decode/decode_att.yaml b/examples/librispeech/asr2/conf/decode/decode_att.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/decode/decode_base.yaml b/examples/librispeech/asr2/conf/decode/decode_base.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/decode/decode_ctc.yaml b/examples/librispeech/asr2/conf/decode/decode_ctc.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml b/examples/librispeech/asr2/conf/decode/decode_wo_lm.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/fbank.conf b/examples/librispeech/asr2/conf/fbank.conf old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/lm/transformer.yaml b/examples/librispeech/asr2/conf/lm/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/pitch.conf b/examples/librispeech/asr2/conf/pitch.conf old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/preprocess.yaml b/examples/librispeech/asr2/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/conf/transformer.yaml b/examples/librispeech/asr2/conf/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr2/path.sh b/examples/librispeech/asr2/path.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/README.md b/examples/librispeech/asr3/README.md old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/RESULTS.md b/examples/librispeech/asr3/RESULTS.md old mode 100644 new mode 100755 index 1c5626d9e..27a87e137 --- a/examples/librispeech/asr3/RESULTS.md +++ b/examples/librispeech/asr3/RESULTS.md @@ -1,8 +1,8 @@ # LibriSpeech ## Wav2VecASR -train: Epoch 1, 1*V100-32G, batchsize:10 +train: Epoch 1, 1*V100-32G, batchsize: 6 | Model | Params | Config | Augmentation| Test set | Decode method | WER | | --- | --- | --- | --- | --- | --- | --- | -| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 | +| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | 
greedy search | 0.018906 | diff --git a/examples/librispeech/asr3/cmd.sh b/examples/librispeech/asr3/cmd.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/conf/preprocess.yaml b/examples/librispeech/asr3/conf/preprocess.yaml old mode 100644 new mode 100755 index 4a908a83b..724782ed6 --- a/examples/librispeech/asr3/conf/preprocess.yaml +++ b/examples/librispeech/asr3/conf/preprocess.yaml @@ -1,4 +1,3 @@ process: # use raw audio - type: wav_process - dither: 0.0 diff --git a/examples/librispeech/asr3/conf/tuning/decode.yaml b/examples/librispeech/asr3/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/conf/wav2vec2ASR.yaml b/examples/librispeech/asr3/conf/wav2vec2ASR.yaml old mode 100644 new mode 100755 index b19881b70..1ce2d94db --- a/examples/librispeech/asr3/conf/wav2vec2ASR.yaml +++ b/examples/librispeech/asr3/conf/wav2vec2ASR.yaml @@ -4,16 +4,21 @@ freeze_wav2vec2: True normalize_wav: True output_norm: True -dnn_blocks: 2 -dnn_neurons: 1024 -blank_id: 0 -ctc_dropout_rate: 0.0 +init_type: 'kaiming_uniform' # !Warning: needed for convergence +enc: + input_shape: 1024 + dnn_blocks: 2 + dnn_neurons: 1024 + activation: True +ctc: + enc_n_units: 1024 + blank_id: 0 + dropout_rate: 0.0 wav2vec2_params_path: "exp/wav2vec2/wav2vec2-large-960h-lv60-self.pdparams" ############################################ # Wav2Vec2.0 # ############################################ -vocab_size: 32 hidden_size: 1024 num_hidden_layers: 24 num_attention_heads: 16 @@ -54,9 +59,6 @@ diversity_loss_weight: 0.1 ctc_loss_reduction: "sum" ctc_zero_infinity: False use_weighted_layer_sum: False -pad_token_id: 0 -bos_token_id: 1 -eos_token_id: 2 add_adapter: False adapter_kernel_size: 3 adapter_stride: 2 @@ -70,7 +72,6 @@ train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test-clean - ########################################### # Dataloader # ########################################### @@ -79,7 +80,7 @@ unit_type: 'char' mean_std_filepath: "" preprocess_config: conf/preprocess.yaml sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs 0: disabled other: enabled for 'other' epochs -batch_size: 10 # Different batch_size may cause large differences in results +batch_size: 6 # Different batch_size may cause large differences in results maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced minibatches: 0 # for debug @@ -95,26 +96,38 @@ dist_sampler: True shortest_first: True return_lens_rate: True +############################################ +# Data Augmentation # +############################################ +audio_augment: # for raw audio + sample_rate: 16000 + speeds: [95, 100, 105] ########################################### # Training # ########################################### n_epoch: 1 accum_grad: 1 -global_grad_clip: 3.0 +global_grad_clip: 5.0 model_optim: adadelta model_optim_conf: lr: 0.9 epsilon: 1.0e-6 rho: 0.95 -scheduler: constantlr -scheduler_conf: +model_scheduler: constantlr +model_scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +wav2vec2_optim: adadelta +wav2vec2_optim_conf: + lr: 0.9 + epsilon: 1.0e-6 + rho: 0.95 +wav2vec2_scheduler: constantlr +wav2vec2_scheduler_conf: warmup_steps: 25000 lr_decay: 1.0 log_interval: 1 checkpoint: kbest_n: 50 - latest_n: 5 -augment: True - - + latest_n: 5 \ No newline at end of file diff --git
a/examples/librispeech/asr3/local/data.sh b/examples/librispeech/asr3/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/local/test.sh b/examples/librispeech/asr3/local/test.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/local/test_wav.sh b/examples/librispeech/asr3/local/test_wav.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/local/train.sh b/examples/librispeech/asr3/local/train.sh old mode 100644 new mode 100755 index 6913ed17e..24776fd17 --- a/examples/librispeech/asr3/local/train.sh +++ b/examples/librispeech/asr3/local/train.sh @@ -10,7 +10,8 @@ echo "using $ngpu gpus..." config_path=$1 ckpt_name=$2 -ips=$3 +resume=$3 +ips=$4 if [ ! $ips ];then ips_config= @@ -21,7 +22,7 @@ fi mkdir -p exp # seed may break model convergence -seed=1998 +seed=1988 if [ ${seed} != 0 ]; then export FLAGS_cudnn_deterministic=True fi @@ -34,13 +35,15 @@ python3 -u ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ ---seed ${seed} +--seed ${seed} \ +--resume ${resume} else python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \ --ngpu ${ngpu} \ --config ${config_path} \ --output exp/${ckpt_name} \ ---seed ${seed} +--seed ${seed} \ +--resume ${resume} fi if [ ${seed} != 0 ]; then diff --git a/examples/librispeech/asr3/path.sh b/examples/librispeech/asr3/path.sh old mode 100644 new mode 100755 diff --git a/examples/librispeech/asr3/run.sh b/examples/librispeech/asr3/run.sh old mode 100644 new mode 100755 index 3b1abb11b..05ad505c7 --- a/examples/librispeech/asr3/run.sh +++ b/examples/librispeech/asr3/run.sh @@ -11,7 +11,7 @@ conf_path=conf/wav2vec2ASR.yaml ips= #xx.xx.xx.xx,xx.xx.xx.xx decode_conf_path=conf/tuning/decode.yaml avg_num=1 -dict_path=data/lang_char/vocab.txt +resume= # xx e.g. 30 . 
${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -28,7 +28,7 @@ fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips} + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${resume} ${ips} fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then @@ -38,10 +38,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # greedy search decoder - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # test a single .wav file - CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi diff --git a/examples/ljspeech/README.md b/examples/ljspeech/README.md old mode 100644 new mode 100755 diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md old mode 100644 new mode 100755 diff --git a/examples/ljspeech/tts0/conf/default.yaml b/examples/ljspeech/tts0/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md old mode 100644 new mode 100755 diff --git a/examples/ljspeech/tts1/conf/default.yaml b/examples/ljspeech/tts1/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md old mode 100644 new mode 100755 index d786c1571..23b433d4e --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -221,6 +221,9 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip) +The Paddle-Lite model can be downloaded here: +- [fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip) + Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/ljspeech/tts3/conf/default.yaml b/examples/ljspeech/tts3/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/ljspeech/tts3/local/export2lite.sh b/examples/ljspeech/tts3/local/export2lite.sh new file mode 120000 index 000000000..f7719914a --- /dev/null +++ b/examples/ljspeech/tts3/local/export2lite.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/export2lite.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/lite_predict.sh b/examples/ljspeech/tts3/local/lite_predict.sh new file mode 100755 index 000000000..75db6a0ea --- /dev/null +++ b/examples/ljspeech/tts3/local/lite_predict.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../lite_predict.py \ + --inference_dir=${train_output_path}/pdlite \ + --am=fastspeech2_ljspeech \ + --voc=pwgan_ljspeech \ + --text=${BIN_DIR}/../sentences_en.txt \ + 
--output_dir=${train_output_path}/lite_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --lang=en +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../lite_predict.py \ + --inference_dir=${train_output_path}/pdlite \ + --am=fastspeech2_ljspeech \ + --voc=hifigan_ljspeech \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/lite_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --lang=en +fi diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh index 956185935..aacd4cc03 100755 --- a/examples/ljspeech/tts3/run.sh +++ b/examples/ljspeech/tts3/run.sh @@ -59,3 +59,14 @@ fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then ./local/ort_predict.sh ${train_output_path} fi + +# must run after stage 3 (the stage that generates the static models) +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_ljspeech x86 + ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_ljspeech x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_ljspeech x86 +fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1 +fi \ No newline at end of file diff --git a/examples/ljspeech/voc0/README.md b/examples/ljspeech/voc0/README.md old mode 100644 new mode 100755 diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md old mode 100644 new mode 100755 index ad6cd2982..a7ac2af41 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -136,6 +136,9 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [pwgan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_onnx_1.1.0.zip) +The Paddle-Lite model can be downloaded here: +- [pwgan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_ljspeech_pdlite_1.3.0.zip) + Parallel WaveGAN checkpoint contains files listed below.
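The stage 7 blocks added to these `run.sh` files all call `local/export2lite.sh` (symlinked from `examples/csmsc/tts3/local/export2lite.sh`), whose body is not shown in this diff. Conceptually it wraps Paddle-Lite's `opt` converter: it takes the static `*.pdmodel`/`*.pdiparams` pair written by the earlier export stage and emits a model the lite runtime can load. The following is a minimal sketch of that idea, not the script's actual body; the argument order mirrors the `run.sh` calls above, and the output naming and `paddle_lite_opt` flags are assumptions that should be checked against your installed Paddle-Lite version:

```bash
#!/bin/bash
# Sketch: convert a static Paddle model to a Paddle-Lite model.
# Mirrors the call: ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_csmsc x86
train_output_path=$1  # e.g. exp/default
model_dir=$2          # static models from the export stage, e.g. inference
output_dir=$3         # e.g. pdlite
model=$4              # e.g. fastspeech2_csmsc
valid_targets=$5      # e.g. x86

mkdir -p ${train_output_path}/${output_dir}
# paddle_lite_opt ships with the paddlelite pip package.
paddle_lite_opt \
    --model_file=${train_output_path}/${model_dir}/${model}.pdmodel \
    --param_file=${train_output_path}/${model_dir}/${model}.pdiparams \
    --optimize_out=${train_output_path}/${output_dir}/${model}_${valid_targets} \
    --valid_targets=${valid_targets}
```

This is also why the "must run after stage 3" comments exist: stage 3 is the step that writes the static `inference/` directory the converter reads, and stage 5 does the same for the streaming `inference_streaming/` models.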
diff --git a/examples/ljspeech/voc1/conf/default.yaml b/examples/ljspeech/voc1/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md old mode 100644 new mode 100755 index eaa51e507..65fa53267 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -121,6 +121,8 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [hifigan_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_onnx_1.1.0.zip) +The Paddle-Lite model can be downloaded here: +- [hifigan_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_pdlite_1.3.0.zip) Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/ljspeech/voc5/conf/default.yaml b/examples/ljspeech/voc5/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/cmd.sh b/examples/mustc/st1/cmd.sh old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/fbank.conf b/examples/mustc/st1/conf/fbank.conf old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/pitch.conf b/examples/mustc/st1/conf/pitch.conf old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_de.yaml b/examples/mustc/st1/conf/transformer_de.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_es.yaml b/examples/mustc/st1/conf/transformer_es.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_fr.yaml b/examples/mustc/st1/conf/transformer_fr.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_it.yaml b/examples/mustc/st1/conf/transformer_it.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_nl.yaml b/examples/mustc/st1/conf/transformer_nl.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_pt.yaml b/examples/mustc/st1/conf/transformer_pt.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_ro.yaml b/examples/mustc/st1/conf/transformer_ro.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/conf/transformer_ru.yaml b/examples/mustc/st1/conf/transformer_ru.yaml old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/local/augmentation.json b/examples/mustc/st1/local/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/local/data_prep.sh b/examples/mustc/st1/local/data_prep.sh old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/local/divide_lang.sh b/examples/mustc/st1/local/divide_lang.sh old mode 100644 new mode 100755 diff --git a/examples/mustc/st1/path.sh b/examples/mustc/st1/path.sh old mode 100644 new mode 100755 diff --git a/examples/other/augmentation/augmentation.json b/examples/other/augmentation/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/other/cc-cedict/.gitignore b/examples/other/cc-cedict/.gitignore old mode 100644 new mode 100755 diff --git a/examples/other/cc-cedict/README.md b/examples/other/cc-cedict/README.md old mode 100644 new mode 100755 diff --git a/examples/other/cc-cedict/local/parser.py b/examples/other/cc-cedict/local/parser.py old mode 100644 new mode 100755 diff --git a/examples/other/cc-cedict/path.sh b/examples/other/cc-cedict/path.sh old mode 100644 new mode 
100755 diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md old mode 100644 new mode 100755 diff --git a/examples/other/g2p/compare_badcase.py b/examples/other/g2p/compare_badcase.py old mode 100644 new mode 100755 diff --git a/examples/other/g2p/get_g2p_data.py b/examples/other/g2p/get_g2p_data.py old mode 100644 new mode 100755 diff --git a/examples/other/g2p/test_g2p.py b/examples/other/g2p/test_g2p.py old mode 100644 new mode 100755 diff --git a/examples/other/ge2e/README.md b/examples/other/ge2e/README.md old mode 100644 new mode 100755 diff --git a/examples/other/mfa/README.md b/examples/other/mfa/README.md old mode 100644 new mode 100755 index c24524ab4..216d1275b --- a/examples/other/mfa/README.md +++ b/examples/other/mfa/README.md @@ -4,3 +4,6 @@ Run the following script to get started, for more detail, please see `run.sh`. ```bash ./run.sh ``` +# Rhythm tags for MFA +If you want to get rhythm tags with duration through the MFA tool, you can add the flag `--rhy-with-duration` to the first two commands in `run.sh`. +Note that only the CSMSC dataset is supported so far, and we replace `#` with `sp` in rhythm tags for MFA. diff --git a/examples/other/mfa/local/detect_oov.py b/examples/other/mfa/local/detect_oov.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py old mode 100644 new mode 100755 index e9445665b..3deb24701 --- a/examples/other/mfa/local/generate_lexicon.py +++ b/examples/other/mfa/local/generate_lexicon.py @@ -182,12 +182,17 @@ if __name__ == "__main__": "--with-tone", action="store_true", help="whether to consider tone.") parser.add_argument( "--with-r", action="store_true", help="whether to consider erhua.") + parser.add_argument( + "--rhy-with-duration", + action="store_true", ) args = parser.parse_args() lexicon = generate_lexicon(args.with_tone, args.with_r) symbols = generate_symbols(lexicon) with open(args.output + ".lexicon", 'wt') as f: + if args.rhy_with_duration: + f.write("sp1 sp1\nsp2 sp2\nsp3 sp3\nsp4 sp4\n") for k, v in lexicon.items(): f.write(f"{k} {v}\n") diff --git a/examples/other/mfa/local/reorganize_aishell3.py b/examples/other/mfa/local/reorganize_aishell3.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py old mode 100644 new mode 100755 index 153e01d13..0e0035bda --- a/examples/other/mfa/local/reorganize_baker.py +++ b/examples/other/mfa/local/reorganize_baker.py @@ -23,6 +23,7 @@ for more details.
""" import argparse import os +import re import shutil from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -32,6 +33,22 @@ import librosa import soundfile as sf from tqdm import tqdm +repalce_dict = { + ";": "", + "。": "", + ":": "", + "—": "", + ")": "", + ",": "", + "“": "", + "(": "", + "、": "", + "…": "", + "!": "", + "?": "", + "”": "" +} + def get_transcripts(path: Union[str, Path]): transcripts = {} @@ -55,9 +72,13 @@ def resample_and_save(source, target, sr=16000): def reorganize_baker(root_dir: Union[str, Path], output_dir: Union[str, Path]=None, - resample_audio=False): + resample_audio=False, + rhy_dur=False): root_dir = Path(root_dir).expanduser() - transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" + if rhy_dur: + transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt" + else: + transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" transcriptions = get_transcripts(transcript_path) wave_dir = root_dir / "Wave" @@ -92,6 +113,46 @@ def reorganize_baker(root_dir: Union[str, Path], print("Done!") +def insert_rhy(sentence_first, sentence_second): + sub = '#' + return_words = [] + sentence_first = sentence_first.translate(str.maketrans(repalce_dict)) + rhy_idx = [substr.start() for substr in re.finditer(sub, sentence_first)] + re_rhy_idx = [] + sentence_first_ = sentence_first.replace("#1", "").replace( + "#2", "").replace("#3", "").replace("#4", "") + sentence_seconds = sentence_second.split(" ") + for i, w in enumerate(rhy_idx): + re_rhy_idx.append(w - i * 2) + i = 0 + # print("re_rhy_idx: ", re_rhy_idx) + for sentence_s in (sentence_seconds): + return_words.append(sentence_s) + if i < len(re_rhy_idx) and len(return_words) - i == re_rhy_idx[i]: + return_words.append("sp" + sentence_first[rhy_idx[i] + 1:rhy_idx[i] + + 2]) + i = i + 1 + return return_words + + +def normalize_rhy(root_dir: Union[str, Path]): + root_dir = Path(root_dir).expanduser() + transcript_path = root_dir / "ProsodyLabeling" / "000001-010000.txt" + target_transcript_path = root_dir / "ProsodyLabeling" / "000001-010000_rhy.txt" + + with open(transcript_path) as f: + lines = f.readlines() + + with open(target_transcript_path, 'wt') as f: + for i in range(0, len(lines), 2): + sentence_first = lines[i] #第一行直接保存 + f.write(sentence_first) + transcription = lines[i + 1].strip() + f.write("\t" + " ".join( + insert_rhy(sentence_first.split('\t')[1], transcription)) + + "\n") + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Reorganize Baker dataset for MFA") @@ -104,6 +165,12 @@ if __name__ == "__main__": "--resample-audio", action="store_true", help="To resample audio files or just copy them") + parser.add_argument( + "--rhy-with-duration", + action="store_true", ) args = parser.parse_args() - reorganize_baker(args.root_dir, args.output_dir, args.resample_audio) + if args.rhy_with_duration: + normalize_rhy(args.root_dir) + reorganize_baker(args.root_dir, args.output_dir, args.resample_audio, + args.rhy_with_duration) diff --git a/examples/other/mfa/local/reorganize_ljspeech.py b/examples/other/mfa/local/reorganize_ljspeech.py old mode 100644 new mode 100755 diff --git a/examples/other/mfa/local/reorganize_vctk.py b/examples/other/mfa/local/reorganize_vctk.py old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/.gitignore b/examples/other/ngram_lm/.gitignore old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/README.md b/examples/other/ngram_lm/README.md old mode 100644 new mode 100755 
diff --git a/examples/other/ngram_lm/s0/.gitignore b/examples/other/ngram_lm/s0/.gitignore old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/README.md b/examples/other/ngram_lm/s0/README.md old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/data/README.md b/examples/other/ngram_lm/s0/data/README.md old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/data/custom_confusion.txt b/examples/other/ngram_lm/s0/data/custom_confusion.txt old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/data/text_correct.txt b/examples/other/ngram_lm/s0/data/text_correct.txt old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/local/build_zh_lm.sh b/examples/other/ngram_lm/s0/local/build_zh_lm.sh old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/local/kenlm_score_test.py b/examples/other/ngram_lm/s0/local/kenlm_score_test.py old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/path.sh b/examples/other/ngram_lm/s0/path.sh old mode 100644 new mode 100755 diff --git a/examples/other/ngram_lm/s0/requirements.txt b/examples/other/ngram_lm/s0/requirements.txt old mode 100644 new mode 100755 diff --git a/examples/other/punctuation_restoration/README.md b/examples/other/punctuation_restoration/README.md old mode 100644 new mode 100755 diff --git a/examples/other/rhy/README.md b/examples/other/rhy/README.md new file mode 100755 index 000000000..11336ad9f --- /dev/null +++ b/examples/other/rhy/README.md @@ -0,0 +1,41 @@ +# Prosody Prediction with CSMSC and AISHELL-3 + +## Get Started +### Data Preprocessing +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Model Training +```bash +./run.sh --stage 1 --stop-stage 1 +``` +### Testing +```bash +./run.sh --stage 2 --stop-stage 2 +``` +### Prosody Prediction +```bash +./run.sh --stage 3 --stop-stage 3 +``` +## Pretrained Model +The pretrained model can be downloaded here: + +[ernie-1.0_aishellcsmsc_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/ernie-1.0_aishellcsmsc_ckpt_1.3.0.zip) + +You should put it into the `exp/${YOUREXP}/checkpoints` folder.
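For concreteness, a download-and-place sketch follows. `exp/default` matches `train_output_path` in the `run.sh` added below, and `snapshot_iter_2600.pdz` is its default `ckpt_name`; the archive's internal layout is an assumption, so adjust the final move to whatever the zip actually contains:

```bash
# Sketch: put the pretrained prosody checkpoint where local/test.sh and
# local/rhy_predict.sh look for it: ${train_output_path}/checkpoints/${ckpt_name}
mkdir -p exp/default/checkpoints
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/ernie-1.0_aishellcsmsc_ckpt_1.3.0.zip
unzip ernie-1.0_aishellcsmsc_ckpt_1.3.0.zip
# Assumed layout: the unzipped folder contains snapshot_iter_2600.pdz.
mv ernie-1.0_aishellcsmsc_ckpt_1.3.0/snapshot_iter_2600.pdz exp/default/checkpoints/
```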
+ +## Rhythm mapping +Four punctuation marks are used to denote the four rhythm tags, respectively: +|rhy_token|csmsc|aishell3| +|:---: |:---: |:---: | +|%|#1|%| +|`|#2|| +|~|#3|| +|$|#4|$| + +## Prediction Results +| | #1 | #2 | #3 | #4 | +|:-----:|:-----:|:-----:|:-----:|:-----:| +|Precision |0.90 |0.66 |0.91 |0.90| +|Recall |0.92 |0.62 |0.83 |0.85| +|F1 |0.91 |0.64 |0.87 |0.87| diff --git a/examples/other/rhy/conf/default.yaml b/examples/other/rhy/conf/default.yaml new file mode 100755 index 000000000..1eb90f11f --- /dev/null +++ b/examples/other/rhy/conf/default.yaml @@ -0,0 +1,44 @@ +########################################################### +# DATA SETTING # +########################################################### +dataset_type: Ernie +train_path: data/train.txt +dev_path: data/dev.txt +test_path: data/test.txt +batch_size: 64 +num_workers: 2 +data_params: + pretrained_token: ernie-1.0 + punc_path: data/rhy_token + seq_len: 100 + + +########################################################### +# MODEL SETTING # +########################################################### +model_type: ErnieLinear +model: + pretrained_token: ernie-1.0 + num_classes: 5 + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer_params: + weight_decay: 1.0e-6 # weight decay coefficient. + +scheduler_params: + learning_rate: 1.0e-5 # learning rate. + gamma: 0.9999 # scheduler gamma must be between (0.0, 1.0); closer to 1.0 is better. + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 20 +num_snapshots: 5 + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/other/rhy/data/rhy_token b/examples/other/rhy/data/rhy_token new file mode 100755 index 000000000..bf1fe253f --- /dev/null +++ b/examples/other/rhy/data/rhy_token @@ -0,0 +1,4 @@ +% +` +~ +$ \ No newline at end of file diff --git a/examples/other/rhy/local/data.sh b/examples/other/rhy/local/data.sh new file mode 100755 index 000000000..93b134873 --- /dev/null +++ b/examples/other/rhy/local/data.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +if [ ! -f 000001-010000.txt ]; then + wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/000001-010000.txt +fi + +if [ ! -f label_train-set.txt ]; then + wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/label_train-set.txt +fi + + +aishell_data=$1 +csmsc_data=$2 +processed_path=$3 + +python3 ./local/pre_for_sp_csmsc.py \ + --data=${csmsc_data} \ + --processed_path=${processed_path} + +python3 ./local/pre_for_sp_aishell.py \ + --data=${aishell_data} \ + --processed_path=${processed_path} + + +echo "Finish data preparation." +exit 0 diff --git a/examples/other/rhy/local/pre_for_sp_aishell.py b/examples/other/rhy/local/pre_for_sp_aishell.py new file mode 100755 index 000000000..a2a716683 --- /dev/null +++ b/examples/other/rhy/local/pre_for_sp_aishell.py @@ -0,0 +1,51 @@ +import argparse +import os +import re + +# This is the replacement for rhythm labels to predict.
+# i.e., a map from the rhythm tags #1..#4 to placeholder punctuation +replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"} + + +def replace_rhy_with_punc(line): + # r'[:、,;。?!,.:;"?!”’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) # cf. checkcheck_oov.py, + line = re.sub(r'[:、,;。?!,.:;"?!’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) + for r in replace_.keys(): + if r in line: + line = line.replace(r, replace_[r]) + return line + + +def pre_and_write(data, file): + with open(file, 'a') as rf: + for d in data: + d = d.split('|')[2].strip() + # d = replace_rhy_with_punc(d) + d = ' '.join(d) + ' \n' + rf.write(d) + + +def main(): + parser = argparse.ArgumentParser( + description="Prepare data for rhythm prediction.") + parser.add_argument("--data", type=str, default="label_train-set.txt") + parser.add_argument( + "--processed_path", type=str, default="../data/rhy_predict") + args = parser.parse_args() + os.makedirs(args.processed_path, exist_ok=True) + + with open(args.data) as rf: + text = rf.readlines()[5:] + len_ = len(text) + lens = [int(len_ * 0.9), int(len_ * 0.05), int(len_ * 0.05)] + files = ['train.txt', 'test.txt', 'dev.txt'] + + i = 0 + for l_, file in zip(lens, files): + file = os.path.join(args.processed_path, file) + pre_and_write(text[i:i + l_], file) + i = i + l_ + + +if __name__ == "__main__": + main() diff --git a/examples/other/rhy/local/pre_for_sp_csmsc.py b/examples/other/rhy/local/pre_for_sp_csmsc.py new file mode 100755 index 000000000..0a96092c1 --- /dev/null +++ b/examples/other/rhy/local/pre_for_sp_csmsc.py @@ -0,0 +1,51 @@ +import argparse +import os +import re + +replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"} + + +def replace_rhy_with_punc(line): + # r'[:、,;。?!,.:;"?!”’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) # cf. checkcheck_oov.py, + line = re.sub(r'^$\*%', '', line) + for r in replace_.keys(): + if r in line: + line = line.replace(r, replace_[r]) + return line + + +def pre_and_write(data, file): + with open(file, 'w') as rf: + for d in data: + d = d.split('\t')[1].strip() + d = replace_rhy_with_punc(d) + d = ' '.join(d) + ' \n' + rf.write(d) + + +def main(): + parser = argparse.ArgumentParser( + description="Prepare data for rhythm prediction.") + parser.add_argument("--data", type=str, default="label_train-set.txt") + parser.add_argument( + "--processed_path", type=str, default="../data/rhy_predict") + args = parser.parse_args() + print(args.data, args.processed_path) + os.makedirs(args.processed_path, exist_ok=True) + + with open(args.data) as rf: + rf = rf.readlines() + text = rf[0::2] + len_ = len(text) + lens = [int(len_ * 0.9), int(len_ * 0.05), int(len_ * 0.05)] + files = ['train.txt', 'test.txt', 'dev.txt'] + + i = 0 + for l_, file in zip(lens, files): + file = os.path.join(args.processed_path, file) + pre_and_write(text[i:i + l_], file) + i = i + l_ + + +if __name__ == "__main__": + main() diff --git a/examples/other/rhy/local/rhy_predict.sh b/examples/other/rhy/local/rhy_predict.sh new file mode 100755 index 000000000..30a4f12f8 --- /dev/null +++ b/examples/other/rhy/local/rhy_predict.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 +text=$4 +ckpt_prefix=${ckpt_name%.*} + +python3 ${BIN_DIR}/punc_restore.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --text=${text} diff --git a/examples/other/rhy/local/test.sh b/examples/other/rhy/local/test.sh new file mode 100755 index 000000000..bd490b5b9 --- /dev/null +++ b/examples/other/rhy/local/test.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3
+print_eval=$4 + +ckpt_prefix=${ckpt_name%.*} + +python3 ${BIN_DIR}/test.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --print_eval=${print_eval} \ No newline at end of file diff --git a/examples/other/rhy/local/train.sh b/examples/other/rhy/local/train.sh new file mode 100755 index 000000000..85227eacb --- /dev/null +++ b/examples/other/rhy/local/train.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +python3 ${BIN_DIR}/train.py \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/other/rhy/path.sh b/examples/other/rhy/path.sh new file mode 100755 index 000000000..da790261f --- /dev/null +++ b/examples/other/rhy/path.sh @@ -0,0 +1,14 @@ +#!/bin/bash +export MAIN_ROOT=${PWD}/../../../ + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=ernie_linear +export BIN_DIR=${MAIN_ROOT}/paddlespeech/text/exps/${MODEL} diff --git a/examples/other/rhy/run.sh b/examples/other/rhy/run.sh new file mode 100755 index 000000000..aed58152e --- /dev/null +++ b/examples/other/rhy/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +aishell_data=label_train-set.txt +csmsc_data=000001-010000.txt +processed_path=data + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_2600.pdz +text=我们城市的复苏有赖于他强有力的政策。 +print_eval=false + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/data.sh ${aishell_data} ${csmsc_data} ${processed_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${train_output_path} ${ckpt_name} ${print_eval} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/rhy_predict.sh ${conf_path} ${train_output_path} ${ckpt_name} ${text} || exit -1 +fi \ No newline at end of file diff --git a/examples/other/spm/.gitignore b/examples/other/spm/.gitignore old mode 100644 new mode 100755 diff --git a/examples/other/spm/README.md b/examples/other/spm/README.md old mode 100644 new mode 100755 diff --git a/examples/other/spm/path.sh b/examples/other/spm/path.sh old mode 100644 new mode 100755 diff --git a/examples/other/spm/text b/examples/other/spm/text old mode 100644 new mode 100755 diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md old mode 100644 new mode 100755 diff --git a/examples/other/tn/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt old mode 100644 new mode 100755 index e9a479b47..17e90d0b6 --- a/examples/other/tn/data/textnorm_test_cases.txt +++ b/examples/other/tn/data/textnorm_test_cases.txt @@ -122,4 +122,6 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这 近期也一反常态地发表看空言论|近期也一反常态地发表看空言论 985|九八五 12~23|十二到二十三 -12-23|十二到二十三 \ No newline at end of file +12-23|十二到二十三 +25cm²|二十五平方厘米 +25m|米 \ No newline at end of file diff --git a/examples/other/tn/get_textnorm_data.py b/examples/other/tn/get_textnorm_data.py old mode 100644 new mode 100755 diff --git a/examples/other/tn/test_textnorm.py b/examples/other/tn/test_textnorm.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/README.md b/examples/other/tts_finetune/tts3/README.md old mode 100644 new mode 100755 index fa691764c..8564af5f6 --- a/examples/other/tts_finetune/tts3/README.md +++ b/examples/other/tts_finetune/tts3/README.md @@ -55,7 +55,7 @@ If you want to finetune Chinese pretrained model, you need to prepare Chinese da 000001|ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1 ``` -Here is an example of the first 200 data of csmsc. +Here is a Chinese data example consisting of the first 200 sentences of csmsc. ```bash mkdir -p input && cd input @@ -69,7 +69,7 @@ If you want to finetune English pretrained model, you need to prepare English da LJ001-0001|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition -Here is an example of the first 200 data of ljspeech. +Here is an English data example consisting of the first 200 sentences of ljspeech. ```bash mkdir -p input && cd input @@ -78,7 +78,7 @@ unzip ljspeech_mini.zip cd ../ ``` -If you want to finetune Chinese-English Mixed pretrained model, you need to prepare Chinese data or English data. Here is an example of the first 12 data of SSB0005 (the speaker of aishell3). +If you want to finetune the Chinese-English Mixed pretrained model, you need to prepare Chinese data or English data. Here is a Chinese data example consisting of the first 12 sentences of SSB0005 (a speaker from aishell3).
```bash mkdir -p input && cd input diff --git a/examples/other/tts_finetune/tts3/conf/fastspeech2_layers.txt b/examples/other/tts_finetune/tts3/conf/fastspeech2_layers.txt old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/conf/finetune.yaml b/examples/other/tts_finetune/tts3/conf/finetune.yaml old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/local/check_oov.py b/examples/other/tts_finetune/tts3/local/check_oov.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/local/extract_feature.py b/examples/other/tts_finetune/tts3/local/extract_feature.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/local/finetune.py b/examples/other/tts_finetune/tts3/local/finetune.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/local/generate_duration.py b/examples/other/tts_finetune/tts3/local/generate_duration.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/local/get_mfa_result.py b/examples/other/tts_finetune/tts3/local/get_mfa_result.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/local/prepare_env.py b/examples/other/tts_finetune/tts3/local/prepare_env.py old mode 100644 new mode 100755 diff --git a/examples/other/tts_finetune/tts3/run_mix.sh b/examples/other/tts_finetune/tts3/run_mix.sh old mode 100644 new mode 100755 index 71008ef5b..960278a53 --- a/examples/other/tts_finetune/tts3/run_mix.sh +++ b/examples/other/tts_finetune/tts3/run_mix.sh @@ -108,3 +108,4 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then --spk_id=$replace_spkid fi + diff --git a/examples/ted_en_zh/README.md b/examples/ted_en_zh/README.md old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/.gitignore b/examples/ted_en_zh/st0/.gitignore old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/README.md b/examples/ted_en_zh/st0/README.md old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/RESULTS.md b/examples/ted_en_zh/st0/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/conf/preprocess.yaml b/examples/ted_en_zh/st0/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/conf/transformer.yaml b/examples/ted_en_zh/st0/conf/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st0/path.sh b/examples/ted_en_zh/st0/path.sh old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/.gitignore b/examples/ted_en_zh/st1/.gitignore old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/README.md b/examples/ted_en_zh/st1/README.md old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/RESULTS.md b/examples/ted_en_zh/st1/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/cmd.sh b/examples/ted_en_zh/st1/cmd.sh old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/conf/fbank.conf b/examples/ted_en_zh/st1/conf/fbank.conf old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/conf/pitch.conf b/examples/ted_en_zh/st1/conf/pitch.conf old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/conf/preprocess.yaml 
b/examples/ted_en_zh/st1/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/conf/transformer.yaml b/examples/ted_en_zh/st1/conf/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/local/convert_torch_to_paddle.py b/examples/ted_en_zh/st1/local/convert_torch_to_paddle.py old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/local/ted_en_zh.py b/examples/ted_en_zh/st1/local/ted_en_zh.py old mode 100644 new mode 100755 diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh old mode 100644 new mode 100755 diff --git a/examples/thchs30/README.md b/examples/thchs30/README.md old mode 100644 new mode 100755 diff --git a/examples/thchs30/align0/README.md b/examples/thchs30/align0/README.md old mode 100644 new mode 100755 diff --git a/examples/thchs30/align0/data/dict/syllable.lexicon b/examples/thchs30/align0/data/dict/syllable.lexicon old mode 100644 new mode 100755 diff --git a/examples/thchs30/align0/local/data.sh b/examples/thchs30/align0/local/data.sh old mode 100644 new mode 100755 diff --git a/examples/thchs30/align0/local/gen_word2phone.py b/examples/thchs30/align0/local/gen_word2phone.py old mode 100644 new mode 100755 diff --git a/examples/thchs30/align0/local/reorganize_thchs30.py b/examples/thchs30/align0/local/reorganize_thchs30.py old mode 100644 new mode 100755 diff --git a/examples/thchs30/align0/path.sh b/examples/thchs30/align0/path.sh old mode 100644 new mode 100755 diff --git a/examples/timit/README.md b/examples/timit/README.md old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/.gitignore b/examples/timit/asr1/.gitignore old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/RESULTS.md b/examples/timit/asr1/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/augmentation.json b/examples/timit/asr1/conf/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/dev_spk.list b/examples/timit/asr1/conf/dev_spk.list old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/phones.60-48-39.map b/examples/timit/asr1/conf/phones.60-48-39.map old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/preprocess.yaml b/examples/timit/asr1/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/test_spk.list b/examples/timit/asr1/conf/test_spk.list old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/transformer.yaml b/examples/timit/asr1/conf/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/conf/tuning/decode.yaml b/examples/timit/asr1/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/timit/asr1/path.sh b/examples/timit/asr1/path.sh old mode 100644 new mode 100755 diff --git a/examples/tiny/.gitignore b/examples/tiny/.gitignore old mode 100644 new mode 100755 diff --git a/examples/tiny/README.md b/examples/tiny/README.md old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/.gitignore b/examples/tiny/asr0/.gitignore old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/README.md b/examples/tiny/asr0/README.md old mode 100644 new mode 100755 diff --git 
a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/conf/preprocess.yaml b/examples/tiny/asr0/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/conf/tuning/chunk_decode.yaml b/examples/tiny/asr0/conf/tuning/chunk_decode.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/conf/tuning/decode.yaml b/examples/tiny/asr0/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr0/path.sh b/examples/tiny/asr0/path.sh old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/.gitignore b/examples/tiny/asr1/.gitignore old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/README.md b/examples/tiny/asr1/README.md old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/augmentation.json b/examples/tiny/asr1/conf/augmentation.json old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/chunk_confermer.yaml b/examples/tiny/asr1/conf/chunk_confermer.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/chunk_transformer.yaml b/examples/tiny/asr1/conf/chunk_transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/conformer.yaml b/examples/tiny/asr1/conf/conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/preprocess.yaml b/examples/tiny/asr1/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/transformer.yaml b/examples/tiny/asr1/conf/transformer.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/tuning/chunk_decode.yaml b/examples/tiny/asr1/conf/tuning/chunk_decode.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/conf/tuning/decode.yaml b/examples/tiny/asr1/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/tiny/asr1/path.sh b/examples/tiny/asr1/path.sh old mode 100644 new mode 100755 diff --git a/examples/vctk/README.md b/examples/vctk/README.md old mode 100644 new mode 100755 diff --git a/examples/vctk/ernie_sat/README.md b/examples/vctk/ernie_sat/README.md old mode 100644 new mode 100755 index 94c7ae25d..1808e2074 --- a/examples/vctk/ernie_sat/README.md +++ b/examples/vctk/ernie_sat/README.md @@ -1,5 +1,5 @@ # ERNIE-SAT with VCTK dataset -ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning. +[ERNIE-SAT](https://arxiv.org/abs/2211.03545) is a speech-text joint pretraining framework that achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks. It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
## Model Framework In ERNIE-SAT, we propose two innovations: diff --git a/examples/vctk/ernie_sat/conf/default.yaml b/examples/vctk/ernie_sat/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md old mode 100644 new mode 100755 index 2a2f27fd4..0bf2037f5 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -224,6 +224,9 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip) +The Paddle-Lite model can be downloaded here: +- [fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip) + FastSpeech2 checkpoint contains files listed below. ```text fastspeech2_vctk_ckpt_1.2.0 diff --git a/examples/vctk/tts3/conf/default.yaml b/examples/vctk/tts3/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/vctk/tts3/local/export2lite.sh b/examples/vctk/tts3/local/export2lite.sh new file mode 120000 index 000000000..f7719914a --- /dev/null +++ b/examples/vctk/tts3/local/export2lite.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/export2lite.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/lite_predict.sh b/examples/vctk/tts3/local/lite_predict.sh new file mode 100755 index 000000000..eb608535b --- /dev/null +++ b/examples/vctk/tts3/local/lite_predict.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../lite_predict.py \ + --inference_dir=${train_output_path}/pdlite \ + --am=fastspeech2_vctk \ + --voc=pwgan_vctk \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/lite_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --lang=en +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + python3 ${BIN_DIR}/../lite_predict.py \ + --inference_dir=${train_output_path}/pdlite \ + --am=fastspeech2_vctk \ + --voc=hifigan_vctk \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/lite_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --lang=en +fi diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh index b5184aed8..a112b94b7 100755 --- a/examples/vctk/tts3/run.sh +++ b/examples/vctk/tts3/run.sh @@ -58,3 +58,14 @@ fi if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then ./local/ort_predict.sh ${train_output_path} fi + +# must run after stage 3 (the stage that generates the static models) +if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then + ./local/export2lite.sh ${train_output_path} inference pdlite fastspeech2_vctk x86 + ./local/export2lite.sh ${train_output_path} inference pdlite pwgan_vctk x86 + # ./local/export2lite.sh ${train_output_path} inference pdlite hifigan_vctk x86 +fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1 +fi \ No newline at end of file diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md old mode 100644 new mode 100755 index 2d80e7563..761f9bddb --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -141,6 +141,9 @@ The static model can be downloaded here: The ONNX model
can be downloaded here: - [pwgan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_onnx_1.1.0.zip) +The Paddle-Lite model can be downloaded here: +- [pwgan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_vctk_pdlite_1.3.0.zip) + Parallel WaveGAN checkpoint contains files listed below. diff --git a/examples/vctk/voc1/conf/default.yaml b/examples/vctk/voc1/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md old mode 100644 new mode 100755 index e937679b5..5a104f56f --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -127,6 +127,9 @@ The static model can be downloaded here: The ONNX model can be downloaded here: - [hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip) +The Paddle-Lite model can be downloaded here: +- [hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip) + Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss :-------------:| :------------:| :-----: | :-----: | :--------: diff --git a/examples/vctk/voc5/conf/default.yaml b/examples/vctk/voc5/conf/default.yaml old mode 100644 new mode 100755 diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/README.md b/examples/voxceleb/sv0/README.md old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/RESULT.md b/examples/voxceleb/sv0/RESULT.md old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn_small.yaml old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_rirs_noise_csv_dataset_from_json.py old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py b/examples/voxceleb/sv0/local/make_vox_csv_dataset_from_json.py old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py b/examples/voxceleb/sv0/local/make_voxceleb_kaldi_trial.py old mode 100644 new mode 100755 diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/README.md b/examples/wenetspeech/README.md old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr0/RESULTS.md b/examples/wenetspeech/asr0/RESULTS.md old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/.gitignore b/examples/wenetspeech/asr1/.gitignore old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/README.md b/examples/wenetspeech/asr1/README.md old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/RESULTS.md b/examples/wenetspeech/asr1/RESULTS.md old mode 100644 new mode 100755 index f22c652e6..cd480163e --- a/examples/wenetspeech/asr1/RESULTS.md +++ b/examples/wenetspeech/asr1/RESULTS.md @@ -53,3 +53,22 @@ Pretrain model from 
https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1 | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.061884 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.062056 | | conformer | 32.52 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.052110 | + + +## U2PP Streaming Pretrained Model + +Pretrain model from https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz + +| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | 16 | 0.057031 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | 16 | 0.068826 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | 16 | 0.069111 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | 16 | 0.059213 | + +| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention | -1 | 0.049256 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | -1 | 0.052086 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | -1 | 0.052267 | +| conformer | 122.88 M | conf/chunk_conformer.yaml | spec_aug | aishell1 | attention_rescoring | -1 | 0.047198 | diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml b/examples/wenetspeech/asr1/conf/chunk_conformer_u2pp.yaml old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/conf/conformer.yaml b/examples/wenetspeech/asr1/conf/conformer.yaml old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/conf/preprocess.yaml b/examples/wenetspeech/asr1/conf/preprocess.yaml old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml b/examples/wenetspeech/asr1/conf/tuning/chunk_decode.yaml old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/conf/tuning/decode.yaml b/examples/wenetspeech/asr1/conf/tuning/decode.yaml old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/local/process_opus.py b/examples/wenetspeech/asr1/local/process_opus.py old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/local/quant.sh b/examples/wenetspeech/asr1/local/quant.sh index 9dfea9045..ac854aaad 100755 --- a/examples/wenetspeech/asr1/local/quant.sh +++ b/examples/wenetspeech/asr1/local/quant.sh @@ -1,7 +1,8 @@ #!/bin/bash +# ./local/quant.sh conf/chunk_conformer_u2pp.yaml conf/tuning/chunk_decode.yaml exp/chunk_conformer_u2pp/checkpoints/avg_10 data/wav.aishell.test.scp if [ $# != 4 ];then - echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file" + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_scp" exit -1 fi @@ -11,16 +12,15 @@
echo "using $ngpu gpus..." config_path=$1 decode_config_path=$2 ckpt_prefix=$3 -audio_file=$4 +audio_scp=$4 mkdir -p data -wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/ if [ $? -ne 0 ]; then exit 1 fi -if [ ! -f ${audio_file} ]; then - echo "Plase input the right audio_file path" +if [ ! -f ${audio_scp} ]; then + echo "Plase input the right audio_scp path" exit 1 fi @@ -49,7 +49,8 @@ for type in attention_rescoring; do --checkpoint_path ${ckpt_prefix} \ --opts decode.decoding_method ${type} \ --opts decode.decode_batch_size ${batch_size} \ - --audio_file ${audio_file} + --num_utts 200 \ + --audio_scp ${audio_scp} if [ $? -ne 0 ]; then echo "Failed in evaluation!" diff --git a/examples/wenetspeech/asr1/local/test_wav.sh b/examples/wenetspeech/asr1/local/test_wav.sh index 474642624..c3a17f491 100755 --- a/examples/wenetspeech/asr1/local/test_wav.sh +++ b/examples/wenetspeech/asr1/local/test_wav.sh @@ -42,6 +42,7 @@ for type in attention_rescoring; do output_dir=${ckpt_prefix} mkdir -p ${output_dir} python3 -u ${BIN_DIR}/test_wav.py \ + --debug True \ --ngpu ${ngpu} \ --config ${config_path} \ --decode_cfg ${decode_config_path} \ diff --git a/examples/wenetspeech/asr1/path.sh b/examples/wenetspeech/asr1/path.sh old mode 100644 new mode 100755 diff --git a/examples/wenetspeech/asr1/run.sh b/examples/wenetspeech/asr1/run.sh old mode 100644 new mode 100755 index ddce0a9c8..2ae7b31c6 --- a/examples/wenetspeech/asr1/run.sh +++ b/examples/wenetspeech/asr1/run.sh @@ -54,3 +54,7 @@ if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then # test a single .wav file CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1 fi + +if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + # export quant model, plesae see local/quant.sh +fi diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md old mode 100644 new mode 100755 index b4b683089..012028007 --- a/examples/zh_en_tts/tts3/README.md +++ b/examples/zh_en_tts/tts3/README.md @@ -116,6 +116,8 @@ optional arguments: 5. `--phones-dict` is the path of the phone vocabulary file. 6. `--speaker-dict` is the path of the speaker id map file when training a multi-speaker FastSpeech2. +We have **added module speaker classifier** with reference to [Learning to Speak Fluently in a Foreign Language: Multilingual Speech Synthesis and Cross-Language Voice Cloning](https://arxiv.org/pdf/1907.04448.pdf). The main parameter configuration: `config["model"]["enable_speaker_classifier"]`, `config["model"]["hidden_sc_dim"]` and `config["updater"]["spk_loss_scale"]` in `conf/default.yaml`. The current experimental results show that this module can decouple text information and speaker information, and more experiments are still being sorted out. This module is currently not enabled by default, if you are interested, you can try it yourself. + ### Synthesizing We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the default neural vocoder. 
diff --git a/examples/zh_en_tts/tts3/conf/default.yaml b/examples/zh_en_tts/tts3/conf/default.yaml old mode 100644 new mode 100755 index e65b5d0ec..efa8b3ea2 --- a/examples/zh_en_tts/tts3/conf/default.yaml +++ b/examples/zh_en_tts/tts3/conf/default.yaml @@ -74,6 +74,9 @@ model: stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder spk_embed_dim: 256 # speaker embedding dimension spk_embed_integration_type: concat # speaker embedding integration type + enable_speaker_classifier: False # Whether to use the speaker classifier module + hidden_sc_dim: 256 # The hidden layer dim of the speaker classifier + @@ -82,6 +85,7 @@ model: ########################################################### updater: use_masking: True # whether to apply masking for padded part in loss calculation + spk_loss_scale: 0.02 # The scale of the speaker classifier loss ########################################################### diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/backends/__init__.py b/paddlespeech/audio/backends/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/backends/soundfile_backend.py b/paddlespeech/audio/backends/soundfile_backend.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/backends/sox_backend.py b/paddlespeech/audio/backends/sox_backend.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/compliance/__init__.py b/paddlespeech/audio/compliance/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/compliance/kaldi.py b/paddlespeech/audio/compliance/kaldi.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/compliance/librosa.py b/paddlespeech/audio/compliance/librosa.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/__init__.py b/paddlespeech/audio/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/dataset.py b/paddlespeech/audio/datasets/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/esc50.py b/paddlespeech/audio/datasets/esc50.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/gtzan.py b/paddlespeech/audio/datasets/gtzan.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/hey_snips.py b/paddlespeech/audio/datasets/hey_snips.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/rirs_noises.py b/paddlespeech/audio/datasets/rirs_noises.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/tess.py b/paddlespeech/audio/datasets/tess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/urban_sound.py b/paddlespeech/audio/datasets/urban_sound.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/datasets/voxceleb.py b/paddlespeech/audio/datasets/voxceleb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/features/__init__.py b/paddlespeech/audio/features/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/features/layers.py b/paddlespeech/audio/features/layers.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/functional/__init__.py b/paddlespeech/audio/functional/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/functional/functional.py
b/paddlespeech/audio/functional/functional.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/functional/window.py b/paddlespeech/audio/functional/window.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/io/__init__.py b/paddlespeech/audio/io/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/metric/__init__.py b/paddlespeech/audio/metric/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/metric/eer.py b/paddlespeech/audio/metric/eer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/sox_effects/__init__.py b/paddlespeech/audio/sox_effects/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/__init__.py b/paddlespeech/audio/streamdata/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/cache.py b/paddlespeech/audio/streamdata/cache.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/compat.py b/paddlespeech/audio/streamdata/compat.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/extradatasets.py b/paddlespeech/audio/streamdata/extradatasets.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/filters.py b/paddlespeech/audio/streamdata/filters.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/gopen.py b/paddlespeech/audio/streamdata/gopen.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/handlers.py b/paddlespeech/audio/streamdata/handlers.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/mix.py b/paddlespeech/audio/streamdata/mix.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/paddle_utils.py b/paddlespeech/audio/streamdata/paddle_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/pipeline.py b/paddlespeech/audio/streamdata/pipeline.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/shardlists.py b/paddlespeech/audio/streamdata/shardlists.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/utils.py b/paddlespeech/audio/streamdata/utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/streamdata/writer.py b/paddlespeech/audio/streamdata/writer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/text/__init__.py b/paddlespeech/audio/text/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/text/text_featurizer.py b/paddlespeech/audio/text/text_featurizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/text/utility.py b/paddlespeech/audio/text/utility.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/__init__.py b/paddlespeech/audio/transform/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/add_deltas.py b/paddlespeech/audio/transform/add_deltas.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/channel_selector.py b/paddlespeech/audio/transform/channel_selector.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/cmvn.py b/paddlespeech/audio/transform/cmvn.py old mode 100644 new mode 100755 diff 
--git a/paddlespeech/audio/transform/functional.py b/paddlespeech/audio/transform/functional.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/spec_augment.py b/paddlespeech/audio/transform/spec_augment.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/spectrogram.py b/paddlespeech/audio/transform/spectrogram.py old mode 100644 new mode 100755 index cba60cfdb..84812a2cf --- a/paddlespeech/audio/transform/spectrogram.py +++ b/paddlespeech/audio/transform/spectrogram.py @@ -383,7 +383,7 @@ class LogMelSpectrogramKaldi(): class WavProcess(): - def __init__(self, dither=0.0): + def __init__(self): """ Args: dither (float): Dithering constant @@ -391,9 +391,7 @@ class WavProcess(): Returns: """ - self.dither = dither - - def __call__(self, x, train): + def __call__(self, x): """ Args: x (np.ndarray): shape (Ti,) @@ -405,10 +403,10 @@ class WavProcess(): Returns: np.ndarray: (T, D) """ - dither = self.dither if train else 0.0 if x.ndim != 1: raise ValueError("Not support x: [Time, Channel]") - waveform = np.expand_dims(x, -1) + waveform = x.astype("float32") / 32768.0 + waveform = np.expand_dims(waveform, -1) return waveform diff --git a/paddlespeech/audio/transform/transform_interface.py b/paddlespeech/audio/transform/transform_interface.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/transformation.py b/paddlespeech/audio/transform/transformation.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/transform/wpe.py b/paddlespeech/audio/transform/wpe.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/__init__.py b/paddlespeech/audio/utils/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/check_kwargs.py b/paddlespeech/audio/utils/check_kwargs.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/download.py b/paddlespeech/audio/utils/download.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/dynamic_import.py b/paddlespeech/audio/utils/dynamic_import.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/error.py b/paddlespeech/audio/utils/error.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/log.py b/paddlespeech/audio/utils/log.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/numeric.py b/paddlespeech/audio/utils/numeric.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/audio/utils/time.py b/paddlespeech/audio/utils/time.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/README_cn.md b/paddlespeech/cli/README_cn.md old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/asr/__init__.py b/paddlespeech/cli/asr/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py old mode 100644 new mode 100755 index 437f64631..004143361 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -52,7 +52,7 @@ class ASRExecutor(BaseExecutor): self.parser.add_argument( '--model', type=str, - 
default='conformer_u2pp_wenetspeech', + default='conformer_u2pp_online_wenetspeech', choices=[ tag[:tag.index('-')] for tag in self.task_resource.pretrained_models.keys() @@ -470,7 +470,7 @@ class ASRExecutor(BaseExecutor): @stats_wrapper def __call__(self, audio_file: os.PathLike, - model: str='conformer_u2pp_wenetspeech', + model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', sample_rate: int=16000, config: os.PathLike=None, diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py old mode 100644 new mode 100755 index 7210091a9..767d0df78 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -83,7 +83,9 @@ model_name_format = { 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', 'tts': 'Model-Language', - 'vector': 'Model-Sample Rate' + 'vector': 'Model-Sample Rate', + 'ssl': 'Model-Language-Sample Rate', + 'whisper': 'Model-Language-Sample Rate' } @@ -94,7 +96,9 @@ class StatsCommand: def __init__(self): self.parser = argparse.ArgumentParser( prog='paddlespeech.stats', add_help=True) - self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector', 'kws'] + self.task_choices = [ + 'asr', 'cls', 'st', 'text', 'tts', 'vector', 'kws', 'ssl', 'whisper' + ] self.parser.add_argument( '--task', type=str, @@ -141,6 +145,12 @@ _commands = { 'tts': ['Text to Speech infer command.', 'TTSExecutor'], 'vector': ['Speech to vector embedding infer command.', 'VectorExecutor'], 'kws': ['Keyword Spotting infer command.', 'KWSExecutor'], + 'ssl': + ['Self-Supervised Learning Pretrained model infer command.', 'SSLExecutor'], + 'whisper': [ + 'Whisper model for speech to text or translate speech to English.', + 'WhisperExecutor' + ] } for com, info in _commands.items(): diff --git a/paddlespeech/cli/cls/__init__.py b/paddlespeech/cli/cls/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/download.py b/paddlespeech/cli/download.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/kws/__init__.py b/paddlespeech/cli/kws/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/kws/infer.py b/paddlespeech/cli/kws/infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/log.py b/paddlespeech/cli/log.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/ssl/infer.py b/paddlespeech/cli/ssl/infer.py index 4bc8a5074..95fcee5ce 100755 --- a/paddlespeech/cli/ssl/infer.py +++ b/paddlespeech/cli/ssl/infer.py @@ -27,8 +27,6 @@ import paddle import soundfile from yacs.config import CfgNode -from ...utils.env import MODEL_HOME -from ..download import get_path_from_url from ..executor import BaseExecutor from ..log import logger from ..utils import CLI_TIMER @@ -86,7 +84,8 @@ class SSLExecutor(BaseExecutor): type=str, default='ctc_greedy_search', choices=[ - 'ctc_greedy_search', 'ctc_prefix_beam_search', + 'ctc_greedy_search', + 'ctc_prefix_beam_search', ], help='only support asr task') self.parser.add_argument(
@@ -262,7 +261,7 @@ class SSLExecutor(BaseExecutor): logger.exception(e) else: logger.debug( - f"we will use the wav2vec2 like model to extract audio feature") + "we will use the wav2vec2 like model to extract audio feature") try: out_feature = self.model(audio[:, :, 0]) self._outputs["result"] = out_feature[0] @@ -432,7 +431,8 @@ class SSLExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) paddle.set_device(device) - self._init_from_path(model, task, lang, sample_rate, config, decode_method, ckpt_path) + self._init_from_path(model, task, lang, sample_rate, config, + decode_method, ckpt_path) if not self._check(audio_file, sample_rate, force_yes): sys.exit(-1) if rtf: diff --git a/paddlespeech/cli/st/__init__.py b/paddlespeech/cli/st/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/text/__init__.py b/paddlespeech/cli/text/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/tts/__init__.py b/paddlespeech/cli/tts/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py old mode 100644 new mode 100755 index 3eb597156..707518c05 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -67,6 +67,7 @@ class TTSExecutor(BaseExecutor): 'fastspeech2_mix', 'tacotron2_csmsc', 'tacotron2_ljspeech', + 'fastspeech2_male', ], help='Choose acoustic model type of tts task.') self.parser.add_argument( @@ -122,6 +123,7 @@ class TTSExecutor(BaseExecutor): 'hifigan_aishell3', 'hifigan_vctk', 'wavernn_csmsc', + 'pwgan_male', ], help='Choose vocoder type of tts task.') diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/vector/__init__.py b/paddlespeech/cli/vector/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cli/whisper/__init__.py b/paddlespeech/cli/whisper/__init__.py new file mode 100755 index 000000000..3bafc10d2 --- /dev/null +++ b/paddlespeech/cli/whisper/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import WhisperExecutor diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py new file mode 100755 index 000000000..c016b453a --- /dev/null +++ b/paddlespeech/cli/whisper/infer.py @@ -0,0 +1,493 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import io +import os +import sys +import time +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import librosa +import numpy as np +import paddle +import soundfile +from yacs.config import CfgNode + +from ...utils.env import DATA_HOME +from ..download import get_path_from_url +from ..executor import BaseExecutor +from ..log import logger +from ..utils import CLI_TIMER +from ..utils import stats_wrapper +from ..utils import timer_register +from paddlespeech.s2t.models.whisper import log_mel_spectrogram +from paddlespeech.s2t.models.whisper import ModelDimensions +from paddlespeech.s2t.models.whisper import Whisper +from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES +from paddlespeech.s2t.models.whisper.tokenizer import TO_LANGUAGE_CODE +from paddlespeech.s2t.utils.utility import UpdateConfig + +__all__ = ['WhisperExecutor'] + + +@timer_register +class WhisperExecutor(BaseExecutor): + def __init__(self): + super().__init__('whisper') + self.parser = argparse.ArgumentParser( + prog='paddlespeech.whisper', add_help=True) + self.parser.add_argument( + '--input', type=str, default=None, help='Audio file to recognize.') + self.parser.add_argument( + '--model', + type=str, + default='whisper', + choices=['whisper'], + help='Choose model type of asr task.') + self.parser.add_argument( + '--lang', + type=str, + default='', + choices=['', 'en'], + help='Choose model language. Default is "" (multilingual); set "en" for an English-only model.' + ) + self.parser.add_argument( + '--task', + type=str, + default='transcribe', + choices=["transcribe", "translate"], + help='Choose task type: transcribe or translate.') + self.parser.add_argument( + '--size', + type=str, + default='large', + choices=['large', 'medium', 'base', 'small', 'tiny'], + help='Choose model size: large, medium, base, small or tiny, e.g. large: [whisper-large-16k].' + ) + self.parser.add_argument( + '--language', + type=str, + default='None', + choices=sorted(LANGUAGES.keys()) + sorted( + [k.title() for k in TO_LANGUAGE_CODE.keys()]), + help='Choose the decoding language. Default is None; the language is then detected by the model.'
+ ) + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[16000], + help='Choose the audio sample rate of the model. Only 16000 is supported.') + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of asr task. Use the default config when it is None.') + self.parser.add_argument( + '--decode_method', + type=str, + default='ctc_prefix_beam_search', + choices=['ctc_greedy_search', 'ctc_prefix_beam_search'], + help='only support transformer and conformer model') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') + self.parser.add_argument( + '--yes', + '-y', + action="store_true", + default=False, + help='No additional parameters required. \ + Once this parameter is set, it means accepting the request of the program by default, \ + which includes transforming the audio sample rate') + self.parser.add_argument( + '--rtf', + action="store_true", + default=False, + help='Show Real-time Factor (RTF).') + self.parser.add_argument( + '--device', + type=str, + default=paddle.get_device(), + help='Choose device to execute model inference.') + self.parser.add_argument( + '-d', + '--job_dump_result', + action='store_true', + help='Save job result into file.') + self.parser.add_argument( + '-v', + '--verbose', + action='store_true', + help='Increase logger verbosity of current task.') + + def _init_from_path(self, + model_type: str='whisper', + lang: str='', + task: str='transcribe', + size: str='large', + language: str='None', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='ctc_prefix_beam_search', + num_decoding_left_chunks: int=-1, + ckpt_path: Optional[os.PathLike]=None): + """ + Init model and other resources from a specific path. + """ + logger.debug("start to init the model") + # default max_len: unit:second + self.max_len = 50 + if hasattr(self, 'model'): + logger.debug('Model has already been initialized.') + return + + if cfg_path is None or ckpt_path is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + if lang == "": + tag = model_type + '-' + size + '-' + sample_rate_str + else: + tag = model_type + '-' + size + '-' + lang + '-' + sample_rate_str + self.task_resource.set_task_model(tag, version=None) + self.res_path = self.task_resource.res_dir + + self.cfg_path = os.path.join( + self.res_path, self.task_resource.res_dict['cfg_path']) + self.ckpt_path = os.path.join( + self.res_path, + self.task_resource.res_dict['ckpt_path'] + ".pdparams") + logger.debug(self.res_path) + + else: + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + logger.debug(self.cfg_path) + logger.debug(self.ckpt_path) + + # Init body.
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "whisper" in model_type: + resource_url = self.task_resource.res_dict['resource_data'] + resource_md5 = self.task_resource.res_dict['resource_data_md5'] + + self.resource_path = os.path.join( + DATA_HOME, self.task_resource.version, 'whisper') + self.download_resource(resource_url, self.resource_path, + resource_md5) + else: + raise Exception("wrong type") + + # load model + model_dict = paddle.load(self.ckpt_path) + dims = ModelDimensions(**model_dict["dims"]) + self.model = Whisper(dims) + self.model.load_dict(model_dict) + self.model.eval() + + # set task + if task is not None: + self.task = task + + # set language + if language is not None: + if lang == 'en' and language != 'en': + logger.info( + "An English-only model is in use; language is set to 'en'.") + self.language = 'en' + else: + self.language = language + + def preprocess(self, model_type: str, input: Union[str, os.PathLike]): + """ + Input preprocess and return paddle.Tensor stored in self.input. + Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet). + """ + + audio_file = input + if isinstance(audio_file, (str, os.PathLike)): + logger.debug("Preprocess audio_file:" + audio_file) + elif isinstance(audio_file, io.BytesIO): + audio_file.seek(0) + + # Get the object for feature extraction + # whisper hard-coded audio hyperparameters, params in paddlespeech/s2t/models/whisper/whisper.py + logger.debug("read the audio file") + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="float32", always_2d=True) + if self.change_format: + if audio.shape[1] >= 2: + audio = audio.mean(axis=1, dtype=np.int16) + else: + audio = audio[:, 0] + # pcm16 -> pcm 32 + audio = self._pcm16to32(audio) + audio = librosa.resample( + audio, orig_sr=audio_sample_rate, target_sr=self.sample_rate) + audio_sample_rate = self.sample_rate + # pcm32 -> pcm 16 + audio = self._pcm32to16(audio) + else: + audio = audio[:, 0] + + logger.debug(f"audio shape: {audio.shape}") + # fbank + audio = log_mel_spectrogram(audio, resource_path=self.resource_path) + + audio_len = paddle.to_tensor(audio.shape[0]) + + self._inputs["audio"] = audio + self._inputs["audio_len"] = audio_len + logger.debug(f"audio feat shape: {audio.shape}") + + logger.debug("audio feat process success") + + @paddle.no_grad() + def infer(self, model_type: str): + """ + Model inference and result stored in self.output. + """ + logger.debug("start to infer the model to get the output") + cfg = self.config + audio = self._inputs["audio"] + if cfg.temperature_increment_on_fallback is not None: + temperature = tuple( + np.arange(cfg.temperature, 1.0 + 1e-6, + cfg.temperature_increment_on_fallback)) + else: + temperature = [cfg.temperature] + + self._outputs["result"] = self.model.transcribe( + audio, + verbose=cfg.verbose, + task=self.task, + language=self.language, + resource_path=self.resource_path, + temperature=temperature, + compression_ratio_threshold=cfg.compression_ratio_threshold, + logprob_threshold=cfg.logprob_threshold, + best_of=cfg.best_of, + beam_size=cfg.beam_size, + patience=cfg.patience, + length_penalty=cfg.length_penalty, + initial_prompt=cfg.initial_prompt, + condition_on_previous_text=cfg.condition_on_previous_text, + no_speech_threshold=cfg.no_speech_threshold) + + def postprocess(self) -> Union[str, os.PathLike]: + """ + Output postprocess and return human-readable results such as texts and audio files.
+ """ + return self._outputs["result"] + + def download_resource(self, url, lm_dir, md5sum): + download_path = get_path_from_url( + url=url, + root_dir=lm_dir, + md5sum=md5sum, + decompress=True, ) + + def _pcm16to32(self, audio): + assert (audio.dtype == np.int16) + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def _pcm32to16(self, audio): + assert (audio.dtype == np.float32) + bits = np.iinfo(np.int16).bits + audio = audio * (2**(bits - 1)) + audio = np.round(audio).astype("int16") + return audio + + def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False): + self.sample_rate = sample_rate + if self.sample_rate != 16000 and self.sample_rate != 8000: + logger.error( + "invalid sample rate, please input --sr 8000 or --sr 16000") + return False + + if isinstance(audio_file, (str, os.PathLike)): + if not os.path.isfile(audio_file): + logger.error("Please input the right audio file path") + return False + elif isinstance(audio_file, io.BytesIO): + audio_file.seek(0) + + logger.debug("checking the audio file format......") + try: + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + audio_duration = audio.shape[0] / audio_sample_rate + if audio_duration > self.max_len: + logger.error( + f"Please input audio file less then {self.max_len} seconds.\n" + ) + return False + except Exception as e: + logger.exception(e) + logger.error( + f"can not open the audio file, please check the audio file({audio_file}) format is 'wav'. \n \ + you can try to use sox to change the file format.\n \ + For example: \n \ + sample rate: 16k \n \ + sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \ + sample rate: 8k \n \ + sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ + ") + return False + logger.debug("The sample rate is %d" % audio_sample_rate) + if audio_sample_rate != self.sample_rate: + logger.warning("The sample rate of the input file is not {}.\n \ + The program will resample the wav file to {}.\n \ + If the result does not meet your expectations,\n \ + Please input the 16k 16 bit 1 channel wav file. \ + ".format(self.sample_rate, self.sample_rate)) + if force_yes is False: + while (True): + logger.debug( + "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream." + ) + content = input("Input(Y/N):") + if content.strip() == "Y" or content.strip( + ) == "y" or content.strip() == "yes" or content.strip( + ) == "Yes": + logger.debug( + "change the sampele rate, channel to 16k and 1 channel" + ) + break + elif content.strip() == "N" or content.strip( + ) == "n" or content.strip() == "no" or content.strip( + ) == "No": + logger.debug("Exit the program") + return False + else: + logger.warning("Not regular input, please input again") + + self.change_format = True + else: + logger.debug("The audio file format is right") + self.change_format = False + + return True + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. 
+ """ + parser_args = self.parser.parse_args(argv) + + model = parser_args.model + lang = parser_args.lang + task = parser_args.task + size = parser_args.size + language = parser_args.language + sample_rate = parser_args.sample_rate + config = parser_args.config + ckpt_path = parser_args.ckpt_path + decode_method = parser_args.decode_method + force_yes = parser_args.yes + rtf = parser_args.rtf + device = parser_args.device + + if not parser_args.verbose: + self.disable_task_loggers() + + task_source = self.get_input_source(parser_args.input) + task_results = OrderedDict() + has_exceptions = False + + for id_, input_ in task_source.items(): + try: + res = self( + audio_file=input_, + model=model, + lang=lang, + task=task, + size=size, + language=language, + sample_rate=sample_rate, + config=config, + ckpt_path=ckpt_path, + decode_method=decode_method, + force_yes=force_yes, + rtf=rtf, + device=device) + task_results[id_] = res + except Exception as e: + has_exceptions = True + task_results[id_] = f'{e.__class__.__name__}: {e}' + + if rtf: + self.show_rtf(CLI_TIMER[self.__class__.__name__]) + + self.process_task_results(parser_args.input, task_results, + parser_args.job_dump_result) + + if has_exceptions: + return False + else: + return True + + @stats_wrapper + def __call__(self, + audio_file: os.PathLike, + model: str='whisper', + lang: str='', + task: str='transcribe', + size: str='large', + language: str='None', + sample_rate: int=16000, + config: os.PathLike=None, + ckpt_path: os.PathLike=None, + decode_method: str='attention_rescoring', + num_decoding_left_chunks: int=-1, + force_yes: bool=False, + rtf: bool=False, + device=paddle.get_device()): + """ + Python API to call an executor. + """ + audio_file = os.path.abspath(audio_file) + paddle.set_device(device) + self._init_from_path(model, lang, task, size, language, sample_rate, + config, decode_method, num_decoding_left_chunks, + ckpt_path) + if not self._check(audio_file, sample_rate, force_yes): + sys.exit(-1) + if rtf: + k = self.__class__.__name__ + CLI_TIMER[k]['start'].append(time.time()) + + self.preprocess(model, audio_file) + self.infer(model) + res = self.postprocess() # Retrieve result of asr. 
+ + if rtf: + CLI_TIMER[k]['end'].append(time.time()) + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="int16", always_2d=True) + CLI_TIMER[k]['extra'].append(audio.shape[0] / audio_sample_rate) + + return res diff --git a/paddlespeech/cls/__init__.py b/paddlespeech/cls/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/__init__.py b/paddlespeech/cls/exps/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/panns/__init__.py b/paddlespeech/cls/exps/panns/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/panns/deploy/__init__.py b/paddlespeech/cls/exps/panns/deploy/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/panns/deploy/predict.py b/paddlespeech/cls/exps/panns/deploy/predict.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/panns/export_model.py b/paddlespeech/cls/exps/panns/export_model.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/panns/predict.py b/paddlespeech/cls/exps/panns/predict.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/exps/panns/train.py b/paddlespeech/cls/exps/panns/train.py old mode 100644 new mode 100755 index fba38a01c..133893081 --- a/paddlespeech/cls/exps/panns/train.py +++ b/paddlespeech/cls/exps/panns/train.py @@ -101,7 +101,7 @@ if __name__ == "__main__": optimizer.clear_grad() # Calculate loss - avg_loss += loss.numpy()[0] + avg_loss += float(loss) # Calculate metrics preds = paddle.argmax(logits, axis=1) diff --git a/paddlespeech/cls/models/__init__.py b/paddlespeech/cls/models/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/models/panns/__init__.py b/paddlespeech/cls/models/panns/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/models/panns/classifier.py b/paddlespeech/cls/models/panns/classifier.py old mode 100644 new mode 100755 diff --git a/paddlespeech/cls/models/panns/panns.py b/paddlespeech/cls/models/panns/panns.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/__init__.py b/paddlespeech/kws/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/__init__.py b/paddlespeech/kws/exps/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/mdtc/__init__.py b/paddlespeech/kws/exps/mdtc/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/mdtc/collate.py b/paddlespeech/kws/exps/mdtc/collate.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/exps/mdtc/train.py b/paddlespeech/kws/exps/mdtc/train.py old mode 100644 new mode 100755 index 94e45d590..d5bb5e020 --- a/paddlespeech/kws/exps/mdtc/train.py +++ b/paddlespeech/kws/exps/mdtc/train.py @@ -110,7 +110,7 @@ if __name__ == '__main__': optimizer.clear_grad() # Calculate loss - avg_loss += loss.numpy()[0] + avg_loss += float(loss) # Calculate metrics num_corrects += corrects diff --git a/paddlespeech/kws/models/__init__.py b/paddlespeech/kws/models/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py old mode 100644 new mode 
100755 diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py old mode 100644 new mode 100755 diff --git a/paddlespeech/resource/__init__.py b/paddlespeech/resource/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py old mode 100644 new mode 100755 index f5ec655b7..ab0b1828c --- a/paddlespeech/resource/model_alias.py +++ b/paddlespeech/resource/model_alias.py @@ -18,6 +18,12 @@ __all__ = [ # Records of model name to import class model_alias = { + # --------------------------------- + # -------------- SSL -------------- + # --------------------------------- + "wav2vec2ASR": ["paddlespeech.s2t.models.wav2vec2:Wav2vec2ASR"], + "wav2vec2": ["paddlespeech.s2t.models.wav2vec2:Wav2vec2Base"], + # --------------------------------- # -------------- ASR -------------- # --------------------------------- @@ -25,11 +31,15 @@ model_alias = { "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"], "conformer": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"], - "conformer_u2pp": ["paddlespeech.s2t.models.u2:U2Model"], "conformer_u2pp_online": ["paddlespeech.s2t.models.u2:U2Model"], "transformer": ["paddlespeech.s2t.models.u2:U2Model"], "wenetspeech": ["paddlespeech.s2t.models.u2:U2Model"], + # --------------------------------- + # ------------ Whisper ------------ + # --------------------------------- + "whisper": ["paddlespeech.s2t.models.whisper:Whisper"], + # --------------------------------- # -------------- CLS -------------- # --------------------------------- diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py old mode 100644 new mode 100755 index efd6bb3f2..067246749 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -25,6 +25,8 @@ __all__ = [ 'tts_static_pretrained_models', 'tts_onnx_pretrained_models', 'vector_dynamic_pretrained_models', + 'ssl_dynamic_pretrained_models', + 'whisper_dynamic_pretrained_models', ] # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". 
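Following the tag convention in the comment above, here is a hedged usage sketch for the new `whisper` task; keyword names mirror the `WhisperExecutor.__call__` signature added earlier in this diff, and the input wav path is an assumption:

```python
# Hypothetical Python-API call of the WhisperExecutor introduced in this PR;
# keyword names follow the __call__ signature in paddlespeech/cli/whisper/infer.py.
from paddlespeech.cli.whisper import WhisperExecutor

whisper_executor = WhisperExecutor()
result = whisper_executor(
    audio_file="./zh.wav",  # assumed 16 kHz, 16-bit mono wav under 50 s
    model="whisper",
    task="transcribe",      # or "translate" (translate speech to English)
    size="large",           # resolves to the whisper-large-16k tag below
    sample_rate=16000,
    device="cpu")           # defaults to paddle.get_device()
print(result)
```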
@@ -32,6 +34,44 @@ __all__ = [ # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" +# --------------------------------- +# -------------- SSL -------------- +# --------------------------------- +ssl_dynamic_pretrained_models = { + "wav2vec2-en-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2-large-960h-lv60-self_ckpt_1.3.0.model.tar.gz', + 'md5': + 'acc46900680e341e500437aa59193518', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'wav2vec2-large-960h-lv60-self', + 'model': + 'wav2vec2-large-960h-lv60-self.pdparams', + 'params': + 'wav2vec2-large-960h-lv60-self.pdparams', + }, + }, + "wav2vec2ASR_librispeech-en-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.1.model.tar.gz', + 'md5': + 'cbe28d6c78f3dd2e189968402381f454', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/wav2vec2ASR/checkpoints/avg_1', + 'model': + 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', + 'params': + 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams', + }, + }, +} + # --------------------------------- # -------------- ASR -------------- # --------------------------------- @@ -68,32 +108,12 @@ asr_dynamic_pretrained_models = { '', }, }, - "conformer_u2pp_wenetspeech-zh-16k": { - '1.1': { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.3.model.tar.gz', - 'md5': - '662b347e1d2131b7a4dc5398365e2134', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10', - 'model': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'params': - 'exp/chunk_conformer_u2pp/checkpoints/avg_10.pdparams', - 'lm_url': - '', - 'lm_md5': - '', - }, - }, "conformer_u2pp_online_wenetspeech-zh-16k": { - '1.1': { + '1.3': { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.1.4.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz', 'md5': - '3100fc1eac5779486cab859366992d0b', + '62d230c1bf27731192aa9d3b8deca300', 'cfg_path': 'model.yaml', 'ckpt_path': @@ -444,6 +464,189 @@ asr_onnx_pretrained_models = { }, } +whisper_dynamic_pretrained_models = { + "whisper-large-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-large-model.tar.gz', + 'md5': + 'cf1557af9d8ffa493fefad9cb08ae189', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-large-model', + 'model': + 'whisper-large-model.pdparams', + 'params': + 'whisper-large-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-base-en-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-en-model.tar.gz', + 'md5': + 'b156529aefde6beb7726d2ea98fd067a', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-base-en-model', + 'model': + 'whisper-base-en-model.pdparams', + 'params': + 'whisper-base-en-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-base-16k": { + '1.3': { + 'url': + 
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-model.tar.gz', + 'md5': + '6b012a5abd583db14398c3492e47120b', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-base-model', + 'model': + 'whisper-base-model.pdparams', + 'params': + 'whisper-base-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-medium-en-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-en-model.tar.gz', + 'md5': + 'c7f57d270bd20c7b170ba9dcf6c16f74', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-medium-en-model', + 'model': + 'whisper-medium-en-model.pdparams', + 'params': + 'whisper-medium-en-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-medium-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-model.tar.gz', + 'md5': + '4c7dcd0df25f408199db4a4548336786', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-medium-model', + 'model': + 'whisper-medium-model.pdparams', + 'params': + 'whisper-medium-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-small-en-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-en-model.tar.gz', + 'md5': + '2b24efcb2e93f3275af7c0c7f598ff1c', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-small-en-model', + 'model': + 'whisper-small-en-model.pdparams', + 'params': + 'whisper-small-en-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-small-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-model.tar.gz', + 'md5': + '5a57911dd41651dd6ed78c5763912825', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-small-model', + 'model': + 'whisper-small-model.pdparams', + 'params': + 'whisper-small-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-tiny-en-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-en-model.tar.gz', + 'md5': + '14969164a3f713fd58e56978c34188f6', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-tiny-en-model', + 'model': + 'whisper-tiny-en-model.pdparams', + 'params': + 'whisper-tiny-en-model.pdparams', + 'resource_data': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, + "whisper-tiny-16k": { + '1.3': { + 'url': + 'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-model.tar.gz', + 'md5': + 'a5b82a1f2067a2ca400f17fabd62b81b', + 'cfg_path': + 'whisper.yaml', + 'ckpt_path': + 'whisper-tiny-model', + 'model': + 'whisper-tiny-model.pdparams', + 'params': + 'whisper-tiny-model.pdparams', + 'resource_data': + 
'https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/assets.tar', + 'resource_data_md5': + '37a0a8abdb3641a51194f79567a93b61', + }, + }, +} + # --------------------------------- # -------------- CLS -------------- # --------------------------------- @@ -743,6 +946,22 @@ tts_dynamic_pretrained_models = { 'speaker_id_map.txt', }, }, + "fastspeech2_male-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_ckpt_1.3.0.zip', + 'md5': + 'a4b1a2f667b878ec8f67375357b04282', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + }, # tacotron2 "tacotron2_csmsc-zh": { '1.0': { @@ -833,6 +1052,20 @@ tts_dynamic_pretrained_models = { 'feats_stats.npy', }, }, + "pwgan_male-zh": { + '1.0': { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.3.0.zip', + 'md5': + 'c98cdb889c809973f8cc764437311132', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_200000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + }, # mb_melgan "mb_melgan_csmsc-zh": { '1.0': { diff --git a/paddlespeech/resource/resource.py b/paddlespeech/resource/resource.py old mode 100644 new mode 100755 index 8e9914b2e..4eb0e32d7 --- a/paddlespeech/resource/resource.py +++ b/paddlespeech/resource/resource.py @@ -22,7 +22,9 @@ from ..utils.dynamic_import import dynamic_import from ..utils.env import MODEL_HOME from .model_alias import model_alias -task_supported = ['asr', 'cls', 'st', 'text', 'tts', 'vector', 'kws'] +task_supported = [ + 'asr', 'cls', 'st', 'text', 'tts', 'vector', 'kws', 'ssl', 'whisper' +] model_format_supported = ['dynamic', 'static', 'onnx'] inference_mode_supported = ['online', 'offline'] @@ -108,7 +110,6 @@ class CommonTaskResource: """ assert model_name in model_alias, 'No model classes found for "{}"'.format( model_name) - ret = [] for import_path in model_alias[model_name]: ret.append(dynamic_import(import_path)) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/README.md b/paddlespeech/s2t/decoders/README.md old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/__init__.py b/paddlespeech/s2t/decoders/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/beam_search/__init__.py b/paddlespeech/s2t/decoders/beam_search/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/beam_search/batch_beam_search.py b/paddlespeech/s2t/decoders/beam_search/batch_beam_search.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/ctcdecoder/__init__.py b/paddlespeech/s2t/decoders/ctcdecoder/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/ctcdecoder/decoders_deprecated.py b/paddlespeech/s2t/decoders/ctcdecoder/decoders_deprecated.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py b/paddlespeech/s2t/decoders/ctcdecoder/scorer_deprecated.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py b/paddlespeech/s2t/decoders/ctcdecoder/swig_wrapper.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py b/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/recog_bin.py b/paddlespeech/s2t/decoders/recog_bin.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/scorers/__init__.py b/paddlespeech/s2t/decoders/scorers/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/scorers/length_bonus.py b/paddlespeech/s2t/decoders/scorers/length_bonus.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/scorers/ngram.py b/paddlespeech/s2t/decoders/scorers/ngram.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/scorers/scorer_interface.py b/paddlespeech/s2t/decoders/scorers/scorer_interface.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/decoders/utils.py b/paddlespeech/s2t/decoders/utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/__init__.py b/paddlespeech/s2t/exps/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/__init__.py b/paddlespeech/s2t/exps/deepspeech2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py b/paddlespeech/s2t/exps/deepspeech2/bin/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/record.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/record.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py old mode 
100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/lm/transformer/__init__.py b/paddlespeech/s2t/exps/lm/transformer/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py b/paddlespeech/s2t/exps/lm/transformer/bin/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py b/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py b/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/__init__.py b/paddlespeech/s2t/exps/u2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/bin/__init__.py b/paddlespeech/s2t/exps/u2/bin/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py old mode 100644 new mode 100755 index c38134c57..6d361c5fd --- a/paddlespeech/s2t/exps/u2/bin/quant.py +++ b/paddlespeech/s2t/exps/u2/bin/quant.py @@ -11,13 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Evaluation for U2 model.""" -import os -import sys -from pathlib import Path - +"""Quantization of the U2 model.""" import paddle -import soundfile +from kaldiio import ReadHelper from paddleslim import PTQ from yacs.config import CfgNode @@ -34,7 +30,7 @@ class U2Infer(): def __init__(self, config, args): self.args = args self.config = config - self.audio_file = args.audio_file + self.audio_scp = args.audio_scp self.preprocess_conf = config.preprocess_config self.preprocess_args = {"train": False} @@ -63,133 +59,112 @@ class U2Infer(): self.model.set_state_dict(model_dict) def run(self): - check(args.audio_file) - - with paddle.no_grad(): - # read - audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) - audio = audio[:, 0] - logger.info(f"audio shape: {audio.shape}") - - # fbank - feat = self.preprocessing(audio, **self.preprocess_args) - logger.info(f"feat shape: {feat.shape}") - - ilen = paddle.to_tensor(feat.shape[0]) - xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) - decode_config = self.config.decode - logger.info(f"decode cfg: {decode_config}") - reverse_weight = getattr(decode_config, 'reverse_weight', 0.0) - result_transcripts = self.model.decode( - xs, - ilen, - text_feature=self.text_feature, - decoding_method=decode_config.decoding_method, - beam_size=decode_config.beam_size, - ctc_weight=decode_config.ctc_weight, - decoding_chunk_size=decode_config.decoding_chunk_size, - num_decoding_left_chunks=decode_config.num_decoding_left_chunks, - simulate_streaming=decode_config.simulate_streaming, - reverse_weight=reverse_weight) - rsl = result_transcripts[0][0] - utt = Path(self.audio_file).name - logger.info(f"hyp: {utt} {rsl}") - # print(self.model) - # print(self.model.forward_encoder_chunk) - - logger.info("-------------start quant ----------------------") - batch_size = 1 - feat_dim = 80 - model_size = 512 - num_left_chunks = -1 - reverse_weight = 0.3 - logger.info( - f"U2
Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}, reverse_weight {reverse_weight}" - ) - - # ######################## self.model.forward_encoder_chunk ############ - # input_spec = [ - # # (T,), int16 - # paddle.static.InputSpec(shape=[None], dtype='int16'), - # ] - # self.model.forward_feature = paddle.jit.to_static( - # self.model.forward_feature, input_spec=input_spec) - - ######################### self.model.forward_encoder_chunk ############ - input_spec = [ - # xs, (B, T, D) - paddle.static.InputSpec( - shape=[batch_size, None, feat_dim], dtype='float32'), - # offset, int, but need be tensor - paddle.static.InputSpec(shape=[1], dtype='int32'), - # required_cache_size, int - num_left_chunks, - # att_cache - paddle.static.InputSpec( - shape=[None, None, None, None], dtype='float32'), - # cnn_cache - paddle.static.InputSpec( - shape=[None, None, None, None], dtype='float32') - ] - self.model.forward_encoder_chunk = paddle.jit.to_static( - self.model.forward_encoder_chunk, input_spec=input_spec) - - ######################### self.model.ctc_activation ######################## - input_spec = [ - # encoder_out, (B,T,D) - paddle.static.InputSpec( - shape=[batch_size, None, model_size], dtype='float32') - ] - self.model.ctc_activation = paddle.jit.to_static( - self.model.ctc_activation, input_spec=input_spec) - - ######################### self.model.forward_attention_decoder ######################## - input_spec = [ - # hyps, (B, U) - paddle.static.InputSpec(shape=[None, None], dtype='int64'), - # hyps_lens, (B,) - paddle.static.InputSpec(shape=[None], dtype='int64'), - # encoder_out, (B,T,D) - paddle.static.InputSpec( - shape=[batch_size, None, model_size], dtype='float32'), - reverse_weight - ] - self.model.forward_attention_decoder = paddle.jit.to_static( - self.model.forward_attention_decoder, input_spec=input_spec) - ################################################################################ - - # jit save - logger.info(f"export save: {self.args.export_path}") - config = { - 'is_static': True, - 'combine_params': True, - 'skip_forward': True - } - self.ptq.save_quantized_model(self.model, self.args.export_path) - # paddle.jit.save( - # self.model, - # self.args.export_path, - # combine_params=True, - # skip_forward=True) - - -def check(audio_file): - if not os.path.isfile(audio_file): - print("Please input the right audio file path") - sys.exit(-1) - - logger.info("checking the audio file format......") - try: - sig, sample_rate = soundfile.read(audio_file) - except Exception as e: - logger.error(str(e)) - logger.error( - "can not open the wav file, please check the audio file format") - sys.exit(-1) - logger.info("The sample rate is %d" % sample_rate) - assert (sample_rate == 16000) - logger.info("The audio file format is right") + cnt = 0 + with ReadHelper(f"scp:{self.audio_scp}") as reader: + for key, (rate, audio) in reader: + assert rate == 16000 + cnt += 1 + if cnt > args.num_utts: + break + + with paddle.no_grad(): + logger.info(f"audio shape: {audio.shape}") + + # fbank + feat = self.preprocessing(audio, **self.preprocess_args) + logger.info(f"feat shape: {feat.shape}") + + ilen = paddle.to_tensor(feat.shape[0]) + xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) + decode_config = self.config.decode + logger.info(f"decode cfg: {decode_config}") + result_transcripts = self.model.decode( + xs, + ilen, + text_feature=self.text_feature, + decoding_method=decode_config.decoding_method, + 
beam_size=decode_config.beam_size, + ctc_weight=decode_config.ctc_weight, + decoding_chunk_size=decode_config.decoding_chunk_size, + num_decoding_left_chunks=decode_config. + num_decoding_left_chunks, + simulate_streaming=decode_config.simulate_streaming, + reverse_weight=decode_config.reverse_weight) + rsl = result_transcripts[0][0] + utt = key + logger.info(f"hyp: {utt} {rsl}") + # print(self.model) + # print(self.model.forward_encoder_chunk) + + logger.info("-------------start quant ----------------------") + batch_size = 1 + feat_dim = 80 + model_size = 512 + num_left_chunks = -1 + reverse_weight = 0.3 + logger.info( + f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}, reverse_weight {reverse_weight}" + ) + + # ######################## self.model.forward_encoder_chunk ############ + # input_spec = [ + # # (T,), int16 + # paddle.static.InputSpec(shape=[None], dtype='int16'), + # ] + # self.model.forward_feature = paddle.jit.to_static( + # self.model.forward_feature, input_spec=input_spec) + + ######################### self.model.forward_encoder_chunk ############ + input_spec = [ + # xs, (B, T, D) + paddle.static.InputSpec( + shape=[batch_size, None, feat_dim], dtype='float32'), + # offset, int, but needs to be a tensor + paddle.static.InputSpec(shape=[1], dtype='int32'), + # required_cache_size, int + num_left_chunks, + # att_cache + paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32'), + # cnn_cache + paddle.static.InputSpec( + shape=[None, None, None, None], dtype='float32') + ] + self.model.forward_encoder_chunk = paddle.jit.to_static( + self.model.forward_encoder_chunk, input_spec=input_spec) + + ######################### self.model.ctc_activation ######################## + input_spec = [ + # encoder_out, (B,T,D) + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32') + ] + self.model.ctc_activation = paddle.jit.to_static( + self.model.ctc_activation, input_spec=input_spec) + + ######################### self.model.forward_attention_decoder ######################## + input_spec = [ + # hyps, (B, U) + paddle.static.InputSpec(shape=[None, None], dtype='int64'), + # hyps_lens, (B,) + paddle.static.InputSpec(shape=[None], dtype='int64'), + # encoder_out, (B,T,D) + paddle.static.InputSpec( + shape=[batch_size, None, model_size], dtype='float32'), + reverse_weight + ] + self.model.forward_attention_decoder = paddle.jit.to_static( + self.model.forward_attention_decoder, input_spec=input_spec) + ################################################################################ + + # jit save + logger.info(f"export save: {self.args.export_path}") + self.ptq.ptq._convert(self.model) + paddle.jit.save( + self.model, + self.args.export_path, + combine_params=True, + skip_forward=True) def main(config, args): @@ -202,11 +177,16 @@ if __name__ == "__main__": parser.add_argument( "--result_file", type=str, help="path of save the asr result") parser.add_argument( - "--audio_file", type=str, help="path of the input audio file") + "--audio_scp", type=str, help="path of the input audio scp file") + parser.add_argument( + "--num_utts", + type=int, + default=200, + help="num utts for quant calibration.") parser.add_argument( "--export_path", type=str, - default='export', + default='export.jit.quant', help="path to save the exported quantized model") args = parser.parse_args() diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py old mode 100644 new mode 100755
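The quant.py rewrite above replaces the single --audio_file input with a Kaldi script (scp) list read through kaldiio, so PTQ calibration can run over many utterances instead of one. A minimal sketch of that input loop, assuming a hypothetical data/calib.scp list of 16 kHz WAVs:

    from kaldiio import ReadHelper

    num_utts = 200  # mirrors the new --num_utts default
    cnt = 0
    with ReadHelper("scp:data/calib.scp") as reader:  # hypothetical scp path
        for key, (rate, audio) in reader:
            assert rate == 16000  # the quant flow expects 16 kHz input
            cnt += 1
            if cnt > num_utts:
                break
            # run the model forward here so PTQ can observe activation ranges

Each pass feeds one utterance through the model so PaddleSlim's PTQ can collect calibration statistics; afterwards self.ptq.ptq._convert(...) and paddle.jit.save(...) emit the static quantized graph, as in the hunk above.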
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py old mode 100644 new mode 100755 index d12ea3646..0df443193 --- a/paddlespeech/s2t/exps/u2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py @@ -16,6 +16,8 @@ import os import sys from pathlib import Path +import distutils.util +import numpy as np import paddle import soundfile from yacs.config import CfgNode @@ -74,6 +76,8 @@ class U2Infer(): # fbank feat = self.preprocessing(audio, **self.preprocess_args) logger.info(f"feat shape: {feat.shape}") + if self.args.debug: + np.savetxt("feat.transform.txt", feat) ilen = paddle.to_tensor(feat.shape[0]) xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0) @@ -126,6 +130,11 @@ if __name__ == "__main__": "--result_file", type=str, help="path of save the asr result") parser.add_argument( "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--debug", + type=distutils.util.strtobool, + default=False, + help="for debug.") args = parser.parse_args() config = CfgNode(new_allowed=True) diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2/trainer.py b/paddlespeech/s2t/exps/u2/trainer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_kaldi/__init__.py b/paddlespeech/s2t/exps/u2_kaldi/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py b/paddlespeech/s2t/exps/u2_kaldi/bin/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py b/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_st/__init__.py b/paddlespeech/s2t/exps/u2_st/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_st/bin/__init__.py b/paddlespeech/s2t/exps/u2_st/bin/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/exps/wav2vec2/__init__.py b/paddlespeech/s2t/exps/wav2vec2/__init__.py new file mode 100755 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py old mode 100644 new mode 100755 index 185a92b8d..97043fd7b --- a/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py old mode 100644 new mode 100755 index d1a6fd405..a376651df --- a/paddlespeech/s2t/exps/wav2vec2/bin/test.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py old mode 100644 new mode 100755 index 3a537bce5..0d66ac410 --- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/train.py b/paddlespeech/s2t/exps/wav2vec2/bin/train.py old mode 100644 new mode 100755 index b2edecca1..29e7ef552 --- a/paddlespeech/s2t/exps/wav2vec2/bin/train.py +++ b/paddlespeech/s2t/exps/wav2vec2/bin/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,9 +34,10 @@ def main(config, args): if __name__ == "__main__": parser = default_argument_parser() + parser.add_argument( + '--resume', type=str, default="", nargs="?", help='resume ckpt path.') args = parser.parse_args() print_arguments(args, globals()) - # https://yaml.org/type/float.html config = CfgNode(new_allowed=True) if args.config: diff --git a/paddlespeech/s2t/exps/wav2vec2/model.py b/paddlespeech/s2t/exps/wav2vec2/model.py index 9d72e067d..1144128fb 100755 --- a/paddlespeech/s2t/exps/wav2vec2/model.py +++ b/paddlespeech/s2t/exps/wav2vec2/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
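The new --resume flag added to wav2vec2/bin/train.py above uses nargs="?", so the option may be omitted, given bare, or given a checkpoint tag. A small standalone sketch of how argparse resolves the three cases (plain argparse behavior, not PaddleSpeech-specific):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--resume', type=str, default="", nargs="?", help='resume ckpt path.')

    print(parser.parse_args([]).resume)                  # "" -> train from scratch
    print(parser.parse_args(["--resume"]).resume)        # None (no const configured)
    print(parser.parse_args(["--resume", "10"]).resume)  # "10" -> epoch tag to restore

Only a non-empty value takes the resume branch in resume_or_scratch() further below; the other two cases fall through to training from scratch.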
@@ -15,6 +15,7 @@ import json import math import os +import re import time from collections import defaultdict from collections import OrderedDict @@ -84,13 +85,14 @@ class Wav2Vec2ASRTrainer(Trainer): # forward utt, wav, wavs_lens, target, target_lens = batch wavs_lens_rate = wavs_lens / wav.shape[1] - target_lens_rate = target_lens / target.shape[1] + wav = wav[:, :, 0] - wav = self.speech_augmentation(wav, wavs_lens_rate) - loss = self.model(wav, wavs_lens_rate, target, target_lens_rate) + if hasattr(train_conf, 'audio_augment'): + wav = self.speech_augmentation(wav, wavs_lens_rate) + + loss = self.model(wav, wavs_lens_rate, target, target_lens) # loss div by `batch_size * accum_grad` loss /= train_conf.accum_grad - # update self.avg_train_loss self.update_average(batch_index, float(loss)) @@ -117,13 +119,12 @@ class Wav2Vec2ASRTrainer(Trainer): if not train_conf.freeze_wav2vec2: self.wav2vec2_optimizer.step() self.wav2vec2_optimizer.clear_grad() - if self.config.model_scheduler is not 'newbobscheduler': + if self.config.model_scheduler != 'newbobscheduler': self.model_lr_scheduler.step() - if self.config.wav2vec2_scheduler is not 'newbobscheduler': + if self.config.wav2vec2_scheduler != 'newbobscheduler': if not train_conf.freeze_wav2vec2: self.wav2vec2_lr_scheduler.step() self.iteration += 1 - losses_np = {'loss': self.avg_train_loss * train_conf.accum_grad} iteration_time = time.time() - start for k, v in losses_np.items(): @@ -135,7 +136,10 @@ class Wav2Vec2ASRTrainer(Trainer): if (batch_index + 1) % train_conf.accum_grad == 0: if dist.get_rank() == 0 and self.visualizer: losses_np_v = losses_np.copy() - losses_np_v.update({"model_lr": self.model_lr_scheduler(), "wav2vec2_lr": self.wav2vec2_lr_scheduler()}) + losses_np_v.update({ + "model_lr": self.model_lr_scheduler(), + "wav2vec2_lr": self.wav2vec2_lr_scheduler() + }) for key, val in losses_np_v.items(): self.visualizer.add_scalar( tag='train/' + key, value=val, step=self.iteration - 1) @@ -152,9 +156,8 @@ class Wav2Vec2ASRTrainer(Trainer): for i, batch in enumerate(self.valid_loader): utt, wav, wavs_lens, target, target_lens = batch wavs_lens_rate = wavs_lens / wav.shape[1] - target_lens_rate = target_lens / target.shape[1] wav = wav[:, :, 0] - loss = self.model(wav, wavs_lens_rate, target, target_lens_rate) + loss = self.model(wav, wavs_lens_rate, target, target_lens) if math.isfinite(float(loss)): num_utts = batch[1].shape[0] @@ -280,6 +283,106 @@ class Wav2Vec2ASRTrainer(Trainer): logger.info("Init from scratch!") return scratch + @mp_tools.rank_zero_only + def save(self, tag=None, infos: dict=None): + """Save checkpoint (model parameters and optimizer states). + + Args: + tag (int or str, optional): None for step, else using tag, e.g. epoch. Defaults to None. + infos (dict, optional): meta data to save. Defaults to None.
+ """ + + infos = infos if infos else dict() + infos.update({ + "epoch": self.epoch, + "model_lr": self.model_optimizer.get_lr(), + "wav2vec2_lr": self.wav2vec2_optimizer.get_lr() + }) + + checkpoint_path = os.path.join( + self.checkpoint_dir, + "{}".format(self.iteration if tag is None else tag)) + + model_dict = self.model.state_dict() + params_path = checkpoint_path + ".pdparams" + paddle.save(model_dict, params_path) + logger.info("Saved model to {}".format(params_path)) + + model_opt_dict = self.model_optimizer.state_dict() + wav2vec2_opt_dict = self.wav2vec2_optimizer.state_dict() + + opt_dict = {'model': model_opt_dict, 'wav2vec2': wav2vec2_opt_dict} + + optimizer_path = checkpoint_path + ".pdopt" + paddle.save(opt_dict, optimizer_path) + logger.info("Saved optimzier state to {}".format(optimizer_path)) + + scheduler_dict = {} + + if self.config.model_scheduler == 'newbobscheduler': + scheduler_dict['model'] = self.model_lr_scheduler.save() + if self.config.wav2vec2_scheduler == 'newbobscheduler': + scheduler_dict['wav2vec2'] = self.wav2vec2_lr_scheduler.save() + if scheduler_dict: + scheduler_path = checkpoint_path + ".pdlrs" + paddle.save(scheduler_dict, scheduler_path) + logger.info("Saved scheduler state to {}".format(scheduler_path)) + info_path = re.sub('.pdparams$', '.json', params_path) + infos = {} if infos is None else infos + with open(info_path, 'w') as fout: + data = json.dumps(infos) + fout.write(data) + + def resume_or_scratch(self): + """Resume from latest checkpoint at checkpoints in the output + directory or load a specified checkpoint. + + If ``args.checkpoint_path`` is not None, load the checkpoint, else + resume training. + """ + scratch = None + if self.args.resume: + # just restore ckpt + # lr will resotre from optimizer ckpt + resume_json_path = os.path.join(self.checkpoint_dir, + self.args.resume + '.json') + with open(resume_json_path, 'r') as f: + resume_json = json.load(f) + self.iteration = 0 + self.epoch = resume_json["epoch"] + + # resotre model from *.pdparams + params_path = os.path.join(self.checkpoint_dir, + "{}".format(self.epoch)) + '.pdparams' + model_dict = paddle.load(params_path) + self.model.set_state_dict(model_dict) + + # resotre optimizer from *.pdopt + optimizer_path = os.path.join(self.checkpoint_dir, + "{}".format(self.epoch)) + '.pdopt' + optimizer_dict = paddle.load(optimizer_path) + self.model_optimizer.set_state_dict(optimizer_dict['model']) + self.wav2vec2_optimizer.set_state_dict(optimizer_dict['wav2vec2']) + + # resotre lr_scheduler from *.pdlrs + scheduler_path = os.path.join(self.checkpoint_dir, + "{}".format(self.epoch)) + '.pdlrs' + if os.path.isfile(os.path.join(scheduler_path)): + scheduler_dict = paddle.load(scheduler_path) + if self.config.model_scheduler == 'newbobscheduler': + self.model_lr_scheduler.load(scheduler_dict['model']) + if self.config.wav2vec2_scheduler == 'newbobscheduler': + self.wav2vec2_lr_scheduler.load(scheduler_dict['wav2vec2']) + logger.info( + f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!") + scratch = False + else: + self.iteration = 0 + self.epoch = 0 + scratch = True + logger.info("Init from scratch!") + return scratch + def do_train(self): """The training process control by step.""" # !!!IMPORTANT!!! 
@@ -290,7 +393,6 @@ class Wav2Vec2ASRTrainer(Trainer): # paddle.jit.save(script_model, script_model_path) self.before_train() - if not self.use_streamdata: logger.info( f"Train Total Examples: {len(self.train_loader.dataset)}") @@ -307,8 +409,9 @@ class Wav2Vec2ASRTrainer(Trainer): report("Rank", dist.get_rank()) report("epoch", self.epoch) report('step', self.iteration) - report("model_lr", self.model_lr_scheduler()) - report("wav2vec2_lr", self.wav2vec2_lr_scheduler()) + report("model_lr", self.model_optimizer.get_lr()) + report("wav2vec2_lr", + self.wav2vec2_optimizer.get_lr()) self.train_batch(batch_index, batch, msg) self.after_train_batch() report('iter', batch_index + 1) @@ -352,15 +455,18 @@ class Wav2Vec2ASRTrainer(Trainer): self.visualizer.add_scalar( tag='eval/cv_loss', value=cv_loss, step=self.epoch) self.visualizer.add_scalar( - tag='eval/model_lr', value=self.model_lr_scheduler(), step=self.epoch) + tag='eval/model_lr', + value=self.model_lr_scheduler(), + step=self.epoch) self.visualizer.add_scalar( - tag='eval/wav2vec2_lr', value=self.wav2vec2_lr_scheduler(), step=self.epoch) - - if self.config.model_scheduler is 'newbobscheduler': + tag='eval/wav2vec2_lr', + value=self.wav2vec2_lr_scheduler(), + step=self.epoch) + if self.config.model_scheduler == 'newbobscheduler': self.model_lr_scheduler.step(cv_loss) - if self.config.wav2vec2_scheduler is 'newbobscheduler': + if self.config.wav2vec2_scheduler == 'newbobscheduler': if not self.config.freeze_wav2vec2: - self.wav2vec2_scheduler.step(cv_loss) + self.wav2vec2_lr_scheduler.step(cv_loss) self.save(tag=self.epoch, infos={'val_loss': cv_loss}) self.new_epoch() @@ -395,17 +501,20 @@ class Wav2Vec2ASRTrainer(Trainer): model_conf.output_dim = self.test_loader.vocab_size model = Wav2vec2ASR.from_config(model_conf) + model_dict = paddle.load(config.wav2vec2_params_path) + model.wav2vec2.set_state_dict(model_dict) if self.parallel: model = paddle.DataParallel(model, find_unused_parameters=True) - logger.info(f"{model}") layer_tools.print_params(model, logger.info) self.model = model logger.info("Setup model!") # setup speech augmentation for wav2vec2 - self.speech_augmentation = TimeDomainSpecAugment() + if hasattr(config, 'audio_augment') and self.train: + self.speech_augmentation = TimeDomainSpecAugment( + **config.audio_augment) if not self.train: return @@ -421,19 +530,18 @@ class Wav2Vec2ASRTrainer(Trainer): wav2vec2_scheduler_type = train_config.wav2vec2_scheduler wav2vec2_scheduler_conf = train_config.wav2vec2_scheduler_conf + model_scheduler_args = dict( + **{"learning_rate": model_optim_conf.lr, + "verbose": False}, **(dict(model_scheduler_conf))) - model_scheduler_args = dict(**{ - "learning_rate": model_optim_conf.lr, - "verbose": False}, **(dict(model_scheduler_conf))) - - wav2vec2_scheduler_args = dict(**{ - "learning_rate": wav2vec2_optim_conf.lr, - "verbose": False}, **(dict(wav2vec2_scheduler_conf))) + wav2vec2_scheduler_args = dict( + **{"learning_rate": wav2vec2_optim_conf.lr, + "verbose": False}, **(dict(wav2vec2_scheduler_conf))) model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type, - model_scheduler_args) - wav2vec2_lr_scheduler = LRSchedulerFactory.from_args(wav2vec2_scheduler_type, - wav2vec2_scheduler_args) + model_scheduler_args) + wav2vec2_lr_scheduler = LRSchedulerFactory.from_args( + wav2vec2_scheduler_type, wav2vec2_scheduler_args) def optimizer_args( config, @@ -444,18 +552,37 @@ class Wav2Vec2ASRTrainer(Trainer): train_config = config optim_arg = dict(optim_conf) optim_arg.update({ - 
"grad_clip": train_config.global_grad_clip, - "learning_rate": lr_scheduler - if lr_scheduler else optim_conf.lr, - "parameters": parameters}) + "grad_clip": + train_config.global_grad_clip, + "learning_rate": + lr_scheduler if lr_scheduler else optim_conf.lr, + "parameters": + parameters + }) return optim_arg - model_optimizer_args = optimizer_args(config, model_optim_type, model_optim_conf, - [*model.enc.parameters(), *model.ctc.parameters()], model_lr_scheduler) - wav2vec2_optimizer_args = optimizer_args(config, wav2vec2_optim_type, wav2vec2_optim_conf, - model.wav2vec2.parameters(), wav2vec2_lr_scheduler) - model_optimizer = OptimizerFactory.from_args(model_optim_type, model_optimizer_args) - wav2vec2_optimizer = OptimizerFactory.from_args(wav2vec2_optim_type, wav2vec2_optimizer_args) + model_optimizer_args = optimizer_args(config, model_optim_type, + model_optim_conf, [{ + 'params': + model._layers.enc.parameters() + }, { + 'params': + model._layers.ctc.parameters() + }] if self.parallel else [{ + 'params': + model.enc.parameters() + }, { + 'params': + model.ctc.parameters() + }], model_lr_scheduler) + wav2vec2_optimizer_args = optimizer_args( + config, wav2vec2_optim_type, wav2vec2_optim_conf, + model._layers.wav2vec2.parameters() if self.parallel else + model.wav2vec2.parameters(), wav2vec2_lr_scheduler) + model_optimizer = OptimizerFactory.from_args(model_optim_type, + model_optimizer_args) + wav2vec2_optimizer = OptimizerFactory.from_args(wav2vec2_optim_type, + wav2vec2_optimizer_args) self.model_optimizer = model_optimizer self.wav2vec2_optimizer = wav2vec2_optimizer diff --git a/paddlespeech/s2t/exps/whisper/test_wav.py b/paddlespeech/s2t/exps/whisper/test_wav.py new file mode 100755 index 000000000..e04eec4f2 --- /dev/null +++ b/paddlespeech/s2t/exps/whisper/test_wav.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.∏ +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Modified from Whisper (https://github.com/openai/whisper/whisper/) +import os.path +import sys + +import distutils.util +import numpy as np +import paddle +import soundfile +from yacs.config import CfgNode + +from paddlespeech.s2t.models.whisper import log_mel_spectrogram +from paddlespeech.s2t.models.whisper import ModelDimensions +from paddlespeech.s2t.models.whisper import transcribe +from paddlespeech.s2t.models.whisper import Whisper +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() + + +class WhisperInfer(): + def __init__(self, config, args): + self.args = args + self.config = config + self.audio_file = args.audio_file + + paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + config.pop("ngpu") + + # load model + model_dict = paddle.load(self.config.model_file) + config.pop("model_file") + dims = ModelDimensions(**model_dict["dims"]) + self.model = Whisper(dims) + self.model.load_dict(model_dict) + + def run(self): + check(args.audio_file) + + with paddle.no_grad(): + temperature = config.pop("temperature") + temperature_increment_on_fallback = config.pop( + "temperature_increment_on_fallback") + if temperature_increment_on_fallback is not None: + temperature = tuple( + np.arange(temperature, 1.0 + 1e-6, + temperature_increment_on_fallback)) + else: + temperature = [temperature] + + # load audio + mel = log_mel_spectrogram( + args.audio_file, resource_path=config.resource_path) + + result = transcribe( + self.model, mel, temperature=temperature, **config) + if args.result_file is not None: + with open(args.result_file, 'w') as f: + f.write(str(result)) + return result + + +def check(audio_file: str): + if not os.path.isfile(audio_file): + print("Please input the right audio file path") + sys.exit(-1) + + logger.info("checking the audio file format......") + try: + _, sample_rate = soundfile.read(audio_file) + except Exception as e: + logger.error(str(e)) + logger.error( + "can not open the wav file, please check the audio file format") + sys.exit(-1) + logger.info("The sample rate is %d" % sample_rate) + assert (sample_rate == 16000) + logger.info("The audio file format is right") + + +def main(config, args): + WhisperInfer(config, args).run() + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + "--result_file", type=str, help="path to save the asr result") + parser.add_argument( + "--audio_file", type=str, help="path of the input audio file") + parser.add_argument( + "--debug", + type=distutils.util.strtobool, + default=False, + help="for debug.") + args = parser.parse_args() + + config = CfgNode(new_allowed=True) + + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + main(config, args) diff --git a/paddlespeech/s2t/frontend/__init__.py b/paddlespeech/s2t/frontend/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/__init__.py b/paddlespeech/s2t/frontend/augmentor/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/augmentation.py b/paddlespeech/s2t/frontend/augmentor/augmentation.py old mode 100644 new mode
100755 diff --git a/paddlespeech/s2t/frontend/augmentor/base.py b/paddlespeech/s2t/frontend/augmentor/base.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/impulse_response.py b/paddlespeech/s2t/frontend/augmentor/impulse_response.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py b/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/resample.py b/paddlespeech/s2t/frontend/augmentor/resample.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/shift_perturb.py b/paddlespeech/s2t/frontend/augmentor/shift_perturb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/spec_augment.py b/paddlespeech/s2t/frontend/augmentor/spec_augment.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/speed_perturb.py b/paddlespeech/s2t/frontend/augmentor/speed_perturb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/augmentor/volume_perturb.py b/paddlespeech/s2t/frontend/augmentor/volume_perturb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/featurizer/__init__.py b/paddlespeech/s2t/frontend/featurizer/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py b/paddlespeech/s2t/frontend/featurizer/audio_featurizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py b/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/speech.py b/paddlespeech/s2t/frontend/speech.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/__init__.py b/paddlespeech/s2t/io/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/batchfy.py b/paddlespeech/s2t/io/batchfy.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/converter.py b/paddlespeech/s2t/io/converter.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/io/utility.py b/paddlespeech/s2t/io/utility.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/__init__.py b/paddlespeech/s2t/models/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/asr_interface.py 
b/paddlespeech/s2t/models/asr_interface.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/ds2/__init__.py b/paddlespeech/s2t/models/ds2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/ds2/conv.py b/paddlespeech/s2t/models/ds2/conv.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/lm/__init__.py b/paddlespeech/s2t/models/lm/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/lm/dataset.py b/paddlespeech/s2t/models/lm/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/lm_interface.py b/paddlespeech/s2t/models/lm_interface.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/st_interface.py b/paddlespeech/s2t/models/st_interface.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/u2/__init__.py b/paddlespeech/s2t/models/u2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/u2_st/__init__.py b/paddlespeech/s2t/models/u2_st/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/models/wav2vec2/__init__.py b/paddlespeech/s2t/models/wav2vec2/__init__.py old mode 100644 new mode 100755 index e69de29bb..3a12a9cf3 --- a/paddlespeech/s2t/models/wav2vec2/__init__.py +++ b/paddlespeech/s2t/models/wav2vec2/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .wav2vec2_ASR import Wav2vec2ASR +from .wav2vec2_ASR import Wav2vec2Base + +__all__ = ["Wav2vec2ASR", "Wav2vec2Base"] diff --git a/paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py b/paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py old mode 100644 new mode 100755 index ae141d1b3..9c88796bb --- a/paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/VanillaNN.py @@ -1,11 +1,24 @@ -"""Vanilla Neural Network for simple tests. -Authors -* Elena Rastorgueva 2020 -""" +# Authors +# * Elena Rastorgueva 2020 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/lobes/models/VanillaNN.py). import paddle from paddlespeech.s2t.models.wav2vec2.modules import containers from paddlespeech.s2t.models.wav2vec2.modules import linear +from paddlespeech.s2t.models.wav2vec2.modules.normalization import BatchNorm1d class VanillaNN(containers.Sequential): @@ -27,18 +40,33 @@ class VanillaNN(containers.Sequential): paddle.shape([10, 120, 512]) """ - def __init__( - self, - input_shape, - activation=paddle.nn.LeakyReLU, - dnn_blocks=2, - dnn_neurons=512, ): - super().__init__(input_shape=input_shape) + def __init__(self, + input_shape, + dnn_blocks=2, + dnn_neurons=512, + activation=True, + normalization=False, + dropout_rate=0.5): + super().__init__(input_shape=[None, None, input_shape]) + + if not isinstance(dropout_rate, list): + dropout_rate = [dropout_rate] * dnn_blocks + else: + assert len( + dropout_rate + ) == dnn_blocks, "len(dropout_rate) must equal to dnn_blocks" for block_index in range(dnn_blocks): self.append( linear.Linear, n_neurons=dnn_neurons, - bias=True, + bias_attr=None, layer_name="linear", ) - self.append(activation(), layer_name="act") + if normalization: + self.append( + BatchNorm1d, input_size=dnn_neurons, layer_name='bn') + if activation: + self.append(paddle.nn.LeakyReLU(), layer_name="act") + self.append( + paddle.nn.Dropout(p=dropout_rate[block_index]), + layer_name='dropout') diff --git a/paddlespeech/s2t/models/wav2vec2/modules/activations.py b/paddlespeech/s2t/models/wav2vec2/modules/activations.py old mode 100644 new mode 100755 index 722d8a0d6..af42b8a47 --- a/paddlespeech/s2t/models/wav2vec2/modules/activations.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/activations.py @@ -1,3 +1,4 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/s2t/models/wav2vec2/modules/containers.py b/paddlespeech/s2t/models/wav2vec2/modules/containers.py old mode 100644 new mode 100755 index b39733570..6a6b94e95 --- a/paddlespeech/s2t/models/wav2vec2/modules/containers.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/containers.py @@ -1,3 +1,19 @@ +# Authors +# * Peter Plantinga 2020 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/containers.py). 
import inspect import paddle @@ -125,5 +141,4 @@ class Sequential(paddle.nn.LayerDict): x = layer(x) if isinstance(x, tuple): x = x[0] - return x diff --git a/paddlespeech/s2t/models/wav2vec2/modules/linear.py b/paddlespeech/s2t/models/wav2vec2/modules/linear.py old mode 100644 new mode 100755 index 488949d14..3ea3716c4 --- a/paddlespeech/s2t/models/wav2vec2/modules/linear.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/linear.py @@ -1,8 +1,20 @@ -"""Library implementing linear transformation. -Authors - * Mirco Ravanelli 2020 - * Davide Borra 2021 -""" +# Authors +# * Mirco Ravanelli 2020 +# * Davide Borra 2021 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/linear.py). import logging import paddle @@ -41,7 +53,7 @@ class Linear(paddle.nn.Layer): n_neurons, input_shape=None, input_size=None, - bias=True, + bias_attr=None, combine_dims=False, ): super().__init__() self.combine_dims = combine_dims @@ -55,7 +67,7 @@ class Linear(paddle.nn.Layer): input_size = input_shape[2] * input_shape[3] # Weights are initialized following paddle approach - self.w = align.Linear(input_size, n_neurons, bias_attr=bias) + self.w = align.Linear(input_size, n_neurons, bias_attr=bias_attr) def forward(self, x): """Returns the linear transformation of input tensor. diff --git a/paddlespeech/s2t/models/wav2vec2/modules/modeling_outputs.py b/paddlespeech/s2t/models/wav2vec2/modules/modeling_outputs.py old mode 100644 new mode 100755 index fb2a87122..ab623a996 --- a/paddlespeech/s2t/models/wav2vec2/modules/modeling_outputs.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/modeling_outputs.py @@ -1,3 +1,4 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2020 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py index ea06d1789..688bf5f84 100755 --- a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -1120,9 +1120,6 @@ class Wav2Vec2ConfigPure(): self.output_hidden_states = False self.use_return_dict = True - self.pad_token_id = config.pad_token_id - self.bos_token_id = config.bos_token_id - self.eos_token_id = config.eos_token_id self.hidden_size = config.hidden_size self.feat_extract_norm = config.feat_extract_norm self.feat_extract_activation = config.feat_extract_activation @@ -1145,7 +1142,6 @@ class Wav2Vec2ConfigPure(): self.layerdrop = config.layerdrop self.layer_norm_eps = config.layer_norm_eps self.initializer_range = config.initializer_range - self.vocab_size = config.vocab_size self.do_stable_layer_norm = config.do_stable_layer_norm self.use_weighted_layer_sum = config.use_weighted_layer_sum diff --git a/paddlespeech/s2t/models/wav2vec2/modules/normalization.py b/paddlespeech/s2t/models/wav2vec2/modules/normalization.py index 7716c755a..4415a50eb 100755 --- a/paddlespeech/s2t/models/wav2vec2/modules/normalization.py +++ b/paddlespeech/s2t/models/wav2vec2/modules/normalization.py @@ -16,12 +16,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/nnet/normalization.py) -import paddle import paddle.nn as nn from paddlespeech.s2t.modules.align import BatchNorm1D + + class BatchNorm1d(nn.Layer): """Applies 1d batch normalization to the input tensor. Arguments @@ -52,14 +60,15 @@ class BatchNorm1d(nn.Layer): """ def __init__( - self, - input_shape=None, - input_size=None, - eps=1e-05, - momentum=0.9, - combine_batch_time=False, - skip_transpose=False, ): + self, + input_shape=None, + input_size=None, + eps=1e-05, + momentum=0.9, + combine_batch_time=False, + skip_transpose=False, ): super().__init__() self.combine_batch_time = combine_batch_time self.skip_transpose = skip_transpose @@ -69,11 +87,15 @@ class BatchNorm1d(nn.Layer): elif input_size is None: input_size = input_shape[-1] - self.norm = nn.BatchNorm1D( - input_size, momentum=momentum, epsilon=eps - ) + self.norm = BatchNorm1D(input_size, momentum=momentum, epsilon=eps) def forward(self, x): """Returns the normalized input tensor.
@@ -88,9 +110,14 @@ class BatchNorm1d(nn.Layer): if x.ndim == 3: x = x.reshape(shape_or[0] * shape_or[1], shape_or[2]) else: - x = x.reshape( - shape_or[0] * shape_or[1], shape_or[3], shape_or[2] - ) + x = x.reshape(shape_or[0] * shape_or[1], shape_or[3], + shape_or[2]) elif not self.skip_transpose: x = x.transpose([0, 2, 1]) @@ -101,4 +128,8 @@ class BatchNorm1d(nn.Layer): elif not self.skip_transpose: x_n = x_n.transpose([0, 2, 1]) - return x_n \ No newline at end of file + return x_n diff --git a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py old mode 100644 new mode 100755 index 9998a8e5e..0c4ade7b7 --- a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py +++ b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py @@ -1,12 +1,23 @@ -""" -Low level signal processing utilities -Authors - * Peter Plantinga 2020 - * Francois Grondin 2020 - * William Aris 2020 - * Samuele Cornell 2020 - * Sarthak Yadav 2022 -""" +# Authors +# * Peter Plantinga 2020 +# * Francois Grondin 2020 +# * William Aris 2020 +# * Samuele Cornell 2020 +# * Sarthak Yadav 2022 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/signal_processing.py) import numpy as np import paddle diff --git a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py old mode 100644 new mode 100755 index 471ab7657..9224549a4 --- a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py +++ b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py @@ -1,3 +1,19 @@ +# Authors +# * Peter Plantinga 2020 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/speech_augmentation.py) import math import paddle @@ -623,16 +639,177 @@ class DropChunk(nn.Layer): return dropped_waveform + +class SpecAugment(paddle.nn.Layer): + """An implementation of the SpecAugment algorithm.
+ Reference: + https://arxiv.org/abs/1904.08779 + Arguments + --------- + time_warp : bool + Whether applying time warping. + time_warp_window : int + Time warp window. + time_warp_mode : str + Interpolation mode for time warping (default "bicubic"). + freq_mask : bool + Whether applying freq mask. + freq_mask_width : int or tuple + Freq mask width range. + n_freq_mask : int + Number of freq mask. + time_mask : bool + Whether applying time mask. + time_mask_width : int or tuple + Time mask width range. + n_time_mask : int + Number of time mask. + replace_with_zero : bool + If True, replace masked value with 0, else replace masked value with mean of the input tensor. + Example + ------- + >>> aug = SpecAugment() + >>> a = paddle.rand([8, 120, 80]) + >>> a = aug(a) + >>> print(a.shape) + [8, 120, 80] + """ + + def __init__( + self, + time_warp=True, + time_warp_window=5, + time_warp_mode="bicubic", + freq_mask=True, + freq_mask_width=(0, 20), + n_freq_mask=2, + time_mask=True, + time_mask_width=(0, 100), + n_time_mask=2, + replace_with_zero=True, ): + super().__init__() + assert ( + time_warp or freq_mask or time_mask + ), "at least one of time_warp, time_mask, or freq_mask should be applied" + + self.apply_time_warp = time_warp + self.time_warp_window = time_warp_window + self.time_warp_mode = time_warp_mode + + self.freq_mask = freq_mask + if isinstance(freq_mask_width, int): + freq_mask_width = (0, freq_mask_width) + self.freq_mask_width = freq_mask_width + self.n_freq_mask = n_freq_mask + + self.time_mask = time_mask + if isinstance(time_mask_width, int): + time_mask_width = (0, time_mask_width) + self.time_mask_width = time_mask_width + self.n_time_mask = n_time_mask + + self.replace_with_zero = replace_with_zero + + def forward(self, x): + """Takes an input tensor and returns an augmented one.""" + if self.apply_time_warp: + x = self.time_warp(x) + if self.freq_mask: + x = self.mask_along_axis(x, dim=2) + if self.time_mask: + x = self.mask_along_axis(x, dim=1) + return x + + def time_warp(self, x): + """Time warping with paddle.nn.functional.interpolate""" + original_size = x.shape + window = self.time_warp_window + + # 2d interpolation requires 4D or higher dimension tensors + # x: (Batch, Time, Freq) -> (Batch, 1, Time, Freq) + if x.dim() == 3: + x = x.unsqueeze(1) + + time = x.shape[2] + if time - window <= window: + return x.view(*original_size) + + # compute center and corresponding window + c = paddle.randint(window, time - window, (1, ))[0] + w = paddle.randint(c - window, c + window, (1, ))[0] + 1 + left = paddle.nn.functional.interpolate( + x[:, :, :c], + (w, x.shape[3]), + mode=self.time_warp_mode, + align_corners=True, ) + right = paddle.nn.functional.interpolate( + x[:, :, c:], + (time - w, x.shape[3]), + mode=self.time_warp_mode, + align_corners=True, ) + + x[:, :, :w] = left + x[:, :, w:] = right + return x.view(*original_size) + + def mask_along_axis(self, x, dim): + """Mask along time or frequency axis. + Arguments + --------- + x : tensor + Input tensor. + dim : int + Corresponding dimension to mask.
+ """ + original_size = x.shape + if x.dim() == 4: + x = x.view(-1, x.shape[2], x.shape[3]) + + batch, time, fea = x.shape + + if dim == 1: + D = time + n_mask = self.n_time_mask + width_range = self.time_mask_width + else: + D = fea + n_mask = self.n_freq_mask + width_range = self.freq_mask_width + + mask_len = paddle.randint(width_range[0], width_range[1], + (batch, n_mask)).unsqueeze(2) + + mask_pos = paddle.randint(0, max(1, D - mask_len.max()), + (batch, n_mask)).unsqueeze(2) + + # compute masks + arange = paddle.arange(end=D).view(1, 1, -1) + mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len)) + mask = mask.any(axis=1) + + if dim == 1: + mask = mask.unsqueeze(2) + else: + mask = mask.unsqueeze(1) + + if self.replace_with_zero: + val = 0.0 + else: + val = x.mean() + # same to x.masked_fill_(mask, val) + y = paddle.full(x.shape, val, x.dtype) + x = paddle.where(mask, y, x) + return x.view(*original_size) + + class TimeDomainSpecAugment(nn.Layer): """A time-domain approximation of the SpecAugment algorithm. - This augmentation module implements three augmentations in the time-domain. - 1. Drop chunks of the audio (zero amplitude or white noise) 2. Drop frequency bands (with band-drop filters) 3. Speed peturbation (via resampling to slightly different rate) - Arguments --------- perturb_prob : float from 0 to 1 @@ -661,7 +838,6 @@ class TimeDomainSpecAugment(nn.Layer): drop_chunk_noise_factor : float The noise factor used to scale the white noise inserted, relative to the average amplitude of the utterance. Default 0 (no noise inserted). - Example ------- >>> inputs = paddle.randn([10, 16000]) @@ -702,7 +878,6 @@ class TimeDomainSpecAugment(nn.Layer): def forward(self, waveforms, lengths): """Returns the distorted waveforms. - Arguments --------- waveforms : tensor diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py old mode 100644 new mode 100755 index 0d99e8708..eda188da5 --- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -1,3 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
old mode 100644
new mode 100755
index 0d99e8708..eda188da5
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -1,3 +1,16 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from collections import defaultdict
 from typing import Dict
 from typing import List
@@ -10,7 +23,9 @@ import paddle.nn.functional as F
 from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2ConfigPure
 from paddlespeech.s2t.models.wav2vec2.modules.modeling_wav2vec2 import Wav2Vec2Model
 from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN
+from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment
 from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
+from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
 from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
 from paddlespeech.s2t.utils.utility import log_add
 
@@ -18,44 +33,41 @@ from paddlespeech.s2t.utils.utility import log_add
 class Wav2vec2ASR(nn.Layer):
     def __init__(self, config: dict):
         super().__init__()
+        init_type = config.get("init_type", None)
+        with DefaultInitializerContext(init_type):
+            self.config = config
+            wav2vec2_config = Wav2Vec2ConfigPure(config)
+            wav2vec2 = Wav2Vec2Model(wav2vec2_config)
+            self.normalize_wav = config.normalize_wav
+            self.output_norm = config.output_norm
+            if hasattr(config, 'spec_augment'):
+                self.spec_augment = SpecAugment(**config.spec_augment)
 
-        wav2vec2_config = Wav2Vec2ConfigPure(config)
-        wav2vec2 = Wav2Vec2Model(wav2vec2_config)
-        model_dict = paddle.load(config.wav2vec2_params_path)
-        wav2vec2.set_state_dict(model_dict)
-        self.normalize_wav = config.normalize_wav
-        self.output_norm = config.output_norm
-        if config.freeze_wav2vec2:
-            wav2vec2.eval()
-            for parm in wav2vec2.parameters():
-                parm.trainable = False
-        self.wav2vec2 = wav2vec2
-        self.enc = VanillaNN(
-            input_shape=[None, None, wav2vec2_config.hidden_size],
-            activation=nn.LeakyReLU,
-            dnn_blocks=config.dnn_blocks,
-            dnn_neurons=config.dnn_neurons)
-        self.ctc = CTC(odim=config.output_dim,
-                       enc_n_units=config.dnn_neurons,
-                       blank_id=config.blank_id,
-                       dropout_rate=config.ctc_dropout_rate,
-                       reduction='mean')
-
-    def forward(self, wav, wavs_lens_rate, target, target_lens_rate):
+            if config.freeze_wav2vec2:
+                wav2vec2.eval()
+                for parm in wav2vec2.parameters():
+                    parm.trainable = False
+            self.wav2vec2 = wav2vec2
+            self.enc = VanillaNN(**config.enc)
+            self.ctc = CTC(**config.ctc,
+                           odim=config.output_dim,
+                           batch_average=False,
+                           reduction='mean')
+
+    def forward(self, wav, wavs_lens_rate, target, target_lens):
         if self.normalize_wav:
-            wav = F.layer_norm(wav, wav.shape[1:])
+            wav = F.layer_norm(wav, wav.shape)
 
         # Extract wav2vec output
         out = self.wav2vec2(wav)[0]
         # We normalize the output if required
         if self.output_norm:
-            out = F.layer_norm(out, out.shape[1:])
-        feats = out
-
+            out = F.layer_norm(out, out.shape)
+        if self.training and hasattr(self.config, 'spec_augment'):
+            feats = self.spec_augment(out)
+        else:
+            feats = out
         x = self.enc(feats)
         x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
-        target_lens = (target_lens_rate *
-                       target.shape[1]).round().astype(paddle.int64)
-
         ctc_loss = self.ctc(x, x_lens, target, target_lens)
         return ctc_loss
@@ -226,3 +238,33 @@ class Wav2vec2ASR(nn.Layer):
         """
         hyps = self._ctc_prefix_beam_search(wav, beam_size)
         return hyps[0][0]
+
+
+class Wav2vec2Base(nn.Layer):
+    """Wav2vec2 model"""
+
+    def __init__(self, config: dict):
+        super().__init__()
+        wav2vec2_config = Wav2Vec2ConfigPure(config)
+        wav2vec2 = Wav2Vec2Model(wav2vec2_config)
+        self.wav2vec2 = wav2vec2
+
+    @classmethod
+    def from_config(cls, configs: dict):
+        """init model.
+
+        Args:
+            configs (dict): config dict.
+ + Raises: + ValueError: raise when using not support encoder type. + + Returns: + nn.Layer: Wav2Vec2Base + """ + model = cls(configs) + return model + + def forward(self, wav): + out = self.wav2vec2(wav) + return out diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py new file mode 100755 index 000000000..98ab23610 --- /dev/null +++ b/paddlespeech/s2t/models/whisper/__init__.py @@ -0,0 +1,12 @@ +# MIT License, Copyright (c) 2022 OpenAI. +# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# +# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py) +from paddlespeech.s2t.models.whisper.whipser import decode +from paddlespeech.s2t.models.whisper.whipser import DecodingOptions +from paddlespeech.s2t.models.whisper.whipser import DecodingResult +from paddlespeech.s2t.models.whisper.whipser import detect_language +from paddlespeech.s2t.models.whisper.whipser import log_mel_spectrogram +from paddlespeech.s2t.models.whisper.whipser import ModelDimensions +from paddlespeech.s2t.models.whisper.whipser import transcribe +from paddlespeech.s2t.models.whisper.whipser import Whisper diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py new file mode 100755 index 000000000..8bd85c914 --- /dev/null +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -0,0 +1,362 @@ +# MIT License, Copyright (c) 2022 OpenAI. +# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# +# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py) +import os +from dataclasses import dataclass +from functools import lru_cache +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union + +import numpy as np +import paddle +from paddlenlp.transformers import GPTTokenizer + +LANGUAGES = { + "en": "english", + "zh": "chinese", + "de": "german", + "es": "spanish", + "ru": "russian", + "ko": "korean", + "fr": "french", + "ja": "japanese", + "pt": "portuguese", + "tr": "turkish", + "pl": "polish", + "ca": "catalan", + "nl": "dutch", + "ar": "arabic", + "sv": "swedish", + "it": "italian", + "id": "indonesian", + "hi": "hindi", + "fi": "finnish", + "vi": "vietnamese", + "iw": "hebrew", + "uk": "ukrainian", + "el": "greek", + "ms": "malay", + "cs": "czech", + "ro": "romanian", + "da": "danish", + "hu": "hungarian", + "ta": "tamil", + "no": "norwegian", + "th": "thai", + "ur": "urdu", + "hr": "croatian", + "bg": "bulgarian", + "lt": "lithuanian", + "la": "latin", + "mi": "maori", + "ml": "malayalam", + "cy": "welsh", + "sk": "slovak", + "te": "telugu", + "fa": "persian", + "lv": "latvian", + "bn": "bengali", + "sr": "serbian", + "az": "azerbaijani", + "sl": "slovenian", + "kn": "kannada", + "et": "estonian", + "mk": "macedonian", + "br": "breton", + "eu": "basque", + "is": "icelandic", + "hy": "armenian", + "ne": "nepali", + "mn": "mongolian", + "bs": "bosnian", + "kk": "kazakh", + "sq": "albanian", + "sw": "swahili", + "gl": "galician", + "mr": "marathi", + "pa": "punjabi", + "si": "sinhala", + "km": "khmer", + "sn": "shona", + "yo": "yoruba", + "so": "somali", + "af": "afrikaans", + "oc": "occitan", + "ka": "georgian", + "be": "belarusian", + "tg": "tajik", + "sd": "sindhi", + "gu": "gujarati", + "am": "amharic", + "yi": "yiddish", + "lo": "lao", + "uz": "uzbek", + "fo": "faroese", + "ht": "haitian creole", + "ps": "pashto", + "tk": "turkmen", + "nn": "nynorsk", + "mt": "maltese", + "sa": 
"sanskrit", + "lb": "luxembourgish", + "my": "myanmar", + "bo": "tibetan", + "tl": "tagalog", + "mg": "malagasy", + "as": "assamese", + "tt": "tatar", + "haw": "hawaiian", + "ln": "lingala", + "ha": "hausa", + "ba": "bashkir", + "jw": "javanese", + "su": "sundanese", +} + +# language code lookup by name, with a few language aliases +TO_LANGUAGE_CODE = { + **{language: code for code, language in LANGUAGES.items()}, + "burmese": "my", + "valencian": "ca", + "flemish": "nl", + "haitian": "ht", + "letzeburgesch": "lb", + "pushto": "ps", + "panjabi": "pa", + "moldavian": "ro", + "moldovan": "ro", + "sinhalese": "si", + "castilian": "es", +} + + +@dataclass(frozen=True) +class Tokenizer: + """A thin wrapper around `GPTTokenizer` providing quick access to special tokens""" + + tokenizer: "GPTTokenizer" + language: Optional[str] + sot_sequence: Tuple[int] + + def encode(self, text, **kwargs): + return self.tokenizer.encode(text, **kwargs) + + def decode(self, + token_ids: Union[int, List[int], np.ndarray, paddle.Tensor], + **kwargs): + if len(token_ids) > 1: + ids_list = [] + for ids in token_ids: + if paddle.is_tensor(ids): + ids = ids.item() + if ids < len(self.tokenizer): + ids_list.append(ids) + token_ids = ids_list + + return self.tokenizer.decode(token_ids, **kwargs) + + def decode_with_timestamps(self, tokens) -> str: + """ + Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. + This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>". + """ + outputs = [[]] + for token in tokens: + if token >= self.timestamp_begin: + timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>" + outputs.append(timestamp) + outputs.append([]) + else: + outputs[-1].append(token) + outputs = [ + s if isinstance(s, str) else self.tokenizer.decode(s) + for s in outputs + ] + return "".join(outputs) + + @property + @lru_cache() + def eot(self) -> int: + return self.tokenizer.eos_token_id + + @property + @lru_cache() + def sot(self) -> int: + return self._get_single_token_id("<|startoftranscript|>") + + @property + @lru_cache() + def sot_lm(self) -> int: + return self._get_single_token_id("<|startoflm|>") + + @property + @lru_cache() + def sot_prev(self) -> int: + return self._get_single_token_id("<|startofprev|>") + + @property + @lru_cache() + def no_speech(self) -> int: + return self._get_single_token_id("<|nospeech|>") + + @property + @lru_cache() + def no_timestamps(self) -> int: + return self._get_single_token_id("<|notimestamps|>") + + @property + @lru_cache() + def timestamp_begin(self) -> int: + return self.tokenizer.all_special_ids[-1] + 1 + + @property + @lru_cache() + def language_token(self) -> int: + """Returns the token id corresponding to the value of the `language` field""" + if self.language is None: + raise ValueError( + "This tokenizer does not have language token configured") + + additional_tokens = dict( + zip( + self.tokenizer.additional_special_tokens, + self.tokenizer.additional_special_tokens_ids, )) + candidate = f"<|{self.language}|>" + if candidate in additional_tokens: + return additional_tokens[candidate] + + raise KeyError(f"Language {self.language} not found in tokenizer.") + + @property + @lru_cache() + def all_language_tokens(self) -> Tuple[int]: + result = [] + for token, token_id in zip( + self.tokenizer.additional_special_tokens, + self.tokenizer.additional_special_tokens_ids, ): + if token.strip("<|>") in LANGUAGES: + result.append(token_id) + return tuple(result) + + @property + @lru_cache() + def 
all_language_codes(self) -> Tuple[str]: + return tuple( + self.decode([l]).strip("<|>") for l in self.all_language_tokens) + + @property + @lru_cache() + def sot_sequence_including_notimestamps(self) -> Tuple[int]: + return tuple(list(self.sot_sequence) + [self.no_timestamps]) + + @property + @lru_cache() + def non_speech_tokens(self) -> Tuple[int]: + """ + Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech + annotations, to prevent sampling texts that are not actually spoken in the audio, e.g. + + - ♪♪♪ + - ( SPEAKING FOREIGN LANGUAGE ) + - [DAVID] Hey there, + + keeping basic punctuations like commas, periods, question marks, exclamation points, etc. + """ + symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』") + symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split( + ) + + # symbols that may be a single token or multiple tokens depending on the tokenizer. + # In case they're multiple tokens, suppress the first token, which is safe because: + # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress + # in generations, and in the 3-byte UTF-8 representation they share the first two bytes. + miscellaneous = set("♩♪♫♬♭♮♯") + assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous) + + # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word + result = { + self.tokenizer.encode(" -").input_ids[0], + self.tokenizer.encode(" '").input_ids[0] + } + for symbol in symbols + list(miscellaneous): + for tokens in [ + self.tokenizer.encode(symbol).input_ids, + self.tokenizer.encode(" " + symbol).input_ids + ]: + if len(tokens) == 1 or symbol in miscellaneous: + result.add(tokens[0]) + + return tuple(sorted(result)) + + def _get_single_token_id(self, text) -> int: + tokens = self.tokenizer.encode(text).input_ids + assert len(tokens) == 1, f"{text} is not encoded as a single token" + return tokens[0] + + +@lru_cache(maxsize=None) +def build_tokenizer(resource_path: str, name: str="gpt2"): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + path = os.path.join(resource_path, "assets", name) + tokenizer = GPTTokenizer.from_pretrained(path) + + specials = [ + "<|startoftranscript|>", + * [f"<|{lang}|>" for lang in LANGUAGES.keys()], + "<|translate|>", + "<|transcribe|>", + "<|startoflm|>", + "<|startofprev|>", + "<|nospeech|>", + "<|notimestamps|>", + ] + + tokenizer.add_special_tokens(dict(additional_special_tokens=specials)) + return tokenizer + + +@lru_cache(maxsize=None) +def get_tokenizer( + multilingual: bool, + resource_path: str, + *, + task: Optional[str]=None, # Literal["transcribe", "translate", None] + language: Optional[str]=None, ) -> Tokenizer: + if language is not None: + language = language.lower() + if language not in LANGUAGES: + if language in TO_LANGUAGE_CODE: + language = TO_LANGUAGE_CODE[language] + else: + raise ValueError(f"Unsupported language: {language}") + + if multilingual: + tokenizer_name = "multilingual" + task = task or "transcribe" + language = language or "en" + else: + tokenizer_name = "gpt2" + task = None + language = None + + tokenizer = build_tokenizer( + resource_path=resource_path, name=tokenizer_name) + all_special_ids: List[int] = tokenizer.all_special_ids + sot: int = all_special_ids[1] + translate: int = all_special_ids[-6] + transcribe: int = all_special_ids[-5] + + langs = tuple(LANGUAGES.keys()) + sot_sequence = [sot] + if language is not None: + sot_sequence.append(sot + 1 + langs.index(language)) + if task is not None: + 
sot_sequence.append(transcribe if task == "transcribe" else translate) + + return Tokenizer( + tokenizer=tokenizer, + language=language, + sot_sequence=tuple(sot_sequence)) diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py new file mode 100755 index 000000000..d067af7d2 --- /dev/null +++ b/paddlespeech/s2t/models/whisper/utils.py @@ -0,0 +1,92 @@ +# MIT License, Copyright (c) 2022 OpenAI. +# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. +# +# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py) +import zlib +from typing import Iterator +from typing import TextIO + + +def exact_div(x, y): + assert x % y == 0 + return x // y + + +def str2bool(string): + str2val = {"True": True, "False": False} + if string in str2val: + return str2val[string] + else: + raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") + + +def optional_int(string): + return None if string == "None" else int(string) + + +def optional_float(string): + return None if string == "None" else float(string) + + +def compression_ratio(text) -> float: + return len(text) / len(zlib.compress(text.encode("utf-8"))) + + +def format_timestamp(seconds: float, + always_include_hours: bool=False, + decimal_marker: str='.'): + assert seconds >= 0, "non-negative timestamp expected" + milliseconds = round(seconds * 1000.0) + + hours = milliseconds // 3_600_000 + milliseconds -= hours * 3_600_000 + + minutes = milliseconds // 60_000 + milliseconds -= minutes * 60_000 + + seconds = milliseconds // 1_000 + milliseconds -= seconds * 1_000 + + hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else "" + return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}" + + +def write_txt(transcript: Iterator[dict], file: TextIO): + for segment in transcript: + print(segment['text'].strip(), file=file, flush=True) + + +def write_vtt(transcript: Iterator[dict], file: TextIO): + print("WEBVTT\n", file=file) + for segment in transcript: + print( + f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n" + f"{segment['text'].strip().replace('-->', '->')}\n", + file=file, + flush=True, ) + + +def write_srt(transcript: Iterator[dict], file: TextIO): + """ + Write a transcript to a file in SRT format. + + Example usage: + from pathlib import Path + from whisper.utils import write_srt + + result = transcribe(model, audio_path, temperature=temperature, **args) + + # save SRT + audio_basename = Path(audio_path).stem + with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt: + write_srt(result["segments"], file=srt) + """ + for i, segment in enumerate(transcript, start=1): + # write srt lines + print( + f"{i}\n" + f"{format_timestamp(segment['start'], always_include_hours=True, decimal_marker=',')} --> " + f"{format_timestamp(segment['end'], always_include_hours=True, decimal_marker=',')}\n" + f"{segment['text'].strip().replace('-->', '->')}\n", + file=file, + flush=True, ) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py new file mode 100755 index 000000000..ba9983338 --- /dev/null +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -0,0 +1,1478 @@ +# MIT License, Copyright (c) 2022 OpenAI. +# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved. 
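Before moving into whipser.py, a quick worked example for `format_timestamp` from utils.py above may help; the cascade of integer divisions peels hours, minutes, seconds, and milliseconds off a millisecond total in turn. This is a standalone sketch for verification only, not part of the patch:

# 3661.5 seconds should render as "01:01:01.500"
milliseconds = round(3661.5 * 1000.0)  # 3661500

hours = milliseconds // 3_600_000      # 1
milliseconds -= hours * 3_600_000      # 61500 remain

minutes = milliseconds // 60_000       # 1
milliseconds -= minutes * 60_000       # 1500 remain

seconds = milliseconds // 1_000        # 1
milliseconds -= seconds * 1_000        # 500 remain

print(f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}")
# prints 01:01:01.500, matching format_timestamp(3661.5)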
+# +# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper) +import os +from dataclasses import dataclass +from dataclasses import field +from functools import lru_cache +from typing import Dict +from typing import Iterable +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.nn.functional as F +import soundfile +import tqdm +from paddle import nn +from paddle.distribution import Categorical + +import paddlespeech.s2t.modules.align as paddlespeech_nn +from paddlespeech.s2t.models.whisper import utils +from paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer +from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES +from paddlespeech.s2t.models.whisper.tokenizer import Tokenizer +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + +_MODELS = ["large"] +SAMPLE_RATE = 16000 +N_FFT = 400 +N_MELS = 80 +HOP_LENGTH = 160 +CHUNK_LENGTH = 30 +N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000: number of samples in a chunk +N_FRAMES = utils.exact_div( + N_SAMPLES, HOP_LENGTH) # 3000: number of frames in a mel spectrogram input + + +@dataclass +class ModelDimensions: + n_mels: int + n_audio_ctx: int + n_audio_state: int + n_audio_head: int + n_audio_layer: int + n_vocab: int + n_text_ctx: int + n_text_state: int + n_text_head: int + n_text_layer: int + + +class LayerNorm(paddlespeech_nn.LayerNorm): + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + return super().forward(x) + + +class Linear(paddlespeech_nn.Linear): + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + return F.linear(x, self.weight, None + if self.bias is None else self.bias) + + +class Conv1d(paddlespeech_nn.Conv1D): + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + return super().forward(x) + + +class MultiHeadAttention(nn.Layer): + def __init__(self, n_state: int, n_head: int): + super().__init__() + self.n_head = n_head + self.query = Linear(n_state, n_state, bias_attr=True) + self.key = Linear(n_state, n_state, bias_attr=False) + self.value = Linear(n_state, n_state, bias_attr=True) + self.out = Linear(n_state, n_state, bias_attr=True) + + def forward( + self, + x: paddle.Tensor, + xa: Optional[paddle.Tensor]=None, + mask: Optional[paddle.Tensor]=None, + kv_cache: Optional[dict]=None, ): + q = self.query(x) + + if kv_cache is None or xa is None or self.key not in kv_cache: + # hooks, if installed (i.e. kv_cache is not None), will prepend the cached kv tensors; + # otherwise, perform key/value projections for self- or cross-attention as usual. + k = self.key(x if xa is None else xa) + v = self.value(x if xa is None else xa) + else: + # for cross-attention, calculate keys and values once and reuse in subsequent calls. 
+ k = kv_cache[self.key] + v = kv_cache[self.value] + + wv = self.qkv_attention(q, k, v, mask) + return self.out(wv) + + def qkv_attention(self, + q: paddle.Tensor, + k: paddle.Tensor, + v: paddle.Tensor, + mask: Optional[paddle.Tensor]=None): + n_batch, n_ctx, n_state = q.shape + scale = (n_state // self.n_head)**-0.25 + q = paddle.transpose( + q.view(*q.shape[:2], self.n_head, -1), (0, 2, 1, 3)) * scale + k = paddle.transpose( + k.view(*k.shape[:2], self.n_head, -1), (0, 2, 3, 1)) * scale + v = paddle.transpose( + v.view(*v.shape[:2], self.n_head, -1), (0, 2, 1, 3)) + + qk = q @ k + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + + w = F.softmax(qk.float(), axis=-1).to(q.dtype) + return paddle.transpose((w @ v), (0, 2, 1, 3)).flatten(start_axis=2) + + +class ResidualAttentionBlock(nn.Layer): + def __init__(self, n_state: int, n_head: int, cross_attention: bool=False): + super().__init__() + + self.attn = MultiHeadAttention(n_state, n_head) + self.attn_ln = LayerNorm(n_state) + + self.cross_attn = MultiHeadAttention( + n_state, n_head) if cross_attention else None + self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp, bias_attr=True), + nn.GELU(), Linear(n_mlp, n_state, bias_attr=True)) + self.mlp_ln = LayerNorm(n_state) + + def forward( + self, + x: paddle.Tensor, + xa: Optional[paddle.Tensor]=None, + mask: Optional[paddle.Tensor]=None, + kv_cache: Optional[dict]=None, ): + x = x + self.attn(self.attn_ln(x), mask=mask, kv_cache=kv_cache) + if self.cross_attn: + x = x + self.cross_attn( + self.cross_attn_ln(x), xa, kv_cache=kv_cache) + x = x + self.mlp(self.mlp_ln(x)) + return x + + +def sinusoids(length, channels, max_timescale=10000): + """Returns sinusoids for positional embedding""" + assert channels % 2 == 0 + log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1) + inv_timescales = paddle.exp(-log_timescale_increment * paddle.arange( + channels // 2, dtype=paddle.float32)) + scaled_time = paddle.arange( + length, + dtype=paddle.float32)[:, np.newaxis] * inv_timescales[np.newaxis, :] + return paddle.to_tensor( + paddle.concat( + [paddle.sin(scaled_time), paddle.cos(scaled_time)], axis=1)) + + +class AudioEncoder(nn.Layer): + def __init__(self, + n_mels: int, + n_ctx: int, + n_state: int, + n_head: int, + n_layer: int): + super().__init__() + self.conv1 = Conv1d( + n_mels, n_state, kernel_size=3, stride=1, padding=1, bias_attr=True) + self.conv2 = Conv1d( + n_state, + n_state, + kernel_size=3, + stride=2, + padding=1, + bias_attr=True) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + + self.blocks: Iterable[ResidualAttentionBlock] = nn.LayerList( + [ResidualAttentionBlock(n_state, n_head) for _ in range(n_layer)]) + self.ln_post = LayerNorm(n_state) + + def forward(self, x: paddle.Tensor): + """ + x : paddle.Tensor, shape = (batch_size, n_mels, n_ctx) + the mel spectrogram of the audio + """ + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = paddle.transpose(x, (0, 2, 1)) + + assert x.shape[ + 1:] == self.positional_embedding.shape, "incorrect audio shape" + x = (x + self.positional_embedding) + + for block in self.blocks: + x = block(x) + + x = self.ln_post(x) + return x + + +class TextDecoder(nn.Layer): + def __init__(self, + n_vocab: int, + n_ctx: int, + n_state: int, + n_head: int, + n_layer: int): + super().__init__() + + self.token_embedding = nn.Embedding(n_vocab, n_state) + self.positional_embedding = paddle.create_parameter( 
+ shape=[n_ctx, n_state], dtype='float32') + + self.blocks: Iterable[ResidualAttentionBlock] = nn.LayerList([ + ResidualAttentionBlock(n_state, n_head, cross_attention=True) + for _ in range(n_layer) + ]) + self.ln = LayerNorm(n_state) + + mask = fluid.layers.fill_constant( + shape=[n_ctx, n_state], value=-np.inf, dtype='float32') + mask = paddle.triu(mask, diagonal=1) + self.register_buffer("mask", mask, persistable=False) + + def forward(self, + x: paddle.Tensor, + xa: paddle.Tensor, + kv_cache: Optional[dict]=None): + """ + x : paddle.LongTensor, shape = (batch_size, <= n_ctx) + the text tokens + xa : paddle.Tensor, shape = (batch_size, n_mels, n_audio_ctx) + the encoded audio features to be attended on + """ + offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 + x = self.token_embedding(x) + self.positional_embedding[offset:offset + + x.shape[-1]] + x = x.to(xa.dtype) + + for block in self.blocks: + x = block(x, xa, mask=self.mask, kv_cache=kv_cache) + + x = self.ln(x) + logits = (x @ paddle.transpose(self.token_embedding.weight, (1, 0))) + + return logits + + +@dataclass(frozen=True) +class DecodingOptions: + task: str = "transcribe" # whether to perform X->X "transcribe" or X->English "translate" + language: Optional[ + str] = None # language that the audio is in; uses detected language if None + # sampling-related options + temperature: float = 0.0 + sample_len: Optional[int] = None # maximum number of tokens to sample + best_of: Optional[ + int] = None # number of independent samples to collect, when t > 0 + beam_size: Optional[ + int] = None # number of beams in beam search, when t == 0 + patience: Optional[ + float] = None # patience in beam search (https://arxiv.org/abs/2204.05424) + + # options for ranking generations (either beams or best-of-N samples) + length_penalty: Optional[ + float] = None # "alpha" in Google NMT, None defaults to length norm + + # prompt, prefix, and token suppression + prompt: Optional[Union[str, List[ + int]]] = None # text or tokens for the previous context + prefix: Optional[Union[str, List[ + int]]] = None # text or tokens to prefix the current context + suppress_blank: bool = True # this will suppress blank outputs + + # list of tokens ids (or comma-separated token ids) to suppress + # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()` + suppress_tokens: Optional[Union[str, Iterable[int]]] = "-1" + + # timestamp sampling options + without_timestamps: bool = False # use <|notimestamps|> to sample text tokens only + max_initial_timestamp: Optional[ + float] = 1.0 # the initial timestamp cannot be later than this + + # implementation details + fp16: bool = False # use fp16 for most of the calculation + + +@dataclass(frozen=True) +class DecodingResult: + audio_features: paddle.Tensor + language: str + language_probs: Optional[Dict[str, float]] = None + tokens: List[int] = field(default_factory=list) + text: str = "" + avg_logprob: float = np.nan + no_speech_prob: float = np.nan + temperature: float = np.nan + compression_ratio: float = np.nan + + +class Inference: + def logits(self, tokens: paddle.Tensor, + audio_features: paddle.Tensor) -> paddle.Tensor: + """Perform a forward pass on the decoder and return per-token logits""" + raise NotImplementedError + + def rearrange_kv_cache(self, source_indices) -> None: + """Update the key-value cache according to the updated beams""" + raise NotImplementedError + + def cleanup_caching(self) -> None: + """Clean up any resources or hooks after decoding is 
finished""" + pass + + +class WhisperInference(Inference): + def __init__(self, model: "Whisper", initial_token_length: int): + self.model: "Whisper" = model + self.initial_token_length = initial_token_length + self.kv_cache = {} + self.hooks = [] + + def logits(self, tokens: paddle.Tensor, + audio_features: paddle.Tensor) -> paddle.Tensor: + if not self.kv_cache: + self.kv_cache, self.hooks = self.model.install_kv_cache_hooks() + + if tokens.shape[-1] > self.initial_token_length: + # only need to use the last token except in the first forward pass + tokens = tokens[:, -1:] + + return self.model.decoder( + tokens, audio_features, kv_cache=self.kv_cache) + + def cleanup_caching(self): + for hook in self.hooks: + hook.remove() + + self.kv_cache = {} + self.hooks = [] + + def rearrange_kv_cache(self, source_indices): + for module, tensor in self.kv_cache.items(): + # update the key/value cache to contain the selected sequences + self.kv_cache[module] = tensor[source_indices].detach() + + +@paddle.no_grad() +def detect_language( + model: "Whisper", + mel: paddle.Tensor, + resource_path: str, + tokenizer: Tokenizer=None) -> Tuple[paddle.Tensor, List[dict]]: + """ + Detect the spoken language in the audio, and return them as list of strings, along with the ids + of the most probable language tokens and the probability distribution over all language tokens. + This is performed outside the main decode loop in order to not interfere with kv-caching. + + Returns + ------- + language_tokens : Tensor, shape = (batch_size,) + ids of the most probable language tokens, which appears after the startoftranscript token. + language_probs : List[Dict[str, float]], length = batch_size + list of dictionaries containing the probability distribution over all languages. + """ + if tokenizer is None: + tokenizer = get_tokenizer( + model.is_multilingual, resource_path=resource_path) + if tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence: + raise ValueError( + "This model doesn't have language tokens so it can't perform lang id" + ) + + single = mel.ndim == 2 + if single: + mel = mel.unsqueeze(0) + + # skip encoder forward pass if already-encoded audio features were given + if mel.shape[-2:] != (model.dims.n_audio_ctx, model.dims.n_audio_state): + mel = model.encoder(mel) + + # forward pass using a single token, startoftranscript + batch_size = mel.shape[0] + x = paddle.to_tensor([[tokenizer.sot]] * batch_size) # [batch_size, 1] + logits = model.logits(x, mel)[:, 0] + + # collect detected languages; suppress all non-language tokens + mask = paddle.ones(paddle.to_tensor(logits.shape[-1]), dtype=bool) + mask[list(tokenizer.all_language_tokens)] = False + logits[:, mask] = -np.inf + language_tokens = paddle.argmax(logits, axis=-1) + language_token_probs = F.softmax(logits, axis=-1) + language_probs = [{ + c: language_token_probs[i, j].tolist() + for j, c in zip(tokenizer.all_language_tokens, + tokenizer.all_language_codes) + } for i in range(batch_size)] + + if single: + language_tokens = language_tokens[0] + language_probs = language_probs[0] + + return language_tokens, language_probs + + +def transcribe( + model: "Whisper", + mel: paddle.Tensor, + resource_path: str, + *, + verbose: Optional[bool]=None, + temperature: Union[float, Tuple[float, ...]]=(0.0, 0.2, 0.4, 0.6, 0.8, + 1.0), + compression_ratio_threshold: Optional[float]=2.4, + logprob_threshold: Optional[float]=-1.0, + no_speech_threshold: Optional[float]=0.6, + condition_on_previous_text: bool=True, + **decode_options, ): + 
""" + Transcribe an audio file using Whisper + + Parameters + ---------- + model: Whisper + The Whisper model instance + + mel: paddle.Tensor + The audio feature + + verbose: bool + Whether to display the text being decoded to the console. If True, displays all the details, + If False, displays minimal details. If None, does not display anything + + temperature: Union[float, Tuple[float, ...]] + Temperature for sampling. It can be a tuple of temperatures, which will be successfully used + upon failures according to either `compression_ratio_threshold` or `logprob_threshold`. + + compression_ratio_threshold: float + If the gzip compression ratio is above this value, treat as failed + + logprob_threshold: float + If the average log probability over sampled tokens is below this value, treat as failed + + no_speech_threshold: float + If the no_speech probability is higher than this value AND the average log probability + over sampled tokens is below `logprob_threshold`, consider the segment as silent + + condition_on_previous_text: bool + if True, the previous output of the model is provided as a prompt for the next window; + disabling may make the text inconsistent across windows, but the model becomes less prone to + getting stuck in a failure loop, such as repetition looping or timestamps going out of sync. + + decode_options: dict + Keyword arguments to construct `DecodingOptions` instances + + Returns + ------- + A dictionary containing the resulting text ("text") and segment-level details ("segments"), and + the spoken language ("language"), which is detected when `decode_options["language"]` is None. + """ + dtype = np.float32 #paddle only support float32 + + if dtype == np.float32: + decode_options["fp16"] = False + + if decode_options.get( + "language", 'None') or decode_options.get("language", None) is None: + if not model.is_multilingual: + decode_options["language"] = "en" + else: + if verbose: + print( + "Detecting language using up to the first 30 seconds. 
Use `--language` to specify the language" + ) + segment = pad_or_trim(mel, N_FRAMES) + _, probs = model.detect_language(segment, resource_path) + decode_options["language"] = max(probs, key=probs.get) + if verbose is not None: + print( + f"Detected language: {LANGUAGES[decode_options['language']].title()}" + ) + + language = decode_options["language"] + task = decode_options.get("task", "transcribe") + tokenizer = get_tokenizer( + model.is_multilingual, + resource_path=resource_path, + language=language, + task=task) + + def decode_with_fallback(segment: paddle.Tensor) -> DecodingResult: + temperatures = [temperature] if isinstance(temperature, ( + int, float)) else temperature + decode_result = None + + for t in temperatures: + kwargs = {**decode_options} + if t > 0: + # disable beam_size and patience when t > 0 + kwargs.pop("beam_size", None) + kwargs.pop("patience", None) + else: + # disable best_of when t == 0 + kwargs.pop("best_of", None) + + options = DecodingOptions(**kwargs, temperature=t) + decode_result = model.decode(segment, options, resource_path) + + needs_fallback = False + if compression_ratio_threshold is not None and decode_result.compression_ratio > compression_ratio_threshold: + needs_fallback = True # too repetitive + if logprob_threshold is not None and decode_result.avg_logprob < logprob_threshold: + needs_fallback = True # average log probability is too low + + if not needs_fallback: + break + + return decode_result + + seek = 0 + input_stride = utils.exact_div( + N_FRAMES, model.dims.n_audio_ctx) # mel frames per output token: 2 + time_precision = (input_stride * HOP_LENGTH / + SAMPLE_RATE) # time per output token: 0.02 (seconds) + all_tokens = [] + all_segments = [] + prompt_reset_since = 0 + + initial_prompt = decode_options.pop("initial_prompt", None) or [] + if initial_prompt: + initial_prompt = tokenizer.encode(" " + + initial_prompt.strip()).input_ids + all_tokens.extend(initial_prompt) + + def add_segment(*, + start: float, + end: float, + text_tokens: paddle.Tensor, + result: DecodingResult): + text = tokenizer.decode( + [token for token in text_tokens if token < tokenizer.eot]) + if len(text.strip()) == 0: # skip empty text output + return + + all_segments.append({ + "id": len(all_segments), + "seek": seek, + "start": start, + "end": end, + "text": text, + "tokens": result.tokens, + "temperature": result.temperature, + "avg_logprob": result.avg_logprob, + "compression_ratio": result.compression_ratio, + "no_speech_prob": result.no_speech_prob, + }) + if verbose: + print( + f"[{utils.format_timestamp(start)} --> {utils.format_timestamp(end)}] {text}" + ) + + # show the progress bar when verbose is False (otherwise the transcribed text will be printed) + num_frames = mel.shape[-1] + previous_seek_value = seek + + with tqdm.tqdm( + total=num_frames, unit='frames', + disable=verbose is not False) as pbar: + while seek < num_frames: + timestamp_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) + segment = pad_or_trim(mel[:, seek:], N_FRAMES) + segment_duration = segment.shape[-1] * HOP_LENGTH / SAMPLE_RATE + + decode_options["prompt"] = all_tokens[prompt_reset_since:] + result: DecodingResult = decode_with_fallback(segment) + tokens = paddle.to_tensor(result.tokens) + + if no_speech_threshold is not None: + # no voice activity check + should_skip = result.no_speech_prob > no_speech_threshold + if logprob_threshold is not None and result.avg_logprob > logprob_threshold: + # don't skip if the logprob is high enough, despite the no_speech_prob + should_skip = False 
+ + if should_skip: + seek += segment.shape[ + -1] # fast-forward to the next segment boundary + continue + + timestamp_tokens: paddle.Tensor = tokens.greater_equal( + paddle.to_tensor(tokenizer.timestamp_begin)) + + consecutive = paddle.where(timestamp_tokens[:-1] & timestamp_tokens[ + 1:])[0] + if len( + consecutive + ) > 0: # if the output contains two consecutive timestamp tokens + consecutive = paddle.add(consecutive, paddle.to_tensor(1)) + last_slice = 0 + for current_slice in consecutive: + sliced_tokens = tokens[last_slice:current_slice] + start_timestamp_position = ( + sliced_tokens[0].item() - tokenizer.timestamp_begin) + end_timestamp_position = ( + sliced_tokens[-1].item() - tokenizer.timestamp_begin) + add_segment( + start=timestamp_offset + start_timestamp_position * + time_precision, + end=timestamp_offset + end_timestamp_position * + time_precision, + text_tokens=sliced_tokens[1:-1], + result=result, ) + last_slice = current_slice + last_timestamp_position = ( + tokens[last_slice - 1].item() - tokenizer.timestamp_begin) + seek += last_timestamp_position * input_stride + all_tokens.extend(tokens[:last_slice + 1].tolist()) + else: + duration = segment_duration + timestamps = tokens[timestamp_tokens.nonzero().flatten()] + if len(timestamps) > 0 and timestamps[ + -1].item() != tokenizer.timestamp_begin: + # no consecutive timestamps but it has a timestamp; use the last one. + # single timestamp at the end means no speech after the last timestamp. + last_timestamp_position = timestamps[ + -1].item() - tokenizer.timestamp_begin + duration = last_timestamp_position * time_precision + + add_segment( + start=timestamp_offset, + end=timestamp_offset + duration, + text_tokens=tokens, + result=result, ) + + seek += segment.shape[-1] + all_tokens.extend(tokens.tolist()) + + if not condition_on_previous_text or result.temperature > 0.5: + # do not feed the prompt tokens if a high temperature was used + prompt_reset_since = len(all_tokens) + + # update progress bar + pbar.update(min(num_frames, seek) - previous_seek_value) + previous_seek_value = seek + + return dict( + text=tokenizer.decode(all_tokens[len(initial_prompt):]), + segments=all_segments, + language=language) + + +class SequenceRanker: + def rank(self, + tokens: List[List[paddle.Tensor]], + sum_logprobs: List[List[float]]) -> List[int]: + """ + Given a list of groups of samples and their cumulative log probabilities, + return the indices of the samples in each group to select as the final result + """ + raise NotImplementedError + + +class MaximumLikelihoodRanker(SequenceRanker): + """ + Select the sample with the highest log probabilities, penalized using either + a simple length normalization or Google NMT paper's length penalty + """ + + def __init__(self, length_penalty: Optional[float]): + self.length_penalty = length_penalty + + def rank(self, + tokens: List[List[paddle.Tensor]], + sum_logprobs: List[List[float]]): + def scores(logprobs, lengths): + result = [] + for logprob, length in zip(logprobs, lengths): + if self.length_penalty is None: + penalty = length + else: + # from the Google NMT paper + penalty = ((5 + length) / 6)**self.length_penalty + result.append(logprob / penalty) + return result + + # get the sequence with the highest score + lengths = [[len(t) for t in s] for s in tokens] + return [np.argmax(scores(p, l)) for p, l in zip(sum_logprobs, lengths)] + + +class TokenDecoder: + def reset(self): + """Initialize any stateful variables for decoding a new sequence""" + + def update(self, + tokens: 
paddle.Tensor, + logits: paddle.Tensor, + sum_logprobs: paddle.Tensor) -> Tuple[paddle.Tensor, bool]: + """Specify how to select the next token, based on the current trace and logits + + Parameters + ---------- + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + sum_logprobs : Tensor, shape = (n_batch) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Tensor, shape = (n_batch, current_sequence_length + 1) + the tokens, appended with the selected next token + + completed : bool + True if all sequences has reached the end of text + + """ + raise NotImplementedError + + def finalize( + self, tokens: paddle.Tensor, sum_logprobs: paddle.Tensor + ) -> Tuple[Sequence[Sequence[paddle.Tensor]], List[List[float]]]: + """Finalize search and return the final candidate sequences + + Parameters + ---------- + tokens : Tensor, shape = (batch_size, beam_size, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence + + sum_logprobs : Tensor, shape = (batch_size, beam_size) + cumulative log probabilities for each sequence + + Returns + ------- + tokens : Sequence[Sequence[Tensor]], length = batch_size + sequence of Tensors containing candidate token sequences, for each audio input + + sum_logprobs : List[List[float]], length = batch_size + sequence of cumulative log probabilities corresponding to the above + + """ + raise NotImplementedError + + +class GreedyDecoder(TokenDecoder): + def __init__(self, temperature: float, eot: int): + self.temperature = temperature + self.eot = eot + + def update(self, + tokens: paddle.Tensor, + logits: paddle.Tensor, + sum_logprobs: paddle.Tensor) -> Tuple[paddle.Tensor, bool]: + temperature = self.temperature + if temperature == 0: + next_tokens = paddle.argmax(logits, axis=-1) + else: + next_tokens = Categorical(logits=logits / temperature).sample( + shape=logits.shape) + + logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + current_logprobs = logprobs[paddle.arange(logprobs.shape[0]), + next_tokens] + sum_logprobs += current_logprobs * paddle.to_tensor( + (tokens[:, -1] != self.eot), dtype=paddle.float32) + + next_tokens[tokens[:, -1] == self.eot] = self.eot + tokens = paddle.concat([tokens, next_tokens[:, None]], axis=-1) + + completed = paddle.all((tokens[:, -1] == self.eot)) + return tokens, completed + + def finalize(self, tokens: paddle.Tensor, sum_logprobs: paddle.Tensor): + # make sure each sequence has at least one EOT token at the end + tokens = F.pad(tokens, (0, 1), value=self.eot, data_format="NCL") + return tokens, sum_logprobs.tolist() + + +class BeamSearchDecoder(TokenDecoder): + def __init__(self, + beam_size: int, + eot: int, + inference: Inference, + patience: Optional[float]=None): + self.beam_size = beam_size + self.eot = eot + self.inference = inference + self.patience = patience or 1.0 + self.max_candidates: int = round(beam_size * self.patience) + self.finished_sequences = None + + assert self.max_candidates > 0, f"Invalid beam size ({beam_size}) or patience ({patience})" + + def reset(self): + self.finished_sequences = None + + def update(self, + tokens: paddle.Tensor, + logits: paddle.Tensor, + sum_logprobs: paddle.Tensor) -> Tuple[paddle.Tensor, bool]: + if tokens.shape[0] % self.beam_size != 0: + raise ValueError(f"{tokens.shape}[0] % 
{self.beam_size} != 0") + + batch_size = tokens.shape[0] // self.beam_size + if self.finished_sequences is None: # for the first update + self.finished_sequences = [{} for _ in range(batch_size)] + + logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + next_tokens, source_indices, finished_sequences = [], [], [] + for i in range(batch_size): + scores, sources, finished = {}, {}, {} + + # STEP 1: calculate the cumulative log probabilities for possible candidates + for j in range(self.beam_size): + idx = i * self.beam_size + j + prefix = tokens[idx].tolist() + logprob, token = paddle.topk( + logprobs[idx], k=self.beam_size + 1) + for logprob, token in zip(logprob, token): + new_logprob = (sum_logprobs[idx] + logprob).tolist()[0] + sequence = tuple(prefix + [token.tolist()[0]]) + scores[sequence] = new_logprob + sources[sequence] = idx + + # STEP 2: rank the candidates and keep the top beam_size sequences for each audio + saved = 0 + for sequence in sorted(scores, key=scores.get, reverse=True): + if sequence[-1] == self.eot: + finished[sequence] = scores[sequence] + else: + sum_logprobs[len(next_tokens)] = scores[sequence] + next_tokens.append(sequence) + source_indices.append(sources[sequence]) + + saved += 1 + if saved == self.beam_size: + break + + finished_sequences.append(finished) + + tokens = paddle.to_tensor(next_tokens) + self.inference.rearrange_kv_cache(source_indices) + + # add newly finished sequences to self.finished_sequences + assert len(self.finished_sequences) == len(finished_sequences) + for previously_finished, newly_finished in zip(self.finished_sequences, + finished_sequences): + for seq in sorted( + newly_finished, key=newly_finished.get, reverse=True): + if len(previously_finished) >= self.max_candidates: + break # the candidate list is full + previously_finished[seq] = newly_finished[seq] + + # mark as completed if all audio has enough number of samples + completed = all( + len(sequences) >= self.max_candidates + for sequences in self.finished_sequences) + return tokens, completed + + def finalize(self, + preceding_tokens: paddle.Tensor, + sum_logprobs: paddle.Tensor): + # collect all finished sequences, including patience, and add unfinished ones if not enough + sum_logprobs = sum_logprobs.cpu() + for i, sequences in enumerate(self.finished_sequences): + if len(sequences + ) < self.beam_size: # when not enough sequences are finished + for j in list(np.argsort(sum_logprobs[i]))[::-1]: + sequence = preceding_tokens[i, j].tolist() + [self.eot] + sequences[tuple(sequence)] = sum_logprobs[i][j].item() + if len(sequences) >= self.beam_size: + break + + tokens: List[List[paddle.Tensor]] = [ + [paddle.to_tensor(seq) for seq in sequences.keys()] + for sequences in self.finished_sequences + ] + sum_logprobs: List[List[float]] = [ + list(sequences.values()) for sequences in self.finished_sequences + ] + return tokens, sum_logprobs + + +class LogitFilter: + def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor) -> None: + """Apply any filtering or masking to logits in-place + + Parameters + ---------- + logits : Tensor, shape = (n_batch, vocab_size) + per-token logits of the probability distribution at the current step + + tokens : Tensor, shape = (n_batch, current_sequence_length) + all tokens in the context so far, including the prefix and sot_sequence tokens + + """ + raise NotImplementedError + + +class SuppressBlank(LogitFilter): + def __init__(self, tokenizer: Tokenizer, sample_begin: int): + self.tokenizer = tokenizer + self.sample_begin = sample_begin 
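Every `LogitFilter` subclass here follows the same contract: mutate the logits tensor in place so that any token whose logit becomes -inf receives exactly zero probability after the softmax and can never be sampled. A minimal standalone sketch of that mechanism (not part of the patch; the token ids are invented for the demo):

import numpy as np
import paddle
import paddle.nn.functional as F

logits = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0]])  # (n_batch, vocab_size)
banned = [1, 3]  # hypothetical token ids to suppress
logits[:, banned] = -np.inf  # same in-place masking style as SuppressTokens.apply
print(F.softmax(logits, axis=-1))
# [[0.1192, 0.0000, 0.8808, 0.0000]]; the banned ids get zero probability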
+ + def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor): + if tokens.shape[1] == self.sample_begin: + logits[:, self.tokenizer.encode(" ").input_ids + + [self.tokenizer.eot]] = -np.inf + + +class SuppressTokens(LogitFilter): + def __init__(self, suppress_tokens: Sequence[int]): + self.suppress_tokens = list(suppress_tokens) + + def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor): + logits[:, self.suppress_tokens] = -np.inf + + +class ApplyTimestampRules(LogitFilter): + def __init__(self, + tokenizer: Tokenizer, + sample_begin: int, + max_initial_timestamp_index: Optional[int]): + self.tokenizer = tokenizer + self.sample_begin = sample_begin + self.max_initial_timestamp_index = max_initial_timestamp_index + + def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor): + # suppress <|notimestamps|> which is handled by without_timestamps + if self.tokenizer.no_timestamps is not None: + logits[:, self.tokenizer.no_timestamps] = -np.inf + + # timestamps have to appear in pairs, except directly before EOT; mask logits accordingly + for k in range(tokens.shape[0]): + seq = [t for t in tokens[k, self.sample_begin:].tolist()] + last_was_timestamp = len(seq) >= 1 and seq[ + -1] >= self.tokenizer.timestamp_begin + penultimate_was_timestamp = len(seq) < 2 or seq[ + -2] >= self.tokenizer.timestamp_begin + + if last_was_timestamp: + if penultimate_was_timestamp: # has to be non-timestamp + logits[k, self.tokenizer.timestamp_begin:] = -np.inf + else: # cannot be normal text tokens + logits[k, :self.tokenizer.eot] = -np.inf + + # apply the `max_initial_timestamp` option + if tokens.shape[ + 1] == self.sample_begin and self.max_initial_timestamp_index is not None: + last_allowed = self.tokenizer.timestamp_begin + self.max_initial_timestamp_index + logits[:, last_allowed + 1:] = -np.inf + + # if sum of probability over timestamps is above any other token, sample timestamp + logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) + for k in range(tokens.shape[0]): + timestamp_logprob = paddle.logsumexp( + logprobs[k, self.tokenizer.timestamp_begin:], axis=-1) + max_text_token_logprob = paddle.max( + logprobs[k, :self.tokenizer.timestamp_begin]) + if timestamp_logprob > max_text_token_logprob: + logits[k, :self.tokenizer.timestamp_begin] = -np.inf + + +class DecodingTask: + inference: Inference + sequence_ranker: SequenceRanker + decoder: TokenDecoder + logit_filters: List[LogitFilter] + + def __init__(self, + model: "Whisper", + options: DecodingOptions, + resource_path: str): + self.model = model + + language = options.language or "en" + tokenizer = get_tokenizer( + model.is_multilingual, + resource_path=resource_path, + language=language, + task=options.task) + self.tokenizer: Tokenizer = tokenizer + self.options: DecodingOptions = self._verify_options(options) + self.resource_path: str = resource_path + + self.beam_size: int = options.beam_size or options.best_of or 1 + self.n_ctx: int = model.dims.n_text_ctx + self.sample_len: int = options.sample_len or model.dims.n_text_ctx // 2 + + self.sot_sequence: Tuple[int] = tokenizer.sot_sequence + if self.options.without_timestamps: + self.sot_sequence = tokenizer.sot_sequence_including_notimestamps + + self.initial_tokens: Tuple[int] = self._get_initial_tokens() + self.sample_begin: int = len(self.initial_tokens) + self.sot_index: int = self.initial_tokens.index(tokenizer.sot) + + # inference: implements the forward pass through the decoder, including kv caching + self.inference = WhisperInference(model, len(self.initial_tokens)) 
+
+        # sequence ranker: implements how to rank a group of sampled sequences
+        self.sequence_ranker = MaximumLikelihoodRanker(options.length_penalty)
+
+        # decoder: implements how to select the next tokens, given the autoregressive distribution
+        if options.beam_size is not None:
+            self.decoder = BeamSearchDecoder(options.beam_size, tokenizer.eot,
+                                             self.inference, options.patience)
+        else:
+            self.decoder = GreedyDecoder(options.temperature, tokenizer.eot)
+
+        # logit filters: applies various rules to suppress or penalize certain tokens
+        self.logit_filters = []
+        if self.options.suppress_blank:
+            self.logit_filters.append(
+                SuppressBlank(self.tokenizer, self.sample_begin))
+        if self.options.suppress_tokens:
+            self.logit_filters.append(
+                SuppressTokens(self._get_suppress_tokens()))
+        if not options.without_timestamps:
+            precision = CHUNK_LENGTH / model.dims.n_audio_ctx  # usually 0.02 seconds
+            max_initial_timestamp_index = None
+            if options.max_initial_timestamp:
+                max_initial_timestamp_index = round(
+                    self.options.max_initial_timestamp / precision)
+            self.logit_filters.append(
+                ApplyTimestampRules(tokenizer, self.sample_begin,
+                                    max_initial_timestamp_index))
+
+    def _verify_options(self, options: DecodingOptions) -> DecodingOptions:
+        if options.beam_size is not None and options.best_of is not None:
+            raise ValueError("beam_size and best_of can't be given together")
+        if options.temperature == 0:
+            if options.best_of is not None:
+                raise ValueError(
+                    "best_of with greedy sampling (T=0) is not compatible")
+        if options.patience is not None and options.beam_size is None:
+            raise ValueError("patience requires beam_size to be given")
+        if options.length_penalty is not None and not (
+                0 <= options.length_penalty <= 1):
+            raise ValueError(
+                "length_penalty (alpha) should be a value between 0 and 1")
+
+        return options
+
+    def _get_initial_tokens(self) -> Tuple[int]:
+        tokens = list(self.sot_sequence)
+        prefix = self.options.prefix
+        prompt = self.options.prompt
+
+        if prefix:
+            prefix_tokens = (
+                self.tokenizer.encode(" " + prefix.strip()).input_ids
+                if isinstance(prefix, str) else prefix)
+            if self.sample_len is not None:
+                max_prefix_len = self.n_ctx // 2 - self.sample_len
+                prefix_tokens = prefix_tokens[-max_prefix_len:]
+            tokens = tokens + prefix_tokens
+
+        if prompt:
+            prompt_tokens = (
+                self.tokenizer.encode(" " + prompt.strip()).input_ids
+                if isinstance(prompt, str) else prompt)
+            tokens = [self.tokenizer.sot_prev] + prompt_tokens[-(self.n_ctx // 2
+                                                                 - 1):] + tokens
+
+        return tuple(tokens)
+
+    def _get_suppress_tokens(self) -> Tuple[int]:
+        suppress_tokens = self.options.suppress_tokens
+
+        if isinstance(suppress_tokens, str):
+            suppress_tokens = [int(t) for t in suppress_tokens.split(",")]
+
+        if -1 in suppress_tokens:
+            suppress_tokens = [t for t in suppress_tokens if t >= 0]
+            suppress_tokens.extend(self.tokenizer.non_speech_tokens)
+        elif suppress_tokens is None or len(suppress_tokens) == 0:
+            suppress_tokens = []  # interpret empty string as an empty list
+        else:
+            assert isinstance(suppress_tokens,
+                              list), "suppress_tokens must be a list"
+
+        suppress_tokens.extend([
+            self.tokenizer.sot, self.tokenizer.sot_prev, self.tokenizer.sot_lm
+        ])
+        if self.tokenizer.no_speech is not None:
+            # no-speech probability is collected separately
+            suppress_tokens.append(self.tokenizer.no_speech)
+
+        return tuple(sorted(set(suppress_tokens)))
+
+    def _get_audio_features(self, mel: paddle.Tensor):
+        #if self.options.fp16:
+        #    mel = mel.half()
+
+        if mel.shape[-2:] ==
+        if mel.shape[-2:] == (self.model.dims.n_audio_ctx,
+                              self.model.dims.n_audio_state):
+            # encoded audio features are given; skip audio encoding
+            audio_features = mel
+        else:
+            audio_features = self.model.encoder(mel)
+
+        #if audio_features.dtype != (np.float16 if self.options.fp16 else np.float32):
+        #    return TypeError(f"audio_features has an incorrect dtype: {audio_features.dtype}")
+
+        return audio_features
+
+    def _detect_language(self,
+                         audio_features: paddle.Tensor,
+                         tokens: paddle.Tensor,
+                         resource_path: str):
+        languages = [self.options.language] * audio_features.shape[0]
+        lang_probs = None
+
+        if self.options.language is None or self.options.task == "lang_id":
+            lang_tokens, lang_probs = self.model.detect_language(
+                audio_features, self.tokenizer, self.resource_path)
+            languages = [max(probs, key=probs.get) for probs in lang_probs]
+            if self.options.language is None:
+                tokens[:, self.sot_index +
+                       1] = lang_tokens  # write language tokens
+
+        return languages, lang_probs
+
+    def _main_loop(self, audio_features: paddle.Tensor, tokens: paddle.Tensor):
+        assert audio_features.shape[0] == tokens.shape[0]
+        n_batch = tokens.shape[0]
+        sum_logprobs: paddle.Tensor = paddle.zeros(
+            [n_batch], dtype=paddle.float32)
+        no_speech_probs = [np.nan] * n_batch
+
+        try:
+            for i in range(self.sample_len):
+                logits = self.inference.logits(tokens, audio_features)
+
+                if i == 0 and self.tokenizer.no_speech is not None:  # save no_speech_probs
+                    probs_at_sot = F.softmax(
+                        logits[:, self.sot_index],
+                        axis=-1,
+                        dtype=paddle.float32)
+                    no_speech_probs = probs_at_sot[:, self.tokenizer.
+                                                   no_speech].tolist()
+
+                # now we need to consider the logits at the last token only
+                logits = logits[:, -1]
+
+                # apply the logit filters, e.g. for suppressing or penalizing certain tokens
+                for logit_filter in self.logit_filters:
+                    logit_filter.apply(logits, tokens)
+
+                # expand the tokens tensor with the selected next tokens
+                tokens, completed = self.decoder.update(tokens, logits,
+                                                        sum_logprobs)
+                if completed or tokens.shape[-1] > self.n_ctx:
+                    break
+        finally:
+            self.inference.cleanup_caching()
+
+        return tokens, sum_logprobs, no_speech_probs
+
+    @paddle.no_grad()
+    def run(self, mel: paddle.Tensor) -> List[DecodingResult]:
+        self.decoder.reset()
+        tokenizer: Tokenizer = self.tokenizer
+        batch_size: int = mel.shape[0]
+
+        audio_features: paddle.Tensor = self._get_audio_features(
+            mel)  # encoder forward pass
+
+        # one row of initial tokens per audio segment in the batch
+        tokens: paddle.Tensor = paddle.tile(
+            paddle.to_tensor([self.initial_tokens]), [batch_size, 1])
+
+        # detect language if requested, overwriting the language token
+        languages, language_probs = self._detect_language(
+            audio_features, tokens, self.resource_path)
+
+        if self.options.task == "lang_id":
+            return [
+                DecodingResult(
+                    audio_features=features,
+                    language=language,
+                    language_probs=probs)
+                for features, language, probs in zip(audio_features, languages,
+                                                     language_probs)
+            ]
+
+        # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
+        audio_features = paddle.repeat_interleave(
+            audio_features, self.beam_size, axis=0)
+        tokens = paddle.repeat_interleave(tokens, self.beam_size, axis=0)
+
+        # call the main sampling loop
+        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features,
+                                                                tokens)
+
+        # reshape the tensors to have (batch_size, beam_size) as the first two dimensions
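+        # (at this point `tokens` and `sum_logprobs` hold batch_size * beam_size
+        # rows -- one per sampled sequence; the stride-slicing below keeps a
+        # single copy of the audio features and no-speech probabilities per
+        # input segment)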
+        audio_features = audio_features[::self.beam_size]
+        no_speech_probs = no_speech_probs[::self.beam_size]
+        assert audio_features.shape[0] == len(no_speech_probs) == batch_size
+
+        tokens = tokens.reshape([batch_size, self.beam_size, -1])
+        sum_logprobs = sum_logprobs.reshape([batch_size, self.beam_size])
+
+        # get the final candidates for each group, and slice between the first sampled token and EOT
+        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
+        tokens: List[List[paddle.Tensor]] = [[
+            t[self.sample_begin:(t == tokenizer.eot).nonzero()[0, 0]] for t in s
+        ] for s in tokens]
+
+        # select the top-ranked sample in each group
+        selected = self.sequence_ranker.rank(tokens, sum_logprobs)
+        tokens: List[List[
+            int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
+        texts: List[str] = [tokenizer.decode(t).strip() for t in tokens]
+
+        sum_logprobs: List[
+            float] = [lp[i] for i, lp in zip(selected, sum_logprobs)]
+        avg_logprobs: List[
+            float] = [lp / (len(t) + 1) for t, lp in zip(tokens, sum_logprobs)]
+
+        fields = (texts, languages, tokens, audio_features, avg_logprobs,
+                  no_speech_probs)
+        if len(set(map(len, fields))) != 1:
+            raise RuntimeError(
+                f"inconsistent result lengths: {list(map(len, fields))}")
+
+        return [
+            DecodingResult(
+                audio_features=features,
+                language=language,
+                tokens=tokens,
+                text=text,
+                avg_logprob=avg_logprob,
+                no_speech_prob=no_speech_prob,
+                temperature=self.options.temperature,
+                compression_ratio=utils.compression_ratio(text), )
+            for text, language, tokens, features, avg_logprob, no_speech_prob in
+            zip(*fields)
+        ]
+
+
+@paddle.no_grad()
+def decode(
+        model: "Whisper",
+        mel: paddle.Tensor,
+        options: DecodingOptions=DecodingOptions(),
+        resource_path: str=None, ) -> Union[DecodingResult, List[DecodingResult]]:
+    """
+    Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
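+
+    Example
+    -------
+    An illustrative sketch only (assumes `model` and `resource_path` are
+    already loaded; `pad_or_trim` and `log_mel_spectrogram` are defined
+    later in this module)::
+
+        audio = pad_or_trim(audio)  # pad/trim the waveform to 30 seconds
+        mel = log_mel_spectrogram(audio, resource_path=resource_path)
+        result = decode(model, mel, DecodingOptions(task="transcribe"),
+                        resource_path)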
+
+    Parameters
+    ----------
+    model: Whisper
+        the Whisper model instance
+
+    mel: paddle.Tensor, shape = (80, 3000) or (*, 80, 3000)
+        A tensor containing the Mel spectrogram(s)
+
+    options: DecodingOptions
+        A dataclass that contains all necessary options for decoding 30-second segments
+
+    resource_path: str
+        Path to the resource directory holding the tokenizer and filterbank assets
+
+    Returns
+    -------
+    result: Union[DecodingResult, List[DecodingResult]]
+        The result(s) of decoding contained in `DecodingResult` dataclass instance(s)
+    """
+    single = mel.ndim == 2
+    if single:
+        mel = mel.unsqueeze(0)
+
+    result = DecodingTask(model, options, resource_path).run(mel)
+
+    if single:
+        result = result[0]
+
+    return result
+
+
+class Whisper(nn.Layer):
+    def __init__(self, dims: ModelDimensions):
+        super().__init__()
+        self.dims = dims
+        self.encoder = AudioEncoder(
+            self.dims.n_mels,
+            self.dims.n_audio_ctx,
+            self.dims.n_audio_state,
+            self.dims.n_audio_head,
+            self.dims.n_audio_layer, )
+        self.decoder = TextDecoder(
+            self.dims.n_vocab,
+            self.dims.n_text_ctx,
+            self.dims.n_text_state,
+            self.dims.n_text_head,
+            self.dims.n_text_layer, )
+
+    def embed_audio(self, mel: paddle.Tensor):
+        return self.encoder.forward(mel)
+
+    def logits(self, tokens: paddle.Tensor, audio_features: paddle.Tensor):
+        return self.decoder.forward(tokens, audio_features)
+
+    def forward(self, mel: paddle.Tensor,
+                tokens: paddle.Tensor) -> Dict[str, paddle.Tensor]:
+        return self.decoder(tokens, self.encoder(mel))
+
+    @property
+    def device(self):
+        return paddle.device.get_device()
+
+    @property
+    def is_multilingual(self):
+        return self.dims.n_vocab == 51865
+
+    def install_kv_cache_hooks(self, cache: Optional[dict]=None):
+        """
+        The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
+        tensors calculated for the previous positions. This method returns a dictionary that stores
+        all caches, and the necessary hooks for the key and value projection modules that save the
+        intermediate tensors to be reused during later calculations.
+
+        Returns
+        -------
+        cache : Dict[nn.Layer, paddle.Tensor]
+            A dictionary object mapping the key/value projection modules to their caches
+        hooks : List[RemovableHandle]
+            A list of RemovableHandle objects; call `remove()` on each to detach the hooks
+        """
+        cache = {**cache} if cache is not None else {}
+        hooks = []
+
+        def save_to_cache(module, _, output):
+            if module not in cache or output.shape[
+                    1] > self.decoder.positional_embedding.shape[0]:
+                cache[
+                    module] = output  # save as-is, for the first token or cross attention
+            else:
+                cache[module] = paddle.concat(
+                    [cache[module], output], axis=1).detach()
+            return cache[module]
+
+        def install_hooks(layer: nn.Layer):
+            if isinstance(layer, MultiHeadAttention):
+                hooks.append(
+                    layer.key.register_forward_post_hook(save_to_cache))
+                hooks.append(
+                    layer.value.register_forward_post_hook(save_to_cache))
+
+        self.decoder.apply(install_hooks)
+        return cache, hooks
+
+    detect_language = detect_language
+    transcribe = transcribe
+    decode = decode
+
+
+def pad_or_trim(array, length: int=N_SAMPLES, *, axis: int=-1):
+    """
+    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
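+
+    Illustrative sketch (assumes the default 16 kHz / 30 s constants, so
+    N_SAMPLES == 480000; values are hypothetical)::
+
+        audio = np.zeros([160000], dtype=np.float32)  # 10 s of silence
+        audio = pad_or_trim(audio)                    # -> shape (480000,)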
+ """ + if paddle.is_tensor(array): + if array.shape[axis] > length: + array = array.index_select(axis=axis, index=paddle.arange(length)) + + if array.shape[axis] < length: + pad_widths = [(0, 0)] * array.ndim + pad_widths[axis] = (0, length - array.shape[axis]) + array = paddle.transpose(array, (1, 0)) + array = F.pad( + array, [pad for sizes in pad_widths[::-1] for pad in sizes], + data_format='NLC') + array = paddle.transpose(array, (1, 0)) + else: + if array.shape[axis] > length: + array = array.take(indices=range(length), axis=axis) + + if array.shape[axis] < length: + pad_widths = [(0, 0)] * array.ndim + pad_widths[axis] = (0, length - array.shape[axis]) + array = paddle.transpose(array, (1, 0)) + array = np.pad(array, pad_widths) + array = paddle.transpose(array, (1, 0)) + + return array + + +def hann_window(n_fft: int=N_FFT): + """ + hanning window + n_fft: The number of frequency components of the discrete Fourier transform. + """ + return paddle.to_tensor( + [0.5 - 0.5 * np.cos(2 * np.pi * n / n_fft) for n in range(n_fft)], + dtype=paddle.float32) + + +@lru_cache(maxsize=None) +def mel_filters(resource_path: str, n_mels: int=N_MELS) -> paddle.Tensor: + """ + load the mel filterbank matrix for projecting STFT into a Mel spectrogram. + Allows decoupling librosa dependency; saved using: + + np.savez_compressed( + "mel_filters.npz", + mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), + ) + """ + assert n_mels == 80, f"Unsupported n_mels: {n_mels}" + with np.load(os.path.join(resource_path, "assets", "mel_filters.npz")) as f: + return paddle.to_tensor(f[f"mel_{n_mels}"]) + + +def log_mel_spectrogram(audio: Union[str, np.ndarray, paddle.Tensor], + n_mels: int=N_MELS, + resource_path: str=None): + """ + Compute the log-Mel spectrogram of + + Parameters + ---------- + audio: Union[str, np.ndarray, paddle.Tensor], shape = (*) + The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz + + n_mels: int + The number of Mel-frequency filters, only 80 is supported + + Returns + ------- + paddle.Tensor, shape = (80, n_frames) + A Tensor that contains the Mel spectrogram + """ + if not paddle.is_tensor(audio): + if isinstance(audio, str): + audio, _ = soundfile.read(audio, dtype="float32", always_2d=True) + audio = audio[:, 0] + logger.info(f"audio shape: {audio.shape}") + audio = paddle.to_tensor(audio) + + window = hann_window(N_FFT) + stft = paddle.signal.stft(audio, N_FFT, HOP_LENGTH, window=window) + + magnitudes = stft[:, :-1].abs()**2 + + filters = mel_filters(resource_path, n_mels) + mel_spec = filters @ magnitudes + mel_spec = paddle.to_tensor(mel_spec.numpy().tolist()) + + log_spec = paddle.clip(mel_spec, min=1e-10).log10() + log_spec = paddle.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + return log_spec diff --git a/paddlespeech/s2t/models/whisper/whisper_LICENSE b/paddlespeech/s2t/models/whisper/whisper_LICENSE new file mode 100755 index 000000000..49e465e19 --- /dev/null +++ b/paddlespeech/s2t/models/whisper/whisper_LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 OpenAI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following 
conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/paddlespeech/s2t/modules/__init__.py b/paddlespeech/s2t/modules/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/activation.py b/paddlespeech/s2t/modules/activation.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py old mode 100644 new mode 100755 index 128f87c07..d9568dcc9 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -19,7 +19,6 @@ from typing import Tuple import paddle from paddle import nn -from paddle.nn import functional as F from paddle.nn import initializer as I from paddlespeech.s2t.modules.align import Linear @@ -56,16 +55,6 @@ class MultiHeadedAttention(nn.Layer): self.linear_out = Linear(n_feat, n_feat) self.dropout = nn.Dropout(p=dropout_rate) - def _build_once(self, *args, **kwargs): - super()._build_once(*args, **kwargs) - # if self.self_att: - # self.linear_kv = Linear(self.n_feat, self.n_feat*2) - if not self.training: - self.weight = paddle.concat( - [self.linear_k.weight, self.linear_v.weight], axis=-1) - self.bias = paddle.concat([self.linear_k.bias, self.linear_v.bias]) - self._built = True - def forward_qkv(self, query: paddle.Tensor, key: paddle.Tensor, @@ -87,13 +76,8 @@ class MultiHeadedAttention(nn.Layer): n_batch = query.shape[0] q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) - if self.training: - k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) - v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) - else: - k, v = F.linear(key, self.weight, self.bias).view( - n_batch, -1, 2 * self.h, self.d_k).split( - 2, axis=2) + k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) + v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k) k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k) diff --git a/paddlespeech/s2t/modules/cmvn.py b/paddlespeech/s2t/modules/cmvn.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/crf.py b/paddlespeech/s2t/modules/crf.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/fbank.py b/paddlespeech/s2t/modules/fbank.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/loss.py b/paddlespeech/s2t/modules/loss.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/mask.py b/paddlespeech/s2t/modules/mask.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/__init__.py b/paddlespeech/s2t/training/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/extensions/__init__.py b/paddlespeech/s2t/training/extensions/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/extensions/evaluator.py b/paddlespeech/s2t/training/extensions/evaluator.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/extensions/extension.py b/paddlespeech/s2t/training/extensions/extension.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/extensions/plot.py b/paddlespeech/s2t/training/extensions/plot.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/extensions/snapshot.py b/paddlespeech/s2t/training/extensions/snapshot.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/extensions/visualizer.py b/paddlespeech/s2t/training/extensions/visualizer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/optimizer.py b/paddlespeech/s2t/training/optimizer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/reporter.py b/paddlespeech/s2t/training/reporter.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py
index cd8a5498b..f04e130db 100755
--- a/paddlespeech/s2t/training/scheduler.py
+++ b/paddlespeech/s2t/training/scheduler.py
@@ -229,6 +229,125 @@ class NewBobScheduler(LRScheduler):
+@register_scheduler
+class NewBobScheduler(LRScheduler):
+    """Scheduler with the new-bob technique, used for LR annealing.
+
+    The learning rate is annealed based on the validation performance.
+    In particular, if (past_loss - current_loss) / past_loss < improvement_threshold,
+    then lr = lr * annealing_factor.
+
+    Arguments
+    ---------
+    learning_rate : float
+        The initial learning rate value.
+    annealing_factor : float
+        The annealing factor used in the new-bob strategy.
+    improvement_threshold : float
+        The relative improvement between losses below which learning rate
+        annealing is triggered in the new-bob strategy.
+    patient : int
+        When the annealing condition is violated `patient` times,
+        the learning rate is finally reduced.
+
+    Example
+    -------
+    >>> scheduler = NewBobScheduler(learning_rate=1.0)
+    >>> scheduler.get_lr(metric_value=10.0)
+    1.0
+    >>> scheduler.get_lr(metric_value=2.0)
+    1.0
+    >>> scheduler.get_lr(metric_value=2.5)
+    0.5
+    """
+
+    def __init__(
+            self,
+            learning_rate,
+            last_epoch=-1,
+            verbose=False,
+            annealing_factor=0.5,
+            improvement_threshold=0.0025,
+            patient=0, ):
+        self.hyperparam_value = learning_rate
+        self.annealing_factor = annealing_factor
+        self.improvement_threshold = improvement_threshold
+        self.patient = patient
+        self.metric_values = []
+        self.current_patient = self.patient
+        super().__init__(learning_rate, last_epoch, verbose)
+
+    def step(self, metric_value=None):
+        """
+        ``step`` should be called after ``optimizer.step``. It updates the
+        learning rate in the optimizer according to the given ``metric_value``.
+        The new learning rate takes effect on the next ``optimizer.step``.
+
+        Args:
+            metric_value (float, None): the metric (e.g. validation loss) used
+                to decide whether to anneal. If None, the learning rate is
+                left unchanged.
+
+        Returns:
+            None
+        """
+        if metric_value is None:
+            self.last_epoch += 1
+            self.last_lr = self.hyperparam_value
+        else:
+            self.last_epoch += 1
+            self.last_lr = self.get_lr(metric_value)
+
+        if self.verbose:
+            print('Epoch {}: {} set learning rate to {}.'.format(
+                self.last_epoch, self.__class__.__name__, self.last_lr))
+
+    def get_lr(self, metric_value):
+        """Returns the new value for the hyperparameter.
+
+        Arguments
+        ---------
+        metric_value : float
+            A number for determining whether to change the hyperparameter value.
+        """
+        new_value = self.hyperparam_value
+        if len(self.metric_values) > 0:
+            prev_metric = self.metric_values[-1]
+            # Update value if improvement too small and patience is 0
+            if prev_metric == 0:  # Prevent division by zero
+                improvement = 0
+            else:
+                improvement = (prev_metric - metric_value) / prev_metric
+            if improvement < self.improvement_threshold:
+                if self.current_patient == 0:
+                    new_value *= self.annealing_factor
+                    self.current_patient = self.patient
+                else:
+                    self.current_patient -= 1
+
+        # Store relevant info
+        self.metric_values.append(metric_value)
+        self.hyperparam_value = new_value
+
+        return new_value
+
+    def save(self):
+        """Returns the scheduler state to be checkpointed."""
+        data = {
+            "current_epoch_index": self.last_epoch,
+            "hyperparam_value": self.hyperparam_value,
+            "metric_values": self.metric_values,
+            "current_patient": self.current_patient
+        }
+        return data
+
+    def load(self, data):
+        """Loads the needed information from the given checkpoint path."""
+        data = paddle.load(data)
+        self.last_epoch = data["current_epoch_index"]
+        self.hyperparam_value = data["hyperparam_value"]
+        self.metric_values = data["metric_values"]
+        self.current_patient = data["current_patient"]
+
+
 def dynamic_import_scheduler(module):
     """Import Scheduler class dynamically.
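A minimal sketch of how the new-bob scheduler added above behaves across a few
validation epochs (illustrative values only; it mirrors the docstring example
and assumes the defaults annealing_factor=0.5, improvement_threshold=0.0025,
patient=0):

    scheduler = NewBobScheduler(learning_rate=1.0)
    scheduler.step(metric_value=10.0)  # first metric recorded, lr stays 1.0
    scheduler.step(metric_value=2.0)   # 80% improvement, lr stays 1.0
    scheduler.step(metric_value=2.5)   # loss regressed, patience exhausted
    print(scheduler.last_lr)           # 0.5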
diff --git a/paddlespeech/s2t/training/timer.py b/paddlespeech/s2t/training/timer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/triggers/__init__.py b/paddlespeech/s2t/training/triggers/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/triggers/compare_value_trigger.py b/paddlespeech/s2t/training/triggers/compare_value_trigger.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/triggers/interval_trigger.py b/paddlespeech/s2t/training/triggers/interval_trigger.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/triggers/limit_trigger.py b/paddlespeech/s2t/training/triggers/limit_trigger.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/triggers/time_trigger.py b/paddlespeech/s2t/training/triggers/time_trigger.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/triggers/utils.py b/paddlespeech/s2t/training/triggers/utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/updaters/__init__.py b/paddlespeech/s2t/training/updaters/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/updaters/standard_updater.py b/paddlespeech/s2t/training/updaters/standard_updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/updaters/trainer.py b/paddlespeech/s2t/training/updaters/trainer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/training/updaters/updater.py b/paddlespeech/s2t/training/updaters/updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/__init__.py b/paddlespeech/s2t/utils/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/asr_utils.py b/paddlespeech/s2t/utils/asr_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/bleu_score.py b/paddlespeech/s2t/utils/bleu_score.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/check_kwargs.py b/paddlespeech/s2t/utils/check_kwargs.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/checkpoint.py b/paddlespeech/s2t/utils/checkpoint.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/cli_readers.py b/paddlespeech/s2t/utils/cli_readers.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/cli_utils.py b/paddlespeech/s2t/utils/cli_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/cli_writers.py b/paddlespeech/s2t/utils/cli_writers.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/dynamic_import.py b/paddlespeech/s2t/utils/dynamic_import.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/dynamic_pip_install.py b/paddlespeech/s2t/utils/dynamic_pip_install.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/error_rate.py b/paddlespeech/s2t/utils/error_rate.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/layer_tools.py b/paddlespeech/s2t/utils/layer_tools.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/log.py b/paddlespeech/s2t/utils/log.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/mp_tools.py b/paddlespeech/s2t/utils/mp_tools.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/s2t/utils/profiler.py b/paddlespeech/s2t/utils/profiler.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/socket_server.py b/paddlespeech/s2t/utils/socket_server.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/spec_augment.py b/paddlespeech/s2t/utils/spec_augment.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py old mode 100644 new mode 100755 diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md old mode 100644 new mode 100755 diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md old mode 100644 new mode 100755 diff --git a/paddlespeech/server/__init__.py b/paddlespeech/server/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/base_commands.py b/paddlespeech/server/base_commands.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/bin/__init__.py b/paddlespeech/server/bin/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py old mode 100644 new mode 100755 index 10a91d9be..1b1792bd1 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -113,7 +113,7 @@ class ServerExecutor(BaseExecutor): """ config = get_config(config_file) if self.init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) + uvicorn.run(app, host=config.host, port=config.port) @cli_server_register( diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/server/conf/tts_online_application.yaml b/paddlespeech/server/conf/tts_online_application.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/server/conf/vector_application.yaml b/paddlespeech/server/conf/vector_application.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/server/conf/ws_conformer_application.yaml b/paddlespeech/server/conf/ws_conformer_application.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/server/conf/ws_conformer_wenetspeech_application_faster.yaml b/paddlespeech/server/conf/ws_conformer_wenetspeech_application_faster.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/server/conf/ws_ds2_application.yaml b/paddlespeech/server/conf/ws_ds2_application.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/__init__.py b/paddlespeech/server/engine/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/acs/__init__.py b/paddlespeech/server/engine/acs/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/acs/python/__init__.py b/paddlespeech/server/engine/acs/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/acs/python/acs_engine.py b/paddlespeech/server/engine/acs/python/acs_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/__init__.py b/paddlespeech/server/engine/asr/__init__.py old mode 
100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/__init__.py b/paddlespeech/server/engine/asr/online/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/ctc_endpoint.py b/paddlespeech/server/engine/asr/online/ctc_endpoint.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/ctc_search.py b/paddlespeech/server/engine/asr/online/ctc_search.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/onnx/__init__.py b/paddlespeech/server/engine/asr/online/onnx/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/paddleinference/__init__.py b/paddlespeech/server/engine/asr/online/paddleinference/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/python/__init__.py b/paddlespeech/server/engine/asr/online/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/paddleinference/__init__.py b/paddlespeech/server/engine/asr/paddleinference/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/python/__init__.py b/paddlespeech/server/engine/asr/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/base_engine.py b/paddlespeech/server/engine/base_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/cls/__init__.py b/paddlespeech/server/engine/cls/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/cls/paddleinference/__init__.py b/paddlespeech/server/engine/cls/paddleinference/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/cls/python/__init__.py b/paddlespeech/server/engine/cls/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/cls/python/cls_engine.py b/paddlespeech/server/engine/cls/python/cls_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/engine_pool.py b/paddlespeech/server/engine/engine_pool.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/engine_warmup.py b/paddlespeech/server/engine/engine_warmup.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/text/__init__.py b/paddlespeech/server/engine/text/__init__.py old mode 100644 new mode 100755 diff 
--git a/paddlespeech/server/engine/text/python/__init__.py b/paddlespeech/server/engine/text/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/text/python/text_engine.py b/paddlespeech/server/engine/text/python/text_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/__init__.py b/paddlespeech/server/engine/tts/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/online/__init__.py b/paddlespeech/server/engine/tts/online/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/online/onnx/__init__.py b/paddlespeech/server/engine/tts/online/onnx/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/online/python/__init__.py b/paddlespeech/server/engine/tts/online/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/paddleinference/__init__.py b/paddlespeech/server/engine/tts/paddleinference/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/python/__init__.py b/paddlespeech/server/engine/tts/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/vector/__init__.py b/paddlespeech/server/engine/vector/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/vector/python/__init__.py b/paddlespeech/server/engine/vector/python/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/entry.py b/paddlespeech/server/entry.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/executor.py b/paddlespeech/server/executor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/__init__.py b/paddlespeech/server/restful/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/acs_api.py b/paddlespeech/server/restful/acs_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/api.py b/paddlespeech/server/restful/api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/asr_api.py b/paddlespeech/server/restful/asr_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/cls_api.py b/paddlespeech/server/restful/cls_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/text_api.py b/paddlespeech/server/restful/text_api.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/restful/vector_api.py b/paddlespeech/server/restful/vector_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/__init__.py b/paddlespeech/server/tests/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/asr/__init__.py b/paddlespeech/server/tests/asr/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/asr/offline/__init__.py b/paddlespeech/server/tests/asr/offline/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/asr/offline/http_client.py b/paddlespeech/server/tests/asr/offline/http_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/asr/online/README.md b/paddlespeech/server/tests/asr/online/README.md old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/asr/online/README_cn.md b/paddlespeech/server/tests/asr/online/README_cn.md old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/text/http_client.py b/paddlespeech/server/tests/text/http_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/tts/offline/http_client.py b/paddlespeech/server/tests/tts/offline/http_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/tts/online/http_client.py b/paddlespeech/server/tests/tts/online/http_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/tests/tts/online/ws_client.py b/paddlespeech/server/tests/tts/online/ws_client.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/__init__.py b/paddlespeech/server/utils/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/config.py b/paddlespeech/server/utils/config.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/errors.py b/paddlespeech/server/utils/errors.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/exception.py b/paddlespeech/server/utils/exception.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/onnx_infer.py b/paddlespeech/server/utils/onnx_infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/paddle_predictor.py b/paddlespeech/server/utils/paddle_predictor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/ws/__init__.py b/paddlespeech/server/ws/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py old mode 100644 new mode 100755 index 7d93c026e..57fe82a9c --- a/paddlespeech/t2s/__init__.py +++ b/paddlespeech/t2s/__init__.py @@ -18,5 +18,6 @@ from . import exps from . import frontend from . import models from . import modules +from . import ssml from . import training from . import utils diff --git a/paddlespeech/t2s/audio/__init__.py b/paddlespeech/t2s/audio/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/audio/audio.py b/paddlespeech/t2s/audio/audio.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/audio/codec.py b/paddlespeech/t2s/audio/codec.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/audio/spec_normalizer.py b/paddlespeech/t2s/audio/spec_normalizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/__init__.py b/paddlespeech/t2s/datasets/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/batch.py b/paddlespeech/t2s/datasets/batch.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/dataset.py b/paddlespeech/t2s/datasets/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/ljspeech.py b/paddlespeech/t2s/datasets/ljspeech.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/sampler.py b/paddlespeech/t2s/datasets/sampler.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/__init__.py b/paddlespeech/t2s/exps/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/csmsc_test.txt b/paddlespeech/t2s/exps/csmsc_test.txt old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/__init__.py b/paddlespeech/t2s/exps/ernie_sat/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/normalize.py b/paddlespeech/t2s/exps/ernie_sat/normalize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/preprocess.py b/paddlespeech/t2s/exps/ernie_sat/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize.py b/paddlespeech/t2s/exps/ernie_sat/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ernie_sat/utils.py b/paddlespeech/t2s/exps/ernie_sat/utils.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/t2s/exps/fastspeech2/__init__.py b/paddlespeech/t2s/exps/fastspeech2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/fastspeech2/normalize.py b/paddlespeech/t2s/exps/fastspeech2/normalize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py old mode 100644 new mode 100755 index 10e023d0c..d31e62a82 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -145,17 +145,27 @@ def train_sp(args, config): # copy conf to output_dir shutil.copyfile(args.config, output_dir / config_name) + if "enable_speaker_classifier" in config.model: + enable_spk_cls = config.model.enable_speaker_classifier + else: + enable_spk_cls = False + updater = FastSpeech2Updater( model=model, optimizer=optimizer, dataloader=train_dataloader, output_dir=output_dir, - **config["updater"]) + enable_spk_cls=enable_spk_cls, + **config["updater"], ) trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir) evaluator = FastSpeech2Evaluator( - model, dev_dataloader, output_dir=output_dir, **config["updater"]) + model, + dev_dataloader, + output_dir=output_dir, + enable_spk_cls=enable_spk_cls, + **config["updater"], ) if dist.get_rank() == 0: trainer.extend(evaluator, trigger=(1, "epoch")) diff --git a/paddlespeech/t2s/exps/fastspeech2/vc2_infer.py b/paddlespeech/t2s/exps/fastspeech2/vc2_infer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/README.md b/paddlespeech/t2s/exps/gan_vocoder/README.md old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/__init__.py b/paddlespeech/t2s/exps/gan_vocoder/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/__init__.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/__init__.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/normalize.py b/paddlespeech/t2s/exps/gan_vocoder/normalize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/__init__.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py old mode 100644 new mode 100755 index 5840c0699..e0ae20bb1 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -145,7 +145,7 @@ def main(): # warmup for utt_id, sentence in sentences[:3]: with timer() as t: - am_output_data = get_am_output( + mel = get_am_output( input=sentence, am_predictor=am_predictor, am=args.am, @@ -154,12 +154,11 @@ def main(): merge_sentences=merge_sentences, speaker_dict=args.speaker_dict, spk_id=args.spk_id, ) - wav = get_voc_output( - voc_predictor=voc_predictor, input=am_output_data) + wav = get_voc_output(voc_predictor=voc_predictor, input=mel) speed = wav.size / t.elapse rtf = fs / speed print( - f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) print("warm up done!") @@ -168,7 +167,7 @@ def main(): T = 0 for utt_id, sentence in sentences: with timer() as t: - am_output_data = get_am_output( + mel = get_am_output( input=sentence, am_predictor=am_predictor, am=args.am, @@ -177,8 +176,7 @@ def main(): merge_sentences=merge_sentences, speaker_dict=args.speaker_dict, spk_id=args.spk_id, ) - wav = get_voc_output( - voc_predictor=voc_predictor, input=am_output_data) + wav = get_voc_output(voc_predictor=voc_predictor, input=mel) N += wav.size T += t.elapse @@ -187,7 +185,7 @@ def main(): sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs) print( - f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." ) print(f"{utt_id} done!") diff --git a/paddlespeech/t2s/exps/inference_streaming.py b/paddlespeech/t2s/exps/inference_streaming.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/lite_predict.py b/paddlespeech/t2s/exps/lite_predict.py new file mode 100755 index 000000000..bd0c732b1 --- /dev/null +++ b/paddlespeech/t2s/exps/lite_predict.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
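+
+# A minimal invocation sketch (hypothetical paths; assumes `*_x86.nb` models
+# optimized for Paddle Lite already exist in --inference_dir):
+#
+#   python3 lite_predict.py \
+#       --am=fastspeech2_csmsc \
+#       --voc=pwgan_csmsc \
+#       --inference_dir=./pdlite \
+#       --phones_dict=./phone_id_map.txt \
+#       --text=./sentences.txt \
+#       --output_dir=./lite_infer_out \
+#       --lang=zh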
+import argparse
+from pathlib import Path
+
+import soundfile as sf
+from timer import timer
+
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_lite_am_output
+from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
+from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Paddle Lite inference with acoustic model & vocoder.")
+    # acoustic model
+    parser.add_argument(
+        '--am',
+        type=str,
+        default='fastspeech2_csmsc',
+        choices=[
+            'speedyspeech_csmsc',
+            'fastspeech2_csmsc',
+            'fastspeech2_aishell3',
+            'fastspeech2_ljspeech',
+            'fastspeech2_vctk',
+            'fastspeech2_mix',
+        ],
+        help='Choose acoustic model type of tts task.')
+    parser.add_argument(
+        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+    parser.add_argument(
+        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+    parser.add_argument(
+        "--speaker_dict", type=str, default=None, help="speaker id map file.")
+    parser.add_argument(
+        '--spk_id',
+        type=int,
+        default=0,
+        help='spk id for multi speaker acoustic model')
+    # voc
+    parser.add_argument(
+        '--voc',
+        type=str,
+        default='pwgan_csmsc',
+        choices=[
+            'pwgan_csmsc',
+            'pwgan_aishell3',
+            'pwgan_ljspeech',
+            'pwgan_vctk',
+            'mb_melgan_csmsc',
+            'hifigan_csmsc',
+            'hifigan_aishell3',
+            'hifigan_ljspeech',
+            'hifigan_vctk',
+        ],
+        help='Choose vocoder type of tts task.')
+    # other
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        help='Choose model language. zh or en or mix')
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line")
+    parser.add_argument(
+        "--inference_dir", type=str, help="dir to save inference models")
+    parser.add_argument("--output_dir", type=str, help="output dir")
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+# only inference for models trained with csmsc now
+def main():
+    args = parse_args()
+
+    # frontend
+    frontend = get_frontend(
+        lang=args.lang,
+        phones_dict=args.phones_dict,
+        tones_dict=args.tones_dict)
+
+    # am_predictor
+    am_predictor = get_lite_predictor(
+        model_dir=args.inference_dir, model_file=args.am + "_x86.nb")
+    # model: {model_name}_{dataset}
+    am_dataset = args.am[args.am.rindex('_') + 1:]
+
+    # voc_predictor
+    voc_predictor = get_lite_predictor(
+        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+    merge_sentences = True
+    fs = 24000 if am_dataset != 'ljspeech' else 22050
+    # warmup
+    for utt_id, sentence in sentences[:3]:
+        with timer() as t:
+            mel = get_lite_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id, )
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+    print("warm up done!")
+
+    N = 0
+    T = 0
+    for utt_id, sentence in sentences:
+        with timer() as t:
+            mel = get_lite_am_output(
+                input=sentence,
+                am_predictor=am_predictor,
+                am=args.am,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences,
+                speaker_dict=args.speaker_dict,
+                spk_id=args.spk_id, )
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+
+        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+
+        print(f"{utt_id} done!")
+    print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/lite_predict_streaming.py b/paddlespeech/t2s/exps/lite_predict_streaming.py
new file mode 100755
index 000000000..37b600512
--- /dev/null
+++ b/paddlespeech/t2s/exps/lite_predict_streaming.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import numpy as np
+import soundfile as sf
+from timer import timer
+
+from paddlespeech.t2s.exps.syn_utils import denorm
+from paddlespeech.t2s.exps.syn_utils import get_chunks
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_lite_am_sublayer_output
+from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
+from paddlespeech.t2s.exps.syn_utils import get_lite_streaming_am_output
+from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import run_frontend
+from paddlespeech.t2s.utils import str2bool
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Paddle Lite inference with acoustic model & vocoder.")
+    # acoustic model
+    parser.add_argument(
+        '--am',
+        type=str,
+        default='fastspeech2_csmsc',
+        choices=['fastspeech2_csmsc'],
+        help='Choose acoustic model type of tts task.')
+    parser.add_argument(
+        "--am_stat",
+        type=str,
+        default=None,
+        help="mean and standard deviation used to normalize spectrogram when training acoustic model."
+    )
+    parser.add_argument(
+        "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+    parser.add_argument(
+        "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+    parser.add_argument(
+        "--speaker_dict", type=str, default=None, help="speaker id map file.")
+    parser.add_argument(
+        '--spk_id',
+        type=int,
+        default=0,
+        help='spk id for multi speaker acoustic model')
+    # voc
+    parser.add_argument(
+        '--voc',
+        type=str,
+        default='pwgan_csmsc',
+        choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
+        help='Choose vocoder type of tts task.')
+    # other
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        help='Choose model language. zh or en')
+    parser.add_argument(
+        "--text",
+        type=str,
+        help="text to synthesize, a 'utt_id sentence' pair per line")
+    parser.add_argument(
+        "--inference_dir", type=str, help="dir to save inference models")
+    parser.add_argument("--output_dir", type=str, help="output dir")
+    # inference
+
+    # streaming related
+    parser.add_argument(
+        "--am_streaming",
+        type=str2bool,
+        default=False,
+        help="whether to use streaming acoustic model")
+    parser.add_argument(
+        "--block_size", type=int, default=42, help="block size of am streaming")
+    parser.add_argument(
+        "--pad_size", type=int, default=12, help="pad size of am streaming")
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+# only inference for models trained with csmsc now
+def main():
+    args = parse_args()
+
+    # frontend
+    frontend = get_frontend(
+        lang=args.lang,
+        phones_dict=args.phones_dict,
+        tones_dict=args.tones_dict)
+
+    # am_predictor
+    am_encoder_infer_predictor = get_lite_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + "_am_encoder_infer" + "_x86.nb")
+    am_decoder_predictor = get_lite_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + "_am_decoder" + "_x86.nb")
+    am_postnet_predictor = get_lite_predictor(
+        model_dir=args.inference_dir,
+        model_file=args.am + "_am_postnet" + "_x86.nb")
+    am_mu, am_std = np.load(args.am_stat)
+    # model: {model_name}_{dataset}
+    am_dataset = args.am[args.am.rindex('_') + 1:]
+
+    # voc_predictor
+    voc_predictor = get_lite_predictor(
+        model_dir=args.inference_dir, model_file=args.voc + "_x86.nb")
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+    merge_sentences = True
+
+    fs = 24000 if am_dataset != 'ljspeech' else 22050
+    # warmup
+    for utt_id, sentence in sentences[:3]:
+        with timer() as t:
+            normalized_mel = get_lite_streaming_am_output(
+                input=sentence,
+                am_encoder_infer_predictor=am_encoder_infer_predictor,
+                am_decoder_predictor=am_decoder_predictor,
+                am_postnet_predictor=am_postnet_predictor,
+                frontend=frontend,
+                lang=args.lang,
+                merge_sentences=merge_sentences, )
+            mel = denorm(normalized_mel, am_mu, am_std)
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
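+    # NOTE on the streaming branch below: every chunk returned by `get_chunks`
+    # carries `pad_size` frames of left/right context around a `block_size`
+    # frame block, and that context is sliced off again after the decoder and
+    # postnet run, before the mel chunks are concatenated. With the defaults
+    # (block_size=42, pad_size=12) a middle chunk is 12 + 42 + 12 = 66 frames
+    # in and 42 frames out.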
+        )
+
+    print("warm up done!")
+
+    N = 0
+    T = 0
+    block_size = args.block_size
+    pad_size = args.pad_size
+    get_tone_ids = False
+    for utt_id, sentence in sentences:
+        with timer() as t:
+            # frontend
+            frontend_dict = run_frontend(
+                frontend=frontend,
+                text=sentence,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids,
+                lang=args.lang)
+            phone_ids = frontend_dict['phone_ids']
+            phones = phone_ids[0].numpy()
+            # acoustic model
+            orig_hs = get_lite_am_sublayer_output(
+                am_encoder_infer_predictor, input=phones)
+
+            if args.am_streaming:
+                hss = get_chunks(orig_hs, block_size, pad_size)
+                chunk_num = len(hss)
+                mel_list = []
+                for i, hs in enumerate(hss):
+                    am_decoder_output = get_lite_am_sublayer_output(
+                        am_decoder_predictor, input=hs)
+                    am_postnet_output = get_lite_am_sublayer_output(
+                        am_postnet_predictor,
+                        input=np.transpose(am_decoder_output, (0, 2, 1)))
+                    am_output_data = am_decoder_output + np.transpose(
+                        am_postnet_output, (0, 2, 1))
+                    normalized_mel = am_output_data[0]
+
+                    sub_mel = denorm(normalized_mel, am_mu, am_std)
+                    # clip the padded part of the output
+                    if i == 0:
+                        sub_mel = sub_mel[:-pad_size]
+                    elif i == chunk_num - 1:
+                        # the last chunk is never fully padded on the right
+                        sub_mel = sub_mel[pad_size:]
+                    else:
+                        # chunks near the end may not be fully padded on the right either
+                        sub_mel = sub_mel[pad_size:(block_size + pad_size) -
+                                          sub_mel.shape[0]]
+                    mel_list.append(sub_mel)
+                mel = np.concatenate(mel_list, axis=0)
+
+            else:
+                am_decoder_output = get_lite_am_sublayer_output(
+                    am_decoder_predictor, input=orig_hs)
+                am_postnet_output = get_lite_am_sublayer_output(
+                    am_postnet_predictor,
+                    input=np.transpose(am_decoder_output, (0, 2, 1)))
+                am_output_data = am_decoder_output + np.transpose(
+                    am_postnet_output, (0, 2, 1))
+                normalized_mel = am_output_data[0]
+                mel = denorm(normalized_mel, am_mu, am_std)
+            # vocoder
+            wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
+
+        N += wav.size
+        T += t.elapse
+        speed = wav.size / t.elapse
+        rtf = fs / speed
+
+        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ ) + + print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ort_predict_e2e.py b/paddlespeech/t2s/exps/ort_predict_e2e.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/ort_predict_streaming.py b/paddlespeech/t2s/exps/ort_predict_streaming.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/sentences.txt b/paddlespeech/t2s/exps/sentences.txt old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/sentences_en.txt b/paddlespeech/t2s/exps/sentences_en.txt old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/sentences_mix.txt b/paddlespeech/t2s/exps/sentences_mix.txt old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/sentences_ssml.txt b/paddlespeech/t2s/exps/sentences_ssml.txt new file mode 100755 index 000000000..e3614f224 --- /dev/null +++ b/paddlespeech/t2s/exps/sentences_ssml.txt @@ -0,0 +1,10 @@ +0001 考古人员西布达拉宫里发现一个被隐的装有宝箱子。 +0002 有人询问中国银北京分行行长是否叫任我。 +0003 市委书记亲自领审计员对这家公司进行财务审计,发现企业的利润数据虚假。 +0004 学生们对代理解不深刻,特别是小点,在数数时容易弄错。 +0005 军从小学习武术,擅散打,大后参军,担任连。 +0006 我说她了工资,她就红着脸,摇头否认。 +0007 请把这封信交团长,告诉他,前线的供一定要有保障。 +0008 矿下的道,与北京四合院的小有点相似。 +0009 他常叹自己命,几亩田,种点。 +0010 小明对天相很有研究,在宿舍说了一宿有关星宿的常识。 \ No newline at end of file diff --git a/paddlespeech/t2s/exps/speedyspeech/__init__.py b/paddlespeech/t2s/exps/speedyspeech/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/speedyspeech/inference.py b/paddlespeech/t2s/exps/speedyspeech/inference.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/speedyspeech/normalize.py b/paddlespeech/t2s/exps/speedyspeech/normalize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/stream_play_tts.py b/paddlespeech/t2s/exps/stream_play_tts.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py old mode 100644 new mode 100755 index 15d8dfb78..cea125291 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
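A quick aside on the chunked decoding implemented in lite_predict_streaming.py above: with the default block_size=42 and pad_size=12, each chunk carries pad frames of left/right context that are decoded and then thrown away, and the trimming rules reassemble the sequence exactly. The sketch below is illustrative only; this get_chunks is a simplified stand-in for the helper of the same name in syn_utils.py, and the 100-frame input is made up.

import numpy as np


def get_chunks(x, block_size, pad_size):
    # simplified stand-in for syn_utils.get_chunks: windows of
    # (pad + block + pad) frames, clipped at the sequence boundaries
    chunks = []
    for start in range(0, x.shape[0], block_size):
        lo = max(0, start - pad_size)
        hi = min(x.shape[0], start + block_size + pad_size)
        chunks.append(x[lo:hi])
    return chunks


hs = np.arange(100).reshape(-1, 1)  # pretend encoder output: 100 frames
block_size, pad_size = 42, 12
chunks = get_chunks(hs, block_size, pad_size)
out = []
for i, sub in enumerate(chunks):
    if i == 0:
        # first chunk has no left pad, so only the right pad is dropped
        sub = sub[:-pad_size]
    elif i == len(chunks) - 1:
        # last chunk has no right pad, so only the left pad is dropped
        sub = sub[pad_size:]
    else:
        # middle chunks drop pad on both sides; for a fully padded chunk
        # this slice reduces to sub[pad_size:-pad_size]
        sub = sub[pad_size:(block_size + pad_size) - sub.shape[0]]
    out.append(sub)
mel = np.concatenate(out, axis=0)
assert (mel == hs).all()  # trimming reassembles all 100 frames exactly

The same slice expressions appear verbatim in the streaming loop above; only the decoder and postnet calls between chunking and trimming are elided here.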
 import math
 import os
+import re
 from pathlib import Path
 from typing import Any
 from typing import Dict
@@ -25,6 +26,8 @@ import paddle
 from paddle import inference
 from paddle import jit
 from paddle.static import InputSpec
+from paddlelite.lite import create_paddle_predictor
+from paddlelite.lite import MobileConfig
 from yacs.config import CfgNode

 from paddlespeech.t2s.datasets.data_table import DataTable
@@ -33,6 +36,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 from paddlespeech.utils.dynamic_import import dynamic_import
+
 # remove [W:onnxruntime: xxx] from ort
 ort.set_default_logger_severity(3)
@@ -103,14 +107,15 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
     sentences = []
     with open(text_file, 'rt') as f:
         for line in f:
-            items = line.strip().split()
-            utt_id = items[0]
-            if lang == 'zh':
-                sentence = "".join(items[1:])
-            elif lang == 'en':
-                sentence = " ".join(items[1:])
-            elif lang == 'mix':
-                sentence = " ".join(items[1:])
-            sentences.append((utt_id, sentence))
+            if line.strip() != "":
+                items = re.split(r"\s+", line.strip(), 1)
+                utt_id = items[0]
+                if lang == 'zh':
+                    sentence = "".join(items[1:])
+                elif lang == 'en':
+                    sentence = " ".join(items[1:])
+                elif lang == 'mix':
+                    sentence = " ".join(items[1:])
+                # append inside the guard so blank lines are skipped
+                sentences.append((utt_id, sentence))
     return sentences
@@ -180,11 +185,20 @@ def run_frontend(frontend: object,
                  to_tensor: bool=True):
     outs = dict()
     if lang == 'zh':
-        input_ids = frontend.get_input_ids(
-            text,
-            merge_sentences=merge_sentences,
-            get_tone_ids=get_tone_ids,
-            to_tensor=to_tensor)
+        input_ids = {}
+        if text.strip() != "" and re.match(r".*?<speak>.*?</speak>.*", text,
+                                           re.DOTALL):
+            input_ids = frontend.get_input_ids_ssml(
+                text,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids,
+                to_tensor=to_tensor)
+        else:
+            input_ids = frontend.get_input_ids(
+                text,
+                merge_sentences=merge_sentences,
+                get_tone_ids=get_tone_ids,
+                to_tensor=to_tensor)
         phone_ids = input_ids["phone_ids"]
         if get_tone_ids:
             tone_ids = input_ids["tone_ids"]
@@ -498,3 +512,105 @@ def get_sess(model_path: Optional[os.PathLike],
     sess = ort.InferenceSession(
         model_path, providers=providers, sess_options=sess_options)
     return sess
+
+
+# Paddle-Lite
+def get_lite_predictor(model_dir: Optional[os.PathLike]=None,
+                       model_file: Optional[os.PathLike]=None,
+                       cpu_threads: int=1):
+    config = MobileConfig()
+    config.set_model_from_file(str(Path(model_dir) / model_file))
+    predictor = create_paddle_predictor(config)
+    return predictor
+
+
+def get_lite_am_output(
+        input: str,
+        am_predictor,
+        am: str,
+        frontend: object,
+        lang: str='zh',
+        merge_sentences: bool=True,
+        speaker_dict: Optional[os.PathLike]=None,
+        spk_id: int=0, ):
+    am_name = am[:am.rindex('_')]
+    am_dataset = am[am.rindex('_') + 1:]
+    get_spk_id = False
+    get_tone_ids = False
+    if am_name == 'speedyspeech':
+        get_tone_ids = True
+    if am_dataset in {"aishell3", "vctk", "mix"} and speaker_dict:
+        get_spk_id = True
+        spk_id = np.array([spk_id])
+
+    frontend_dict = run_frontend(
+        frontend=frontend,
+        text=input,
+        merge_sentences=merge_sentences,
+        get_tone_ids=get_tone_ids,
+        lang=lang)
+
+    if get_tone_ids:
+        tone_ids = frontend_dict['tone_ids']
+        tones = tone_ids[0].numpy()
+        tones_handle = am_predictor.get_input(1)
+        tones_handle.from_numpy(tones)
+
+    if get_spk_id:
+        spk_id_handle = am_predictor.get_input(1)
+        spk_id_handle.from_numpy(spk_id)
+    phone_ids = frontend_dict['phone_ids']
+    phones = phone_ids[0].numpy()
+
phones_handle = am_predictor.get_input(0) + phones_handle.from_numpy(phones) + am_predictor.run() + am_output_handle = am_predictor.get_output(0) + am_output_data = am_output_handle.numpy() + return am_output_data + + +def get_lite_voc_output(voc_predictor, input): + mel_handle = voc_predictor.get_input(0) + mel_handle.from_numpy(input) + voc_predictor.run() + voc_output_handle = voc_predictor.get_output(0) + wav = voc_output_handle.numpy() + return wav + + +def get_lite_am_sublayer_output(am_sublayer_predictor, input): + input_handle = am_sublayer_predictor.get_input(0) + input_handle.from_numpy(input) + + am_sublayer_predictor.run() + am_sublayer_handle = am_sublayer_predictor.get_output(0) + am_sublayer_output = am_sublayer_handle.numpy() + return am_sublayer_output + + +def get_lite_streaming_am_output(input: str, + am_encoder_infer_predictor, + am_decoder_predictor, + am_postnet_predictor, + frontend, + lang: str='zh', + merge_sentences: bool=True): + get_tone_ids = False + frontend_dict = run_frontend( + frontend=frontend, + text=input, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids, + lang=lang) + phone_ids = frontend_dict['phone_ids'] + phones = phone_ids[0].numpy() + am_encoder_infer_output = get_lite_am_sublayer_output( + am_encoder_infer_predictor, input=phones) + am_decoder_output = get_lite_am_sublayer_output( + am_decoder_predictor, input=am_encoder_infer_output) + am_postnet_output = get_lite_am_sublayer_output( + am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1))) + am_output_data = am_decoder_output + np.transpose(am_postnet_output, + (0, 2, 1)) + normalized_mel = am_output_data[0] + return normalized_mel diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/synthesize_streaming.py b/paddlespeech/t2s/exps/synthesize_streaming.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/tacotron2/__init__.py b/paddlespeech/t2s/exps/tacotron2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/transformer_tts/__init__.py b/paddlespeech/t2s/exps/transformer_tts/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/transformer_tts/normalize.py b/paddlespeech/t2s/exps/transformer_tts/normalize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize.py b/paddlespeech/t2s/exps/transformer_tts/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/__init__.py b/paddlespeech/t2s/exps/vits/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/normalize.py 
b/paddlespeech/t2s/exps/vits/normalize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/synthesize.py b/paddlespeech/t2s/exps/vits/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/synthesize_e2e.py b/paddlespeech/t2s/exps/vits/synthesize_e2e.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/vits/voice_cloning.py b/paddlespeech/t2s/exps/vits/voice_cloning.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/waveflow/__init__.py b/paddlespeech/t2s/exps/waveflow/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/waveflow/config.py b/paddlespeech/t2s/exps/waveflow/config.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/waveflow/ljspeech.py b/paddlespeech/t2s/exps/waveflow/ljspeech.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/waveflow/preprocess.py b/paddlespeech/t2s/exps/waveflow/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/waveflow/synthesize.py b/paddlespeech/t2s/exps/waveflow/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/waveflow/train.py b/paddlespeech/t2s/exps/waveflow/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/wavernn/__init__.py b/paddlespeech/t2s/exps/wavernn/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/__init__.py b/paddlespeech/t2s/frontend/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py old mode 100644 new mode 100755 index ad32c4050..47c26a610 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -100,7 +100,7 @@ class G2PWOnnxConverter: ] self.non_polyphonic = { '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊' + '肖', '瘙', '誒', '泊', '听' } self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ @@ -210,7 +210,8 @@ class G2PWOnnxConverter: for sent_id, sent in enumerate(sentences): # pypinyin works well for Simplified Chinese than Traditional Chinese sent_s = tranditional_to_simplified(sent) - pypinyin_result = pinyin(sent_s, style=Style.TONE3) + pypinyin_result = pinyin( + sent_s, neutral_tone_with_five=True, style=Style.TONE3) partial_result = [None] * len(sent) for i, char in enumerate(sent): if char in self.polyphonic_chars_new: diff --git a/paddlespeech/t2s/frontend/g2pw/utils.py 
b/paddlespeech/t2s/frontend/g2pw/utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/generate_lexicon.py b/paddlespeech/t2s/frontend/generate_lexicon.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/normalizer/__init__.py b/paddlespeech/t2s/frontend/normalizer/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/normalizer/abbrrviation.py b/paddlespeech/t2s/frontend/normalizer/abbrrviation.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/normalizer/acronyms.py b/paddlespeech/t2s/frontend/normalizer/acronyms.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/normalizer/normalizer.py b/paddlespeech/t2s/frontend/normalizer/normalizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/normalizer/numbers.py b/paddlespeech/t2s/frontend/normalizer/numbers.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/normalizer/width.py b/paddlespeech/t2s/frontend/normalizer/width.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/punctuation.py b/paddlespeech/t2s/frontend/punctuation.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/vocab.py b/paddlespeech/t2s/frontend/vocab.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py old mode 100644 new mode 100755 index 722eed601..e30286986 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -13,6 +13,7 @@ # limitations under the License. import os import re +from operator import itemgetter from typing import Dict from typing import List @@ -31,6 +32,7 @@ from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor INITIALS = [ 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh', @@ -81,6 +83,7 @@ class Frontend(): g2p_model="g2pW", phone_vocab_path=None, tone_vocab_path=None): + self.mix_ssml_processor = MixTextProcessor() self.tone_modifier = ToneSandhi() self.text_normalizer = TextNormalizer() self.punc = ":,;。?!“”‘’':,;.?!" 
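Before the _g2p_assign hunk below: that helper's core job is to turn a user-specified pinyin syllable into the (initial, final) phone pair the acoustic model expects, falling back to the raw token for punctuation. A rough sketch of the lookup it performs; the two pinyin2phone entries and the assign_pinyin helper name here are illustrative stand-ins, not the real table or API.

# pinyin2phone maps a toned pinyin syllable to "sheng_mu yun_mu";
# the two entries below are made up for illustration
pinyin2phone = {"zhong1": "zh ong1", "ang4": "ang4"}


def assign_pinyin(pinyin):
    pinyin = pinyin.replace("u:", "v")  # normalize u: spellings to v
    if pinyin in pinyin2phone:
        parts = pinyin2phone[pinyin].split(" ")
        if len(parts) == 2:
            return parts[0], parts[1]  # initial + final, e.g. ('zh', 'ong1')
        return '', parts[0]  # zero-initial syllable: empty initial
    # not a pinyin syllable (e.g. punctuation): pass it through
    return pinyin, pinyin


print(assign_pinyin("zhong1"))  # ('zh', 'ong1')
print(assign_pinyin("ang4"))    # ('', 'ang4')
print(assign_pinyin(","))       # (',', ',') -> later emitted as 'sp'

In _g2p_assign itself these pairs are accumulated per character and then interleaved into the phone sequence, with punctuation mapped to the pause token 'sp'.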
@@ -281,6 +284,65 @@ class Frontend():
             phones_list.append(merge_list)
         return phones_list

+    def _split_word_to_char(self, words):
+        res = []
+        for x in words:
+            res.append(x)
+        return res
+
+    # if using ssml and pinyin is specified, assign that pinyin to the words
+    def _g2p_assign(self,
+                    words: List[str],
+                    pinyin_spec: List[str],
+                    merge_sentences: bool=True) -> List[List[str]]:
+        phones_list = []
+        initials = []
+        finals = []
+
+        words = self._split_word_to_char(words[0])
+        for pinyin, char in zip(pinyin_spec, words):
+            sub_initials = []
+            sub_finals = []
+            pinyin = pinyin.replace("u:", "v")
+            # self.pinyin2phone is a dict mapping each pinyin syllable to its sheng_mu and yun_mu
+            if pinyin in self.pinyin2phone:
+                initial_final_list = self.pinyin2phone[pinyin].split(" ")
+                if len(initial_final_list) == 2:
+                    sub_initials.append(initial_final_list[0])
+                    sub_finals.append(initial_final_list[1])
+                elif len(initial_final_list) == 1:
+                    # zero-initial syllable: the single entry is the final
+                    sub_initials.append('')
+                    sub_finals.append(initial_final_list[0])
+            else:
+                # not a pinyin syllable (possibly punctuation), or no conversion is required
+                sub_initials.append(pinyin)
+                sub_finals.append(pinyin)
+            initials.append(sub_initials)
+            finals.append(sub_finals)
+
+        initials = sum(initials, [])
+        finals = sum(finals, [])
+        phones = []
+        for c, v in zip(initials, finals):
+            # NOTE: post process for pypinyin outputs
+            # we discriminate i, ii and iii
+            if c and c not in self.punc:
+                phones.append(c)
+            if c and c in self.punc:
+                phones.append('sp')
+            if v and v not in self.punc:
+                phones.append(v)
+        phones_list.append(phones)
+        if merge_sentences:
+            merge_list = sum(phones_list, [])
+            # remove the last 'sp' to avoid noise at the end,
+            # since the training data has no trailing 'sp'
+            if merge_list[-1] == 'sp':
+                merge_list = merge_list[:-1]
+            phones_list = []
+            phones_list.append(merge_list)
+        return phones_list
+
     def _merge_erhua(self,
                      initials: List[str],
                      finals: List[str],
@@ -396,6 +458,52 @@ class Frontend():
             print("----------------------------")
         return phonemes

+    # added for ssml pinyin
+    def get_phonemes_ssml(self,
+                          ssml_inputs: list,
+                          merge_sentences: bool=True,
+                          with_erhua: bool=True,
+                          robot: bool=False,
+                          print_info: bool=False) -> List[List[str]]:
+        all_phonemes = []
+        for word_pinyin_item in ssml_inputs:
+            phonemes = []
+            sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
+            sentences = self.text_normalizer.normalize(sentence)
+            if len(pinyin_spec) == 0:
+                phonemes = self._g2p(
+                    sentences,
+                    merge_sentences=merge_sentences,
+                    with_erhua=with_erhua)
+            else:
+                # phonemes should follow pinyin_spec
+                phonemes = self._g2p_assign(
+                    sentences, pinyin_spec, merge_sentences=merge_sentences)
+
+            all_phonemes = all_phonemes + phonemes
+
+        if robot:
+            new_phonemes = []
+            for sentence in all_phonemes:
+                new_sentence = []
+                for item in sentence:
+                    # `er` only has tone `2`
+                    if item[-1] in "12345" and item != "er2":
+                        item = item[:-1] + "1"
+                    new_sentence.append(item)
+                new_phonemes.append(new_sentence)
+            all_phonemes = new_phonemes
+
+        if print_info:
+            print("----------------------------")
+            print("text norm results:")
+            print(sentences)
+            print("----------------------------")
+            print("g2p results:")
+            print(all_phonemes[0])
+            print("----------------------------")
+        return [sum(all_phonemes, [])]
+
     def get_input_ids(self,
                       sentence: str,
                       merge_sentences: bool=True,
@@ -405,6 +513,7 @@ class Frontend():
                       add_blank: bool=False,
                       blank_token: str="<pad>",
                       to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+
         phonemes = self.get_phonemes(
             sentence,
             merge_sentences=merge_sentences,
@@ -437,3 +546,49 @@ class Frontend():
         if temp_phone_ids:
             result["phone_ids"] = temp_phone_ids
         return result
+
+    # added for ssml
+    def get_input_ids_ssml(
+            self,
+            sentence: str,
+            merge_sentences: bool=True,
+            get_tone_ids: bool=False,
+            robot: bool=False,
+            print_info: bool=False,
+            add_blank: bool=False,
+            blank_token: str="<pad>",
+            to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+
+        l_inputs = MixTextProcessor.get_pinyin_split(sentence)
+        phonemes = self.get_phonemes_ssml(
+            l_inputs,
+            merge_sentences=merge_sentences,
+            print_info=print_info,
+            robot=robot)
+        result = {}
+        phones = []
+        tones = []
+        temp_phone_ids = []
+        temp_tone_ids = []
+
+        for part_phonemes in phonemes:
+            phones, tones = self._get_phone_tone(
+                part_phonemes, get_tone_ids=get_tone_ids)
+            if add_blank:
+                phones = insert_after_character(phones, blank_token)
+            if tones:
+                tone_ids = self._t2id(tones)
+                if to_tensor:
+                    tone_ids = paddle.to_tensor(tone_ids)
+                temp_tone_ids.append(tone_ids)
+            if phones:
+                phone_ids = self._p2id(phones)
+                # if paddle.to_tensor() is used with onnxruntime, the first call will be too slow
+                if to_tensor:
+                    phone_ids = paddle.to_tensor(phone_ids)
+                temp_phone_ids.append(phone_ids)
+        if temp_tone_ids:
+            result["tone_ids"] = temp_tone_ids
+        if temp_phone_ids:
+            result["phone_ids"] = temp_phone_ids
+        return result
diff --git a/paddlespeech/t2s/frontend/zh_normalization/README.md b/paddlespeech/t2s/frontend/zh_normalization/README.md
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/frontend/zh_normalization/__init__.py b/paddlespeech/t2s/frontend/zh_normalization/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/frontend/zh_normalization/char_convert.py b/paddlespeech/t2s/frontend/zh_normalization/char_convert.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/frontend/zh_normalization/constants.py b/paddlespeech/t2s/frontend/zh_normalization/constants.py
old mode 100644
new mode 100755
index 5d2b0b34e..6423ad74a
--- a/paddlespeech/t2s/frontend/zh_normalization/constants.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/constants.py
@@ -19,7 +19,7 @@ from pypinyin.constants import SUPPORT_UCS4
 # 全角半角转换
 # 英文字符全角 -> 半角映射表 (num: 52)
 F2H_ASCII_LETTERS = {
-    chr(ord(char) + 65248): char
+    ord(char) + 65248: ord(char)
     for char in string.ascii_letters
 }
@@ -27,12 +27,12 @@ F2H_ASCII_LETTERS = {
 H2F_ASCII_LETTERS = {value: key for key, value in F2H_ASCII_LETTERS.items()}

 # 数字字符全角 -> 半角映射表 (num: 10)
-F2H_DIGITS = {chr(ord(char) + 65248): char for char in string.digits}
+F2H_DIGITS = {ord(char) + 65248: ord(char) for char in string.digits}
 # 数字字符半角 -> 全角映射表
 H2F_DIGITS = {value: key for key, value in F2H_DIGITS.items()}

 # 标点符号全角 -> 半角映射表 (num: 32)
-F2H_PUNCTUATIONS = {chr(ord(char) + 65248): char for char in string.punctuation}
+F2H_PUNCTUATIONS = {ord(char) + 65248: ord(char) for char in string.punctuation}
 # 标点符号半角 -> 全角映射表
 H2F_PUNCTUATIONS = {value: key for key, value in F2H_PUNCTUATIONS.items()}
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
old mode 100644
new mode 100755
diff --git
a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
old mode 100644
new mode 100755
index 268d7229b..598030e43
--- a/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/quantifier.py
@@ -18,6 +18,25 @@ from .num import num2str

 # 温度表达式,温度会影响负号的读法
 # -3°C 零下三度
 RE_TEMPERATURE = re.compile(r'(-?)(\d+(\.\d+)?)(°C|℃|度|摄氏度)')
+measure_dict = {
+    "cm2": "平方厘米",
+    "cm²": "平方厘米",
+    "cm3": "立方厘米",
+    "cm³": "立方厘米",
+    "cm": "厘米",
+    "db": "分贝",
+    "ms": "毫秒",
+    "kg": "千克",
+    "km": "千米",
+    "m2": "平方米",
+    "m²": "平方米",
+    "m³": "立方米",
+    "m3": "立方米",
+    "ml": "毫升",
+    "mm": "毫米",
+    "m": "米",
+    "s": "秒"
+}


 def replace_temperature(match) -> str:
@@ -35,3 +54,10 @@ def replace_temperature(match) -> str:
     unit: str = "摄氏度" if unit == "摄氏度" else "度"
     result = f"{sign}{temperature}{unit}"
     return result
+
+
+def replace_measure(sentence) -> str:
+    # NOTE: plain substring replacement, so longer units must be listed
+    # before their prefixes in measure_dict (e.g. "mm" before "m")
+    for q_notation in measure_dict:
+        if q_notation in sentence:
+            sentence = sentence.replace(q_notation, measure_dict[q_notation])
+    return sentence
diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
old mode 100644
new mode 100755
index bc663c70d..1942e6661
--- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py
@@ -46,6 +46,7 @@ from .phonecode import RE_TELEPHONE
 from .phonecode import replace_mobile
 from .phonecode import replace_phone
 from .quantifier import RE_TEMPERATURE
+from .quantifier import replace_measure
 from .quantifier import replace_temperature
@@ -73,6 +74,17 @@ class TextNormalizer():
     def _post_replace(self, sentence: str) -> str:
         sentence = sentence.replace('/', '每')
         sentence = sentence.replace('~', '至')
+        sentence = sentence.replace('~', '至')
+        sentence = sentence.replace('①', '一')
+        sentence = sentence.replace('②', '二')
+        sentence = sentence.replace('③', '三')
+        sentence = sentence.replace('④', '四')
+        sentence = sentence.replace('⑤', '五')
+        sentence = sentence.replace('⑥', '六')
+        sentence = sentence.replace('⑦', '七')
+        sentence = sentence.replace('⑧', '八')
+        sentence = sentence.replace('⑨', '九')
+        sentence = sentence.replace('⑩', '十')
         return sentence
@@ -91,6 +103,7 @@ class TextNormalizer():
         sentence = RE_TIME.sub(replace_time, sentence)

         sentence = RE_TEMPERATURE.sub(replace_temperature, sentence)
+        sentence = replace_measure(sentence)
         sentence = RE_FRAC.sub(replace_frac, sentence)
         sentence = RE_PERCENTAGE.sub(replace_percentage, sentence)
         sentence = RE_MOBILE_PHONE.sub(replace_mobile, sentence)
diff --git a/paddlespeech/t2s/models/__init__.py b/paddlespeech/t2s/models/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/ernie_sat/__init__.py b/paddlespeech/t2s/models/ernie_sat/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/fastspeech2/__init__.py b/paddlespeech/t2s/models/fastspeech2/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
old mode 100644
new mode 100755
index 9905765db..0eb44beb6
---
a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -25,6 +25,8 @@ import paddle.nn.functional as F from paddle import nn from typeguard import check_argument_types +from paddlespeech.t2s.modules.adversarial_loss.gradient_reversal import GradientReversalLayer +from paddlespeech.t2s.modules.adversarial_loss.speaker_classifier import SpeakerClassifier from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.nets_utils import make_pad_mask @@ -138,7 +140,10 @@ class FastSpeech2(nn.Layer): # training related init_type: str="xavier_uniform", init_enc_alpha: float=1.0, - init_dec_alpha: float=1.0, ): + init_dec_alpha: float=1.0, + # speaker classifier + enable_speaker_classifier: bool=False, + hidden_sc_dim: int=256, ): """Initialize FastSpeech2 module. Args: idim (int): @@ -268,6 +273,10 @@ class FastSpeech2(nn.Layer): Initial value of alpha in scaled pos encoding of the encoder. init_dec_alpha (float): Initial value of alpha in scaled pos encoding of the decoder. + enable_speaker_classifier (bool): + Whether to use speaker classifier module + hidden_sc_dim (int): + The hidden layer dim of speaker classifier """ assert check_argument_types() @@ -281,6 +290,9 @@ class FastSpeech2(nn.Layer): self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor self.use_scaled_pos_enc = use_scaled_pos_enc + self.hidden_sc_dim = hidden_sc_dim + self.spk_num = spk_num + self.enable_speaker_classifier = enable_speaker_classifier self.spk_embed_dim = spk_embed_dim if self.spk_embed_dim is not None: @@ -373,6 +385,12 @@ class FastSpeech2(nn.Layer): self.tone_projection = nn.Linear(adim + self.tone_embed_dim, adim) + if self.spk_num and self.enable_speaker_classifier: + # set lambda = 1 + self.grad_reverse = GradientReversalLayer(1) + self.speaker_classifier = SpeakerClassifier( + idim=adim, hidden_sc_dim=self.hidden_sc_dim, spk_num=spk_num) + # define duration predictor self.duration_predictor = DurationPredictor( idim=adim, @@ -547,7 +565,7 @@ class FastSpeech2(nn.Layer): if tone_id is not None: tone_id = paddle.cast(tone_id, 'int64') # forward propagation - before_outs, after_outs, d_outs, p_outs, e_outs = self._forward( + before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward( xs, ilens, olens, @@ -564,7 +582,7 @@ class FastSpeech2(nn.Layer): max_olen = max(olens) ys = ys[:, :max_olen] - return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens + return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits def _forward(self, xs: paddle.Tensor, @@ -584,6 +602,12 @@ class FastSpeech2(nn.Layer): # (B, Tmax, adim) hs, _ = self.encoder(xs, x_masks) + if self.spk_num and self.enable_speaker_classifier and not is_inference: + hs_for_spk_cls = self.grad_reverse(hs) + spk_logits = self.speaker_classifier(hs_for_spk_cls, ilens) + else: + spk_logits = None + # integrate speaker embedding if self.spk_embed_dim is not None: # spk_emb has a higher priority than spk_id @@ -676,7 +700,7 @@ class FastSpeech2(nn.Layer): after_outs = before_outs + self.postnet( before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - return before_outs, after_outs, d_outs, p_outs, e_outs + return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits def encoder_infer( self, @@ -771,7 +795,7 @@ class FastSpeech2(nn.Layer): es = e.unsqueeze(0) 
if e is not None else None # (1, L, odim) - _, outs, d_outs, p_outs, e_outs = self._forward( + _, outs, d_outs, p_outs, e_outs, _ = self._forward( xs, ilens, ds=ds, @@ -783,7 +807,7 @@ class FastSpeech2(nn.Layer): is_inference=True) else: # (1, L, odim) - _, outs, d_outs, p_outs, e_outs = self._forward( + _, outs, d_outs, p_outs, e_outs, _ = self._forward( xs, ilens, is_inference=True, @@ -791,6 +815,7 @@ class FastSpeech2(nn.Layer): spk_emb=spk_emb, spk_id=spk_id, tone_id=tone_id) + return outs[0], d_outs[0], p_outs[0], e_outs[0] def _integrate_with_spk_embed(self, hs, spk_emb): @@ -1058,6 +1083,7 @@ class FastSpeech2Loss(nn.Layer): self.l1_criterion = nn.L1Loss(reduction=reduction) self.mse_criterion = nn.MSELoss(reduction=reduction) self.duration_criterion = DurationPredictorLoss(reduction=reduction) + self.ce_criterion = nn.CrossEntropyLoss() def forward( self, @@ -1072,7 +1098,10 @@ class FastSpeech2Loss(nn.Layer): es: paddle.Tensor, ilens: paddle.Tensor, olens: paddle.Tensor, - ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: + spk_logits: paddle.Tensor=None, + spk_ids: paddle.Tensor=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, + paddle.Tensor, ]: """Calculate forward propagation. Args: @@ -1098,11 +1127,18 @@ class FastSpeech2Loss(nn.Layer): Batch of the lengths of each input (B,). olens(Tensor): Batch of the lengths of each target (B,). + spk_logits(Option[Tensor]): + Batch of outputs after speaker classifier (B, Lmax, num_spk) + spk_ids(Option[Tensor]): + Batch of target spk_id (B,) + Returns: """ + speaker_loss = 0.0 + # apply mask to remove padded part if self.use_masking: out_masks = make_non_pad_mask(olens).unsqueeze(-1) @@ -1124,6 +1160,16 @@ class FastSpeech2Loss(nn.Layer): ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape)) es = es.masked_select(pitch_masks.broadcast_to(es.shape)) + if spk_logits is not None and spk_ids is not None: + batch_size = spk_ids.shape[0] + spk_ids = paddle.repeat_interleave(spk_ids, spk_logits.shape[1], + None) + spk_logits = paddle.reshape(spk_logits, + [-1, spk_logits.shape[-1]]) + mask_index = spk_logits.abs().sum(axis=1) != 0 + spk_ids = spk_ids[mask_index] + spk_logits = spk_logits[mask_index] + # calculate loss l1_loss = self.l1_criterion(before_outs, ys) if after_outs is not None: @@ -1132,6 +1178,9 @@ class FastSpeech2Loss(nn.Layer): pitch_loss = self.mse_criterion(p_outs, ps) energy_loss = self.mse_criterion(e_outs, es) + if spk_logits is not None and spk_ids is not None: + speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size + # make weighted mask and apply it if self.use_weighted_masking: out_masks = make_non_pad_mask(olens).unsqueeze(-1) @@ -1161,4 +1210,4 @@ class FastSpeech2Loss(nn.Layer): energy_loss = energy_loss.masked_select( pitch_masks.broadcast_to(energy_loss.shape)).sum() - return l1_loss, duration_loss, pitch_loss, energy_loss + return l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py old mode 100644 new mode 100755 index 92aa9dfc7..b398267e6 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py @@ -14,6 +14,7 @@ import logging from pathlib import Path +from paddle import DataParallel from paddle import distributed as dist from paddle.io import DataLoader from paddle.nn import Layer @@ -23,6 +24,7 @@ from 
paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater + logging.basicConfig( format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s', datefmt='[%Y-%m-%d %H:%M:%S]') @@ -31,24 +33,30 @@ logger.setLevel(logging.INFO) class FastSpeech2Updater(StandardUpdater): - def __init__(self, - model: Layer, - optimizer: Optimizer, - dataloader: DataLoader, - init_state=None, - use_masking: bool=False, - use_weighted_masking: bool=False, - output_dir: Path=None): + def __init__( + self, + model: Layer, + optimizer: Optimizer, + dataloader: DataLoader, + init_state=None, + use_masking: bool=False, + spk_loss_scale: float=0.02, + use_weighted_masking: bool=False, + output_dir: Path=None, + enable_spk_cls: bool=False, ): super().__init__(model, optimizer, dataloader, init_state=None) self.criterion = FastSpeech2Loss( - use_masking=use_masking, use_weighted_masking=use_weighted_masking) + use_masking=use_masking, + use_weighted_masking=use_weighted_masking, ) log_file = output_dir / 'worker_{}.log'.format(dist.get_rank()) self.filehandler = logging.FileHandler(str(log_file)) logger.addHandler(self.filehandler) self.logger = logger self.msg = "" + self.spk_loss_scale = spk_loss_scale + self.enable_spk_cls = enable_spk_cls def update_core(self, batch): self.msg = "Rank: {}, ".format(dist.get_rank()) @@ -60,18 +68,33 @@ class FastSpeech2Updater(StandardUpdater): if spk_emb is not None: spk_id = None - before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model( - text=batch["text"], - text_lengths=batch["text_lengths"], - speech=batch["speech"], - speech_lengths=batch["speech_lengths"], - durations=batch["durations"], - pitch=batch["pitch"], - energy=batch["energy"], - spk_id=spk_id, - spk_emb=spk_emb) - - l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion( + if type( + self.model + ) == DataParallel and self.model._layers.spk_num and self.model._layers.enable_speaker_classifier: + with self.model.no_sync(): + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb) + else: + before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model( + text=batch["text"], + text_lengths=batch["text_lengths"], + speech=batch["speech"], + speech_lengths=batch["speech_lengths"], + durations=batch["durations"], + pitch=batch["pitch"], + energy=batch["energy"], + spk_id=spk_id, + spk_emb=spk_emb) + + l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion( after_outs=after_outs, before_outs=before_outs, d_outs=d_outs, @@ -82,9 +105,12 @@ class FastSpeech2Updater(StandardUpdater): ps=batch["pitch"], es=batch["energy"], ilens=batch["text_lengths"], - olens=olens) + olens=olens, + spk_logits=spk_logits, + spk_ids=spk_id, ) - loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss = self.spk_loss_scale * speaker_loss + loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss optimizer = self.optimizer optimizer.clear_grad() @@ -96,11 +122,18 @@ class FastSpeech2Updater(StandardUpdater): report("train/duration_loss", 
float(duration_loss))
         report("train/pitch_loss", float(pitch_loss))
         report("train/energy_loss", float(energy_loss))
+        if self.enable_spk_cls:
+            report("train/speaker_loss", float(speaker_loss))
+            report("train/scaled_speaker_loss", float(scaled_speaker_loss))

         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["duration_loss"] = float(duration_loss)
         losses_dict["pitch_loss"] = float(pitch_loss)
         losses_dict["energy_loss"] = float(energy_loss)
+        if self.enable_spk_cls:
+            losses_dict["speaker_loss"] = float(speaker_loss)
+            losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
@@ -112,7 +145,9 @@ class FastSpeech2Evaluator(StandardEvaluator):
                  dataloader: DataLoader,
                  use_masking: bool=False,
                  use_weighted_masking: bool=False,
-                 output_dir: Path=None):
+                 spk_loss_scale: float=0.02,
+                 output_dir: Path=None,
+                 enable_spk_cls: bool=False):
         super().__init__(model, dataloader)

         log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
@@ -120,6 +155,8 @@ class FastSpeech2Evaluator(StandardEvaluator):
         logger.addHandler(self.filehandler)
         self.logger = logger
         self.msg = ""
+        self.spk_loss_scale = spk_loss_scale
+        self.enable_spk_cls = enable_spk_cls
         self.criterion = FastSpeech2Loss(
             use_masking=use_masking, use_weighted_masking=use_weighted_masking)
@@ -133,18 +170,33 @@ class FastSpeech2Evaluator(StandardEvaluator):
         if spk_emb is not None:
             spk_id = None

-        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens = self.model(
-            text=batch["text"],
-            text_lengths=batch["text_lengths"],
-            speech=batch["speech"],
-            speech_lengths=batch["speech_lengths"],
-            durations=batch["durations"],
-            pitch=batch["pitch"],
-            energy=batch["energy"],
-            spk_id=spk_id,
-            spk_emb=spk_emb)
-
-        l1_loss, duration_loss, pitch_loss, energy_loss = self.criterion(
+        if type(
+                self.model
+        ) == DataParallel and self.model._layers.spk_num and self.model._layers.enable_speaker_classifier:
+            with self.model.no_sync():
+                before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
+                    text=batch["text"],
+                    text_lengths=batch["text_lengths"],
+                    speech=batch["speech"],
+                    speech_lengths=batch["speech_lengths"],
+                    durations=batch["durations"],
+                    pitch=batch["pitch"],
+                    energy=batch["energy"],
+                    spk_id=spk_id,
+                    spk_emb=spk_emb)
+        else:
+            before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
+                text=batch["text"],
+                text_lengths=batch["text_lengths"],
+                speech=batch["speech"],
+                speech_lengths=batch["speech_lengths"],
+                durations=batch["durations"],
+                pitch=batch["pitch"],
+                energy=batch["energy"],
+                spk_id=spk_id,
+                spk_emb=spk_emb)
+
+        l1_loss, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion(
             after_outs=after_outs,
             before_outs=before_outs,
             d_outs=d_outs,
@@ -155,19 +207,29 @@ class FastSpeech2Evaluator(StandardEvaluator):
             ps=batch["pitch"],
             es=batch["energy"],
             ilens=batch["text_lengths"],
-            olens=olens, )
-        loss = l1_loss + duration_loss + pitch_loss + energy_loss
+            olens=olens,
+            spk_logits=spk_logits,
+            spk_ids=spk_id, )
+
+        scaled_speaker_loss = self.spk_loss_scale * speaker_loss
+        loss = l1_loss + duration_loss + pitch_loss + energy_loss + scaled_speaker_loss

         report("eval/loss", float(loss))
         report("eval/l1_loss", float(l1_loss))
         report("eval/duration_loss", float(duration_loss))
         report("eval/pitch_loss", float(pitch_loss))
         report("eval/energy_loss", float(energy_loss))
+        if self.enable_spk_cls:
+            report("eval/speaker_loss", float(speaker_loss))
+            report("eval/scaled_speaker_loss", float(scaled_speaker_loss))

         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["duration_loss"] = float(duration_loss)
         losses_dict["pitch_loss"] = float(pitch_loss)
         losses_dict["energy_loss"] = float(energy_loss)
+        if self.enable_spk_cls:
+            losses_dict["speaker_loss"] = float(speaker_loss)
+            losses_dict["scaled_speaker_loss"] = float(scaled_speaker_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
diff --git a/paddlespeech/t2s/models/hifigan/__init__.py b/paddlespeech/t2s/models/hifigan/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/hifigan/hifigan_updater.py b/paddlespeech/t2s/models/hifigan/hifigan_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/melgan/__init__.py b/paddlespeech/t2s/models/melgan/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py b/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/melgan/style_melgan.py b/paddlespeech/t2s/models/melgan/style_melgan.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/melgan/style_melgan_updater.py b/paddlespeech/t2s/models/melgan/style_melgan_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/parallel_wavegan/__init__.py b/paddlespeech/t2s/models/parallel_wavegan/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/speedyspeech/__init__.py b/paddlespeech/t2s/models/speedyspeech/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/tacotron2/__init__.py b/paddlespeech/t2s/models/tacotron2/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/transformer_tts/__init__.py b/paddlespeech/t2s/models/transformer_tts/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/__init__.py b/paddlespeech/t2s/models/vits/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/monotonic_align/__init__.py b/paddlespeech/t2s/models/vits/monotonic_align/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/monotonic_align/core.pyx b/paddlespeech/t2s/models/vits/monotonic_align/core.pyx old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/monotonic_align/setup.py b/paddlespeech/t2s/models/vits/monotonic_align/setup.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/posterior_encoder.py b/paddlespeech/t2s/models/vits/posterior_encoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/residual_coupling.py b/paddlespeech/t2s/models/vits/residual_coupling.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/transform.py b/paddlespeech/t2s/models/vits/transform.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/vits_updater.py b/paddlespeech/t2s/models/vits/vits_updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/wavenet/__init__.py b/paddlespeech/t2s/models/vits/wavenet/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/wavenet/residual_block.py b/paddlespeech/t2s/models/vits/wavenet/residual_block.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/vits/wavenet/wavenet.py b/paddlespeech/t2s/models/vits/wavenet/wavenet.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/wavernn/__init__.py b/paddlespeech/t2s/models/wavernn/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/models/wavernn/wavernn_updater.py b/paddlespeech/t2s/models/wavernn/wavernn_updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/__init__.py b/paddlespeech/t2s/modules/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/activation.py b/paddlespeech/t2s/modules/activation.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/adversarial_loss/__init__.py b/paddlespeech/t2s/modules/adversarial_loss/__init__.py new file mode 100755 index 000000000..abf198b97 --- /dev/null +++ b/paddlespeech/t2s/modules/adversarial_loss/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py b/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py new file mode 100755 index 000000000..64da16053 --- /dev/null +++ b/paddlespeech/t2s/modules/adversarial_loss/gradient_reversal.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +from paddle.autograd import PyLayer + + +class GradientReversalFunction(PyLayer): + """Gradient Reversal Layer from: + Unsupervised Domain Adaptation by Backpropagation (Ganin & Lempitsky, 2015) + + Forward pass is the identity function. In the backward pass, + the upstream gradients are multiplied by -lambda (i.e. gradient is reversed) + """ + + @staticmethod + def forward(ctx, x, lambda_=1): + """Forward in networks + """ + ctx.save_for_backward(lambda_) + return x.clone() + + @staticmethod + def backward(ctx, grads): + """Backward in networks + """ + lambda_, = ctx.saved_tensor() + dx = -lambda_ * grads + return paddle.clip(dx, min=-0.5, max=0.5) + + +class GradientReversalLayer(nn.Layer): + """Gradient Reversal Layer from: + Unsupervised Domain Adaptation by Backpropagation (Ganin & Lempitsky, 2015) + + Forward pass is the identity function. In the backward pass, + the upstream gradients are multiplied by -lambda (i.e. gradient is reversed) + """ + + def __init__(self, lambda_=1): + super(GradientReversalLayer, self).__init__() + self.lambda_ = lambda_ + + def forward(self, x): + """Forward in networks + """ + return GradientReversalFunction.apply(x, self.lambda_) diff --git a/paddlespeech/t2s/modules/adversarial_loss/speaker_classifier.py b/paddlespeech/t2s/modules/adversarial_loss/speaker_classifier.py new file mode 100755 index 000000000..d731b2d27 --- /dev/null +++ b/paddlespeech/t2s/modules/adversarial_loss/speaker_classifier.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from Cross-Lingual-Voice-Cloning(https://github.com/deterministic-algorithms-lab/Cross-Lingual-Voice-Cloning) +import paddle +from paddle import nn +from typeguard import check_argument_types + + +class SpeakerClassifier(nn.Layer): + def __init__( + self, + idim: int, + hidden_sc_dim: int, + spk_num: int, ): + assert check_argument_types() + super().__init__() + # store hyperparameters + self.idim = idim + self.hidden_sc_dim = hidden_sc_dim + self.spk_num = spk_num + + self.model = nn.Sequential( + nn.Linear(self.idim, self.hidden_sc_dim), + nn.Linear(self.hidden_sc_dim, self.spk_num)) + + def parse_outputs(self, out, text_lengths): + mask = paddle.arange(out.shape[1]).expand( + [out.shape[0], out.shape[1]]) < text_lengths.unsqueeze(1) + out = paddle.transpose(out, perm=[2, 0, 1]) + out = out * mask + out = paddle.transpose(out, perm=[1, 2, 0]) + return out + + def forward(self, encoder_outputs, text_lengths): + """ + encoder_outputs = [batch_size, seq_len, encoder_embedding_size] + text_lengths = [batch_size] + + log probabilities of speaker classification = [batch_size, seq_len, spk_num] + """ + + out = self.model(encoder_outputs) + out = self.parse_outputs(out, text_lengths) + return out diff --git a/paddlespeech/t2s/modules/causal_conv.py b/paddlespeech/t2s/modules/causal_conv.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/conformer/__init__.py b/paddlespeech/t2s/modules/conformer/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/conformer/convolution.py b/paddlespeech/t2s/modules/conformer/convolution.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/conv.py b/paddlespeech/t2s/modules/conv.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/geometry.py b/paddlespeech/t2s/modules/geometry.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/layer_norm.py b/paddlespeech/t2s/modules/layer_norm.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/masked_fill.py b/paddlespeech/t2s/modules/masked_fill.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/normalizer.py b/paddlespeech/t2s/modules/normalizer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/positional_encoding.py b/paddlespeech/t2s/modules/positional_encoding.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/pqmf.py b/paddlespeech/t2s/modules/pqmf.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/predictor/__init__.py b/paddlespeech/t2s/modules/predictor/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py 
b/paddlespeech/t2s/modules/predictor/variance_predictor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/residual_block.py b/paddlespeech/t2s/modules/residual_block.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/residual_stack.py b/paddlespeech/t2s/modules/residual_stack.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/style_encoder.py b/paddlespeech/t2s/modules/style_encoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/tacotron2/__init__.py b/paddlespeech/t2s/modules/tacotron2/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/tacotron2/attentions.py b/paddlespeech/t2s/modules/tacotron2/attentions.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/tacotron2/decoder.py b/paddlespeech/t2s/modules/tacotron2/decoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/tacotron2/encoder.py b/paddlespeech/t2s/modules/tacotron2/encoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/tade_res_block.py b/paddlespeech/t2s/modules/tade_res_block.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/__init__.py b/paddlespeech/t2s/modules/transformer/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/decoder.py b/paddlespeech/t2s/modules/transformer/decoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/decoder_layer.py b/paddlespeech/t2s/modules/transformer/decoder_layer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/encoder_layer.py b/paddlespeech/t2s/modules/transformer/encoder_layer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/mask.py b/paddlespeech/t2s/modules/transformer/mask.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py b/paddlespeech/t2s/modules/transformer/positionwise_feed_forward.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/transformer/subsampling.py b/paddlespeech/t2s/modules/transformer/subsampling.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/modules/upsample.py b/paddlespeech/t2s/modules/upsample.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/ssml/__init__.py new file mode 100755 index 000000000..9b4db053b --- /dev/null +++ b/paddlespeech/t2s/ssml/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .xml_processor import *
diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py
new file mode 100755
index 000000000..b39121347
--- /dev/null
+++ b/paddlespeech/t2s/ssml/xml_processor.py
@@ -0,0 +1,158 @@
+# -*- coding: utf-8 -*-
+import re
+import xml.dom.minidom
+import xml.parsers.expat
+from xml.dom.minidom import Node
+from xml.dom.minidom import parseString
+'''
+Note: xml has five special characters: & < > " '
+Option 1: wrap a string containing special characters in a CDATA section, for example:
+    <![CDATA[ ... ]]>
+Option 2: represent the special characters with their XML escape sequences:
+    &amp;   &
+    &lt;    <
+    &gt;    >
+    &quot;  "
+    &apos;  '
+For example:
+    &quot;姓名&quot;
+'''
+
+
+class MixTextProcessor():
+    def __repr__(self):
+        return "MixTextProcessor()"
+
+    def get_xml_content(self, mixstr):
+        '''Return the xml (<speak> ... </speak>) part of the string.'''
+        xmlptn = re.compile(r"<speak>.*?</speak>", re.M | re.S)
+        ctn = re.search(xmlptn, mixstr)
+        if ctn:
+            return ctn.group(0)
+        else:
+            return None
+
+    def get_content_split(self, mixstr):
+        '''Split the text, in order, into a list of non-xml and xml segments,
+        keeping the corresponding strings with their punctuation.
+        Spaces must not be stripped, because tag attributes in the xml contain spaces.
+        '''
+        ctlist = []
+        # print("Testing:", mixstr[:20])
+        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
+        mat = re.match(patn, mixstr)
+        if mat:
+            pre_xml = mat.group(1)
+            in_xml = mat.group(2)
+            after_xml = mat.group(3)
+
+            ctlist.append(pre_xml)
+            ctlist.append(in_xml)
+            ctlist.append(after_xml)
+            return ctlist
+        else:
+            ctlist.append(mixstr)
+        return ctlist
+
+    @classmethod
+    def get_pinyin_split(cls, mixstr):
+        ctlist = []
+        patn = re.compile(r'(.*\s*?)(<speak>.*?</speak>)(.*\s*)$', re.M | re.S)
+        mat = re.match(patn, mixstr)
+        if mat:
+            pre_xml = mat.group(1)
+            in_xml = mat.group(2)
+            after_xml = mat.group(3)
+
+            ctlist.append([pre_xml, []])
+            dom = DomXml(in_xml)
+            pinyinlist = dom.get_pinyins_for_xml()
+            ctlist = ctlist + pinyinlist
+            ctlist.append([after_xml, []])
+        else:
+            ctlist.append([mixstr, []])
+        return ctlist
+
+
+class DomXml():
+    def __init__(self, xmlstr):
+        self.tdom = parseString(xmlstr)  # Document
+        self.root = self.tdom.documentElement  # Element
+        self.rnode = self.tdom.childNodes  # NodeList
+
+    def get_text(self):
+        '''Return a list of all text content in the xml.'''
+        res = []
+
+        for x1 in self.rnode:
+            if x1.nodeType == Node.TEXT_NODE:
+                res.append(x1.value)
+            else:
+                for x2 in x1.childNodes:
+                    if isinstance(x2, xml.dom.minidom.Text):
+                        res.append(x2.data)
+                    else:
+                        for x3 in x2.childNodes:
+                            if isinstance(x3, xml.dom.minidom.Text):
+                                res.append(x3.data)
+                            else:
+                                print("len(nodes of x3):", len(x3.childNodes))
+
+        return res
+
+    def get_xmlchild_list(self):
+        '''Return the xml content as a list of all text content (without tags).'''
+        res = []
+
+        for x1 in self.rnode:
+            if x1.nodeType == Node.TEXT_NODE:
+                res.append(x1.value)
+            else:
+                for x2 in x1.childNodes:
+                    if isinstance(x2, xml.dom.minidom.Text):
+                        res.append(x2.data)
+                    else:
+                        for x3 in x2.childNodes:
+                            if isinstance(x3, xml.dom.minidom.Text):
+                                res.append(x3.data)
+                            else:
+                                print("len(nodes of x3):", len(x3.childNodes))
+        print(res)
+        return res
+
+    def get_pinyins_for_xml(self):
+        '''Return the xml content as a list of [string, pinyins] pairs.'''
+        res = []
+
+        for x1 in self.rnode:
+            if x1.nodeType == Node.TEXT_NODE:
+                t = re.sub(r"\s+", "", x1.value)
+                res.append([t, []])
+            else:
+                for x2 in x1.childNodes:
+                    if isinstance(x2, xml.dom.minidom.Text):
+                        t = re.sub(r"\s+", "", x2.data)
+                        res.append([t, []])
+                    else:
+                        # print("x2", x2, x2.tagName)
+                        if x2.hasAttribute('pinyin'):
+                            pinyin_value = x2.getAttribute("pinyin")
+                            pinyins = pinyin_value.split(" ")
+                        for x3 in x2.childNodes:
+                            # print('x3', x3)
+                            if isinstance(x3, xml.dom.minidom.Text):
+                                t = re.sub(r"\s+", "", x3.data)
+                                res.append([t, pinyins])
+                            else:
+                                print("len(nodes of x3):", len(x3.childNodes))
+
+        return res
+
+    def get_all_tags(self, tag_name):
+        '''Get all tags with the given name and print their pinyin attribute values.'''
+        alltags = self.root.getElementsByTagName(tag_name)
+        for x in alltags:
+            if x.hasAttribute('pinyin'):  # pinyin
+                print(x.tagName, 'pinyin',
+                      x.getAttribute('pinyin'), x.firstChild.data)
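To make the intended call pattern concrete, here is a small usage sketch of the processor above; the SSML string and its pinyin annotation are invented for illustration:

```python
# Illustrative only: the <speak> snippet below is a made-up SSML example.
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor

mixstr = '请问<speak><say-as pinyin="zhao1 yang2">朝阳</say-as></speak>在哪里'
for text, pinyins in MixTextProcessor.get_pinyin_split(mixstr):
    print(text, pinyins)
# Plain segments pair with an empty list; the tagged segment carries its
# annotation, e.g. ['朝阳', ['zhao1', 'yang2']].
```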
diff --git a/paddlespeech/t2s/training/__init__.py b/paddlespeech/t2s/training/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/cli.py b/paddlespeech/t2s/training/cli.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/default_config.py b/paddlespeech/t2s/training/default_config.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/experiment.py b/paddlespeech/t2s/training/experiment.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/extension.py b/paddlespeech/t2s/training/extension.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/extensions/__init__.py b/paddlespeech/t2s/training/extensions/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/extensions/evaluator.py b/paddlespeech/t2s/training/extensions/evaluator.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/extensions/snapshot.py b/paddlespeech/t2s/training/extensions/snapshot.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/extensions/visualizer.py b/paddlespeech/t2s/training/extensions/visualizer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/optimizer.py b/paddlespeech/t2s/training/optimizer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/reporter.py b/paddlespeech/t2s/training/reporter.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/seeding.py b/paddlespeech/t2s/training/seeding.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/trainer.py b/paddlespeech/t2s/training/trainer.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/trigger.py b/paddlespeech/t2s/training/trigger.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/triggers/__init__.py b/paddlespeech/t2s/training/triggers/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/triggers/interval_trigger.py b/paddlespeech/t2s/training/triggers/interval_trigger.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/triggers/limit_trigger.py b/paddlespeech/t2s/training/triggers/limit_trigger.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/triggers/time_trigger.py b/paddlespeech/t2s/training/triggers/time_trigger.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/updater.py b/paddlespeech/t2s/training/updater.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/updaters/__init__.py b/paddlespeech/t2s/training/updaters/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/t2s/training/updaters/standard_updater.py
b/paddlespeech/t2s/training/updaters/standard_updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/__init__.py b/paddlespeech/t2s/utils/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/checkpoint.py b/paddlespeech/t2s/utils/checkpoint.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/display.py b/paddlespeech/t2s/utils/display.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/error_rate.py b/paddlespeech/t2s/utils/error_rate.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/h5_utils.py b/paddlespeech/t2s/utils/h5_utils.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/internals.py b/paddlespeech/t2s/utils/internals.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/layer_tools.py b/paddlespeech/t2s/utils/layer_tools.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/mp_tools.py b/paddlespeech/t2s/utils/mp_tools.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/profiler.py b/paddlespeech/t2s/utils/profiler.py old mode 100644 new mode 100755 diff --git a/paddlespeech/t2s/utils/scheduler.py b/paddlespeech/t2s/utils/scheduler.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/__init__.py b/paddlespeech/text/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/exps/__init__.py b/paddlespeech/text/exps/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/exps/ernie_linear/__init__.py b/paddlespeech/text/exps/ernie_linear/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/exps/ernie_linear/avg_model.py b/paddlespeech/text/exps/ernie_linear/avg_model.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/exps/ernie_linear/punc_restore.py b/paddlespeech/text/exps/ernie_linear/punc_restore.py old mode 100644 new mode 100755 index 2cb4d0719..98804606c --- a/paddlespeech/text/exps/ernie_linear/punc_restore.py +++ b/paddlespeech/text/exps/ernie_linear/punc_restore.py @@ -25,8 +25,6 @@ DefinedClassifier = { 'ErnieLinear': ErnieLinear, } -tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') - def _clean_text(text, punc_list): text = text.lower() @@ -35,7 +33,7 @@ def _clean_text(text, punc_list): return text -def preprocess(text, punc_list): +def preprocess(text, punc_list, tokenizer): clean_text = _clean_text(text, punc_list) assert len(clean_text) > 0, f'Invalid input string: {text}' tokenized_input = tokenizer( @@ -51,7 +49,8 @@ def test(args): with open(args.config) as f: config = CfgNode(yaml.safe_load(f)) print("========Args========") - print(yaml.safe_dump(vars(args))) + print(yaml.safe_dump(vars(args), allow_unicode=True)) + # print(args) print("========Config========") print(config) @@ -61,10 +60,16 @@ def test(args): punc_list.append(line.strip()) model = DefinedClassifier[config["model_type"]](**config["model"]) + # print(model) + + pretrained_token = config['data_params']['pretrained_token'] + tokenizer = ErnieTokenizer.from_pretrained(pretrained_token) + # tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + state_dict = paddle.load(args.checkpoint) model.set_state_dict(state_dict["main_params"]) model.eval() - _inputs = preprocess(args.text, punc_list) + _inputs = preprocess(args.text, punc_list, tokenizer) seq_len = _inputs['seq_len'] input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0) seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0) diff --git 
a/paddlespeech/text/exps/ernie_linear/test.py b/paddlespeech/text/exps/ernie_linear/test.py old mode 100644 new mode 100755 index 4302a1a3b..aa172cc69 --- a/paddlespeech/text/exps/ernie_linear/test.py +++ b/paddlespeech/text/exps/ernie_linear/test.py @@ -23,6 +23,7 @@ from sklearn.metrics import classification_report from sklearn.metrics import precision_recall_fscore_support from yacs.config import CfgNode +from paddlespeech.t2s.utils import str2bool from paddlespeech.text.models.ernie_linear import ErnieLinear from paddlespeech.text.models.ernie_linear import PuncDataset from paddlespeech.text.models.ernie_linear import PuncDatasetFromErnieTokenizer @@ -91,9 +92,10 @@ def test(args): t = classification_report( test_total_label, test_total_predict, target_names=punc_list) print(t) - t2 = evaluation(test_total_label, test_total_predict) - print('=========================================================') - print(t2) + if args.print_eval: + t2 = evaluation(test_total_label, test_total_predict) + print('=========================================================') + print(t2) def main(): @@ -101,6 +103,7 @@ def main(): parser = argparse.ArgumentParser(description="Test a ErnieLinear model.") parser.add_argument("--config", type=str, help="ErnieLinear config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") + parser.add_argument("--print_eval", type=str2bool, default=True) parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/__init__.py b/paddlespeech/text/models/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/ernie_crf/__init__.py b/paddlespeech/text/models/ernie_crf/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/ernie_crf/model.py b/paddlespeech/text/models/ernie_crf/model.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/ernie_linear/__init__.py b/paddlespeech/text/models/ernie_linear/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/ernie_linear/dataset.py b/paddlespeech/text/models/ernie_linear/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/ernie_linear/ernie_linear.py b/paddlespeech/text/models/ernie_linear/ernie_linear.py old mode 100644 new mode 100755 diff --git a/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py b/paddlespeech/text/models/ernie_linear/ernie_linear_updater.py old mode 100644 new mode 100755 diff --git a/paddlespeech/utils/__init__.py b/paddlespeech/utils/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/utils/dynamic_import.py b/paddlespeech/utils/dynamic_import.py old mode 100644 new mode 100755 diff --git a/paddlespeech/utils/env.py b/paddlespeech/utils/env.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/__init__.py b/paddlespeech/vector/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/cluster/__init__.py b/paddlespeech/vector/cluster/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/cluster/diarization.py b/paddlespeech/vector/cluster/diarization.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/cluster/plda.py b/paddlespeech/vector/cluster/plda.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/__init__.py 
b/paddlespeech/vector/exps/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/__init__.py b/paddlespeech/vector/exps/ge2e/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/audio_processor.py b/paddlespeech/vector/exps/ge2e/audio_processor.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/config.py b/paddlespeech/vector/exps/ge2e/config.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/dataset_processors.py b/paddlespeech/vector/exps/ge2e/dataset_processors.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/inference.py b/paddlespeech/vector/exps/ge2e/inference.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/preprocess.py b/paddlespeech/vector/exps/ge2e/preprocess.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/random_cycle.py b/paddlespeech/vector/exps/ge2e/random_cycle.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py b/paddlespeech/vector/exps/ge2e/speaker_verification_dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/exps/ge2e/train.py b/paddlespeech/vector/exps/ge2e/train.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/__init__.py b/paddlespeech/vector/io/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/dataset.py b/paddlespeech/vector/io/dataset.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/dataset_from_json.py b/paddlespeech/vector/io/dataset_from_json.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/embedding_norm.py b/paddlespeech/vector/io/embedding_norm.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/io/signal_processing.py b/paddlespeech/vector/io/signal_processing.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/models/__init__.py b/paddlespeech/vector/models/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/models/lstm_speaker_encoder.py b/paddlespeech/vector/models/lstm_speaker_encoder.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/modules/__init__.py b/paddlespeech/vector/modules/__init__.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/modules/layer.py b/paddlespeech/vector/modules/layer.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/modules/loss.py b/paddlespeech/vector/modules/loss.py old mode 100644 new mode 100755 diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py old mode 100644 new mode 100755 diff --git 
a/paddlespeech/vector/training/__init__.py b/paddlespeech/vector/training/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/vector/training/scheduler.py b/paddlespeech/vector/training/scheduler.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/vector/training/seeding.py b/paddlespeech/vector/training/seeding.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/vector/utils/__init__.py b/paddlespeech/vector/utils/__init__.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/vector/utils/time.py b/paddlespeech/vector/utils/time.py
old mode 100644
new mode 100755
diff --git a/paddlespeech/vector/utils/vector_utils.py b/paddlespeech/vector/utils/vector_utils.py
old mode 100644
new mode 100755
diff --git a/setup.cfg b/setup.cfg
old mode 100644
new mode 100755
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755
index e551d9fa6..7fb4c70be
--- a/setup.py
+++ b/setup.py
@@ -75,6 +75,8 @@ base = [
     "braceexpand",
     "pyyaml",
     "pybind11",
+    "paddlelite",
+    "paddleslim==2.3.4",
 ]
 
 server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"]
diff --git a/speechx/.clang-format b/speechx/.clang-format
new file mode 100755
index 000000000..af946a4a9
--- /dev/null
+++ b/speechx/.clang-format
@@ -0,0 +1,29 @@
+# This file is used by clang-format to autoformat paddle source code
+#
+# clang-format is part of the llvm toolchain.
+# llvm and clang need to be installed to format source code.
+#
+# The basic usage is,
+#   clang-format -i -style=file PATH/TO/SOURCE/CODE
+#
+# -style=file implicitly uses the ".clang-format" file located in one of
+# the parent directories.
+# The -i means in-place change.
+#
+# The documentation for clang-format is at
+#   http://clang.llvm.org/docs/ClangFormat.html
+#   http://clang.llvm.org/docs/ClangFormatStyleOptions.html
+---
+Language: Cpp
+BasedOnStyle: Google
+IndentWidth: 4
+TabWidth: 4
+ContinuationIndentWidth: 4
+MaxEmptyLinesToKeep: 2
+AccessModifierOffset: -2  # The private/protected/public has no indent in class
+Standard: Cpp11
+AllowAllParametersOfDeclarationOnNextLine: true
+BinPackParameters: false
+BinPackArguments: false
+...
+
diff --git a/speechx/.gitignore b/speechx/.gitignore
old mode 100644
new mode 100755
index e0c618470..9a93805c0
--- a/speechx/.gitignore
+++ b/speechx/.gitignore
@@ -1 +1,2 @@
 tools/valgrind*
+*log
diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt
old mode 100644
new mode 100755
index 4b5838e5c..09bdb3c1e
--- a/speechx/CMakeLists.txt
+++ b/speechx/CMakeLists.txt
@@ -13,7 +13,6 @@ set(CMAKE_CXX_STANDARD 14)
 set(speechx_cmake_dir ${PROJECT_SOURCE_DIR}/cmake)
 
 # Modules
-list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}/external)
 list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir})
 include(FetchContent)
 include(ExternalProject)
@@ -32,9 +31,13 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall
 ###############################################################################
 # Option Configurations
 ###############################################################################
-# option configurations
 option(TEST_DEBUG "option for debug" OFF)
+option(USE_PROFILING "enable c++ profiling" OFF)
+option(USING_U2 "compile u2 model." ON)
+option(USING_DS2 "compile with ds2 model." ON)
+
+option(USING_GPU "u2 compute on GPU."
OFF) ############################################################################### # Include third party @@ -83,48 +86,65 @@ add_dependencies(openfst gflags glog) # paddle lib -set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) -set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) -ExternalProject_Add(paddle - URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz - URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 - PREFIX ${paddle_PREFIX_DIR} - SOURCE_DIR ${paddle_SOURCE_DIR} - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" -) - -set(PADDLE_LIB ${fc_patch}/paddle-lib) -include_directories("${PADDLE_LIB}/paddle/include") -set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") - -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") -link_directories("${PADDLE_LIB}/paddle/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") - -##paddle with mkl -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") -include_directories("${MATH_LIB_PATH}/include") -set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} - ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") -include_directories("${MKLDNN_PATH}/include") -set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") - -set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp - ${EXTERNAL_LIB}) - +include(paddleinference) + + +# paddle core.so +find_package(Threads REQUIRED) +find_package(PythonLibs REQUIRED) +find_package(Python3 REQUIRED) +find_package(pybind11 CONFIG) + +message(STATUS "PYTHON_LIBRARIES = ${PYTHON_LIBRARIES}") +message(STATUS "Python3_EXECUTABLE = ${Python3_EXECUTABLE}") +message(STATUS "Pybind11_INCLUDES = ${pybind11_INCLUDE_DIRS}, pybind11_LIBRARIES=${pybind11_LIBRARIES}, pybind11_DEFINITIONS=${pybind11_DEFINITIONS}") + +# paddle include and link option +# -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/libs -L/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/fluid -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so +execute_process( + COMMAND python -c "\ +import os;\ +import paddle;\ +include_dir=paddle.sysconfig.get_include();\ +paddle_dir=os.path.split(include_dir)[0];\ +libs_dir=os.path.join(paddle_dir, 'libs');\ +fluid_dir=os.path.join(paddle_dir, 'fluid');\ +out=' '.join([\"-L\" + libs_dir, \"-L\" + fluid_dir]);\ +out += \" -l:libpaddle.so -l:libdnnl.so.2 -l:libiomp5.so\"; print(out);\ + " + OUTPUT_VARIABLE PADDLE_LINK_FLAGS + RESULT_VARIABLE SUCESS) + +message(STATUS PADDLE_LINK_FLAGS= ${PADDLE_LINK_FLAGS}) +string(STRIP ${PADDLE_LINK_FLAGS} PADDLE_LINK_FLAGS) + +# paddle compile option +# -I/workspace/DeepSpeech-2.x/speechx/venv/lib/python3.7/site-packages/paddle/include +execute_process( + COMMAND python -c "\ +import paddle; \ +include_dir = paddle.sysconfig.get_include(); \ +print(f\"-I{include_dir}\"); \ + " + 
OUTPUT_VARIABLE PADDLE_COMPILE_FLAGS)
+message(STATUS PADDLE_COMPILE_FLAGS= ${PADDLE_COMPILE_FLAGS})
+string(STRIP ${PADDLE_COMPILE_FLAGS} PADDLE_COMPILE_FLAGS)
+
+
+# for LD_LIBRARY_PATH
+# set(PADDLE_LIB_DIRS /workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid:/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/libs/)
+execute_process(
+    COMMAND python -c "\
+import os; \
+import paddle; \
+include_dir=paddle.sysconfig.get_include(); \
+paddle_dir=os.path.split(include_dir)[0]; \
+libs_dir=os.path.join(paddle_dir, 'libs'); \
+fluid_dir=os.path.join(paddle_dir, 'fluid'); \
+out=':'.join([libs_dir, fluid_dir]); print(out); \
+    "
+    OUTPUT_VARIABLE PADDLE_LIB_DIRS)
+message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS})
 
###############################################################################
diff --git a/speechx/README.md b/speechx/README.md
old mode 100644
new mode 100755
index cd1cd62c1..a575040db
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -3,11 +3,14 @@
 ## Environment
 
 We develop under:
+* python - 3.7
 * docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
 * os - Ubuntu 16.04.7 LTS
 * gcc/g++/gfortran - 8.2.0
 * cmake - 3.16.0
 
+> Please use `tools/venv.sh` to create a python `venv`, then `source venv/bin/activate` to build speechx.
+
 > We make sure all things work fine under docker, and recommend using it to develop and deploy.
 
 * [How to Install Docker](https://docs.docker.com/engine/install/)
@@ -24,16 +27,23 @@ docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --nam
 
 * More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
 
+2. Create the python environment.
-2. Build `speechx` and `examples`.
+```
+bash tools/venv.sh
+```
-> Do not source venv.
+3. Build `speechx` and `examples`.
+For now we are using features from the `develop` branch of paddle, so we need to install a `paddlepaddle` nightly build.
+For example:
 ```
-pushd /path/to/speechx
+source venv/bin/activate
+python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
 ./build.sh
 ```
+
 4. Go to `examples` and have fun. For more details, please see `README.md` under `examples`.
@@ -60,3 +70,46 @@ popd
 
 ### Deepspeech2 with linear feature
 
 * DecibelNormalizer: there is a small difference between the offline and online db norm. The computation of online db norm reads features chunk by chunk, which causes the feature size to be different from the offline db norm. In `normalizer.cc:73`, the `samples.size()` differs, which causes the different results.
+
+## FAQ
+
+1. No module named `paddle`.
+
+```
+CMake Error at CMakeLists.txt:119 (string):
+  string sub-command STRIP requires two arguments.
+
+
+Traceback (most recent call last):
+  File "<string>", line 1, in <module>
+ModuleNotFoundError: No module named 'paddle'
+-- PADDLE_COMPILE_FLAGS=
+CMake Error at CMakeLists.txt:131 (string):
+  string sub-command STRIP requires two arguments.
+
+
+  File "<string>", line 1
+    import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);
+    ^
+```
+
+Please install paddlepaddle >= 2.4rc.
+
+2. 
`u2_recognizer_main: error while loading shared libraries: liblibpaddle.so: cannot open shared object file: No such file or directory`
+
+
+```
+cd $YOUR_ENV_PATH/lib/python3.7/site-packages/paddle/fluid
+patchelf --set-soname libpaddle.so libpaddle.so
+```
+
+3. `u2_recognizer_main: error while loading shared libraries: libgfortran.so.5: cannot open shared object file: No such file or directory`
+
+```
+# my gcc version is 8.2
+apt-get install gfortran-8
+```
+
+4. `Undefined reference to '_gfortran_concat_string'`
+
+Use gcc 8.2 and gfortran 8.2.
diff --git a/speechx/build.sh b/speechx/build.sh
index a6eef6565..e0a386752 100755
--- a/speechx/build.sh
+++ b/speechx/build.sh
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+set -xe
 
 # the build script has been verified in the paddlepaddle docker image.
 # please follow the instructions below to install the PaddlePaddle image.
@@ -17,11 +18,6 @@ fi
 
 #rm -rf build
 mkdir -p build
-cd build
-cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
-#cmake ..
-
-make -j
-
-cd -
+cmake -B build -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
+cmake --build build
diff --git a/speechx/cmake/EnableCMP0048.cmake b/speechx/cmake/EnableCMP0048.cmake
old mode 100644
new mode 100755
diff --git a/speechx/cmake/FindGFortranLibs.cmake b/speechx/cmake/FindGFortranLibs.cmake
old mode 100644
new mode 100755
diff --git a/speechx/cmake/external/absl.cmake b/speechx/cmake/absl.cmake
old mode 100644
new mode 100755
similarity index 100%
rename from speechx/cmake/external/absl.cmake
rename to speechx/cmake/absl.cmake
diff --git a/speechx/cmake/external/boost.cmake b/speechx/cmake/boost.cmake
old mode 100644
new mode 100755
similarity index 100%
rename from speechx/cmake/external/boost.cmake
rename to speechx/cmake/boost.cmake
diff --git a/speechx/cmake/external/eigen.cmake b/speechx/cmake/eigen.cmake
old mode 100644
new mode 100755
similarity index 100%
rename from speechx/cmake/external/eigen.cmake
rename to speechx/cmake/eigen.cmake
diff --git a/speechx/cmake/external/gflags.cmake b/speechx/cmake/external/gflags.cmake
deleted file mode 100644
index 66ae47f70..000000000
--- a/speechx/cmake/external/gflags.cmake
+++ /dev/null
@@ -1,12 +0,0 @@
-include(FetchContent)
-
-FetchContent_Declare(
-    gflags
-    URL https://github.com/gflags/gflags/archive/v2.2.1.zip
-    URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a
-)
-
-FetchContent_MakeAvailable(gflags)
-
-# openfst need
-include_directories(${gflags_BINARY_DIR}/include)
\ No newline at end of file
diff --git a/speechx/cmake/gflags.cmake b/speechx/cmake/gflags.cmake
new file mode 100755
index 000000000..36bebc877
--- /dev/null
+++ b/speechx/cmake/gflags.cmake
@@ -0,0 +1,11 @@
+include(FetchContent)
+
+FetchContent_Declare(
+    gflags
+    URL https://github.com/gflags/gflags/archive/v2.2.2.zip
+    URL_HASH SHA256=19713a36c9f32b33df59d1c79b4958434cb005b5b47dc5400a7a4b078111d9b5
+)
+FetchContent_MakeAvailable(gflags)
+
+# openfst needs this
+include_directories(${gflags_BINARY_DIR}/include)
\ No newline at end of file
diff --git a/speechx/cmake/external/glog.cmake b/speechx/cmake/glog.cmake
old mode 100644
new mode 100755
similarity index 100%
rename from speechx/cmake/external/glog.cmake
rename to speechx/cmake/glog.cmake
diff --git a/speechx/cmake/external/gtest.cmake b/speechx/cmake/gtest.cmake
old mode 100644
new mode 100755
similarity index 69%
rename from speechx/cmake/external/gtest.cmake
rename to speechx/cmake/gtest.cmake
index 7fe397fcb..1ea8ed0b7
--- a/speechx/cmake/external/gtest.cmake
+++ b/speechx/cmake/gtest.cmake
@@ -1,8 +1,8 @@
include(FetchContent) FetchContent_Declare( gtest - URL https://github.com/google/googletest/archive/release-1.10.0.zip - URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 + URL https://github.com/google/googletest/archive/release-1.11.0.zip + URL_HASH SHA256=353571c2440176ded91c2de6d6cd88ddd41401d14692ec1f99e35d013feda55a ) FetchContent_MakeAvailable(gtest) diff --git a/speechx/cmake/external/kenlm.cmake b/speechx/cmake/kenlm.cmake old mode 100644 new mode 100755 similarity index 100% rename from speechx/cmake/external/kenlm.cmake rename to speechx/cmake/kenlm.cmake diff --git a/speechx/cmake/external/libsndfile.cmake b/speechx/cmake/libsndfile.cmake old mode 100644 new mode 100755 similarity index 100% rename from speechx/cmake/external/libsndfile.cmake rename to speechx/cmake/libsndfile.cmake diff --git a/speechx/cmake/external/openblas.cmake b/speechx/cmake/openblas.cmake old mode 100644 new mode 100755 similarity index 88% rename from speechx/cmake/external/openblas.cmake rename to speechx/cmake/openblas.cmake index 5c196527e..27e132075 --- a/speechx/cmake/external/openblas.cmake +++ b/speechx/cmake/openblas.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) -set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix) +set(OpenBLAS_SOURCE_DIR ${fc_patch}/openblas-src) +set(OpenBLAS_PREFIX ${fc_patch}/openblas-prefix) # ###################################################################################################################### # OPENBLAS https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575 @@ -43,6 +43,7 @@ ExternalProject_Add( # https://cmake.org/cmake/help/latest/module/ExternalProject.html?highlight=externalproject_get_property#external-project-definition ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) +message(STATUS "OPENBLAS install dir: ${INSTALL_DIR}") set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR}) add_library(openblas STATIC IMPORTED) add_dependencies(openblas OPENBLAS) @@ -55,4 +56,6 @@ set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_P # ${CMAKE_INSTALL_LIBDIR} lib # ${CMAKE_INSTALL_INCLUDEDIR} include link_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) -include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file +# include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) +# fix for can not find `cblas.h` +include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/openblas) \ No newline at end of file diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/openfst.cmake old mode 100644 new mode 100755 similarity index 100% rename from speechx/cmake/external/openfst.cmake rename to speechx/cmake/openfst.cmake diff --git a/speechx/cmake/paddleinference.cmake b/speechx/cmake/paddleinference.cmake new file mode 100755 index 000000000..d8a9c6134 --- /dev/null +++ b/speechx/cmake/paddleinference.cmake @@ -0,0 +1,49 @@ +set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) +set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) + +include(FetchContent) +FetchContent_Declare( + paddle + URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz + URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 + PREFIX ${paddle_PREFIX_DIR} + SOURCE_DIR ${paddle_SOURCE_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" +) +FetchContent_MakeAvailable(paddle) + +set(PADDLE_LIB_THIRD_PARTY_PATH 
"${paddle_SOURCE_DIR}/third_party/install/") + +include_directories("${paddle_SOURCE_DIR}/paddle/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") +link_directories("${paddle_SOURCE_DIR}/paddle/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn/lib") + +##paddle with mkl +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") +include_directories("${MATH_LIB_PATH}/include") +set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + +set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") +include_directories("${MKLDNN_PATH}/include") +set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) +set(EXTERNAL_LIB "-lrt -ldl -lpthread") + +# global vars +set(DEPS ${paddle_SOURCE_DIR}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX} CACHE INTERNAL "deps") +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash cryptopp + ${EXTERNAL_LIB} CACHE INTERNAL "deps") +message(STATUS "Deps libraries: ${DEPS}") diff --git a/speechx/docker/.gitkeep b/speechx/docker/.gitkeep old mode 100644 new mode 100755 diff --git a/speechx/examples/.gitignore b/speechx/examples/.gitignore old mode 100644 new mode 100755 diff --git a/speechx/examples/README.md b/speechx/examples/README.md old mode 100644 new mode 100755 index f7f6f9ac0..de27bd94b --- a/speechx/examples/README.md +++ b/speechx/examples/README.md @@ -1,20 +1,42 @@ # Examples for SpeechX +> `u2pp_ol` is recommended. + +* `u2pp_ol` - u2++ streaming asr test under `aishell-1` test dataset. * `ds2_ol` - ds2 streaming test under `aishell-1` test dataset. + ## How to run -`run.sh` is the entry point. +### Create env + +Using `tools/evn.sh` under `speechx` to create python env. + +``` +bash tools/env.sh +``` + +Source env before play with example. +``` +. venv/bin/activate +``` + +### Play with example + +`run.sh` is the entry point for every example. -Example to play `ds2_ol`: +Example to play `u2pp_ol`: ``` -pushd ds2_ol/aishell -bash run.sh +pushd u2pp_ol/wenetspeech +bash run.sh --stop_stage 4 ``` ## Display Model with [Netron](https://github.com/lutzroeder/netron) +If you have a model, we can using this commnd to show model graph. + +For example: ``` pip install netron netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 diff --git a/speechx/examples/codelab/README.md b/speechx/examples/codelab/README.md old mode 100644 new mode 100755 index f89184de9..803f25fac --- a/speechx/examples/codelab/README.md +++ b/speechx/examples/codelab/README.md @@ -1,8 +1,9 @@ # Codelab -## introduction +> The below is for developing and offline testing. +> Do not run it only if you know what it is. -> The below is for developing and offline testing. Do not run it only if you know what it is. 
 * nnet
 * feat
 * decoder
+* u2
diff --git a/speechx/examples/codelab/decoder/.gitignore b/speechx/examples/codelab/decoder/.gitignore
old mode 100644
new mode 100755
diff --git a/speechx/examples/codelab/decoder/README.md b/speechx/examples/codelab/decoder/README.md
old mode 100644
new mode 100755
diff --git a/speechx/examples/codelab/decoder/path.sh b/speechx/examples/codelab/decoder/path.sh
old mode 100644
new mode 100755
diff --git a/speechx/examples/codelab/decoder/run.sh b/speechx/examples/codelab/decoder/run.sh
index a911eb033..1a9e3cd7e 100755
--- a/speechx/examples/codelab/decoder/run.sh
+++ b/speechx/examples/codelab/decoder/run.sh
@@ -69,7 +69,7 @@ compute_linear_spectrogram_main \
 echo "compute linear spectrogram feature."
 
 # run ctc beam search decoder as streaming
-ctc_prefix_beam_search_decoder_main \
+ctc_beam_search_decoder_main \
     --result_wspecifier=ark,t:$exp_dir/result.txt \
     --feature_rspecifier=ark:$feat_wspecifier \
     --model_path=$model_dir/avg_1.jit.pdmodel \
diff --git a/speechx/examples/codelab/feat/.gitignore b/speechx/examples/codelab/feat/.gitignore
new file mode 100755
index 000000000..bbd86a25b
--- /dev/null
+++ b/speechx/examples/codelab/feat/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/speechx/examples/codelab/feat/README.md b/speechx/examples/codelab/feat/README.md
old mode 100644
new mode 100755
diff --git a/speechx/examples/codelab/feat/path.sh b/speechx/examples/codelab/feat/path.sh
old mode 100644
new mode 100755
index 3b89d01e9..9d2291743
--- a/speechx/examples/codelab/feat/path.sh
+++ b/speechx/examples/codelab/feat/path.sh
@@ -1,12 +1,12 @@
 # This contains the locations of the binaries required for running the examples.
 
 SPEECHX_ROOT=$PWD/../../../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
 
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. Please ensure that the project built successfully"; }
 
 export LC_AL=C
diff --git a/speechx/examples/codelab/feat/run.sh b/speechx/examples/codelab/feat/run.sh
index 1fa37f981..5d7612ae5 100755
--- a/speechx/examples/codelab/feat/run.sh
+++ b/speechx/examples/codelab/feat/run.sh
@@ -42,8 +42,8 @@ mkdir -p $exp_dir
 export GLOG_logtostderr=1
 
 cmvn_json2kaldi_main \
-    --json_file $model_dir/data/mean_std.json \
-    --cmvn_write_path $exp_dir/cmvn.ark \
+    --json_file=$model_dir/data/mean_std.json \
+    --cmvn_write_path=$exp_dir/cmvn.ark \
     --binary=false
 echo "convert json cmvn to kaldi ark."
 
@@ -54,4 +54,10 @@ compute_linear_spectrogram_main \
     --cmvn_file=$exp_dir/cmvn.ark
 echo "compute linear spectrogram feature."
 
+compute_fbank_main \
+    --num_bins=161 \
+    --wav_rspecifier=scp:$data_dir/wav.scp \
+    --feature_wspecifier=ark,t:$exp_dir/fbank.ark \
+    --cmvn_file=$exp_dir/cmvn.ark
+echo "compute fbank feature."
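For orientation, `cmvn_json2kaldi_main` turns the training-side `mean_std.json` statistics into the Kaldi-style CMVN archive consumed above. A rough Python sketch of that conversion follows; the JSON field names are assumptions for illustration, not read from the C++ tool:

```python
import json
import numpy as np

# Hypothetical layout of mean_std.json: cumulative per-dim sums ("mean_stat"),
# sums of squares ("var_stat"), and a frame count ("frame_num").
with open("mean_std.json") as f:
    stats = json.load(f)
sums = np.asarray(stats["mean_stat"], dtype=np.float64)
sumsqs = np.asarray(stats["var_stat"], dtype=np.float64)
count = float(stats["frame_num"])

# Kaldi CMVN stats are a 2 x (dim + 1) matrix:
# row 0 = [per-dim sums..., frame count], row 1 = [per-dim sums of squares..., 0].
cmvn = np.zeros((2, sums.size + 1), dtype=np.float64)
cmvn[0, :-1], cmvn[0, -1] = sums, count
cmvn[1, :-1] = sumsqs
np.savetxt("cmvn_sketch.txt", cmvn)  # the real tool writes a Kaldi ark (text when --binary=false)
```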
diff --git a/speechx/examples/codelab/nnet/.gitignore b/speechx/examples/codelab/nnet/.gitignore
old mode 100644
new mode 100755
diff --git a/speechx/examples/codelab/nnet/README.md b/speechx/examples/codelab/nnet/README.md
old mode 100644
new mode 100755
diff --git a/speechx/examples/codelab/nnet/path.sh b/speechx/examples/codelab/nnet/path.sh
old mode 100644
new mode 100755
index 7d395d648..11c8aef8b
--- a/speechx/examples/codelab/nnet/path.sh
+++ b/speechx/examples/codelab/nnet/path.sh
@@ -6,7 +6,7 @@ SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
 SPEECHX_TOOLS=$SPEECHX_ROOT/tools
 TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
 
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. Please ensure that the project built successfully"; }
 
 export LC_AL=C
diff --git a/speechx/examples/codelab/u2/.gitignore b/speechx/examples/codelab/u2/.gitignore
new file mode 100755
index 000000000..1269488f7
--- /dev/null
+++ b/speechx/examples/codelab/u2/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/speechx/examples/codelab/u2/README.md b/speechx/examples/codelab/u2/README.md
new file mode 100755
index 000000000..3c85dc917
--- /dev/null
+++ b/speechx/examples/codelab/u2/README.md
@@ -0,0 +1 @@
+# u2/u2pp Streaming Test
diff --git a/speechx/examples/codelab/u2/local/decode.sh b/speechx/examples/codelab/u2/local/decode.sh
new file mode 100755
index 000000000..11c1afe86
--- /dev/null
+++ b/speechx/examples/codelab/u2/local/decode.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set +x
+set -e
+
+. path.sh
+
+data=data
+exp=exp
+mkdir -p $exp
+ckpt_dir=$data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+ctc_prefix_beam_search_decoder_main \
+    --model_path=$model_dir/export.jit \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --vocab_path=$model_dir/unit.txt \
+    --feature_rspecifier=ark,t:$exp/fbank.ark \
+    --result_wspecifier=ark,t:$exp/result.ark
+
+echo "u2 ctc prefix beam search decode."
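The three context flags in `decode.sh` above determine how much audio each decoding step consumes. A quick sanity check of the window arithmetic; this is our reading of the flags, not taken from the tool's documentation:

```python
# Window arithmetic for one decoding chunk, under our reading of the flags.
subsampling_rate = 4        # input frames consumed per encoder output step
receptive_field_length = 7  # input frames needed to produce the first output
nnet_decoder_chunk = 16     # encoder output steps decoded per chunk

window = (nnet_decoder_chunk - 1) * subsampling_rate + receptive_field_length
shift = nnet_decoder_chunk * subsampling_rate
print(window, shift)  # 67 input frames per window, advancing 64 frames per chunk
```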
diff --git a/speechx/examples/codelab/u2/local/feat.sh b/speechx/examples/codelab/u2/local/feat.sh
new file mode 100755
index 000000000..1eec3aae3
--- /dev/null
+++ b/speechx/examples/codelab/u2/local/feat.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+set -x
+set -e
+
+. path.sh
+
+data=data
+exp=exp
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+
+cmvn_json2kaldi_main \
+    --json_file $model_dir/mean_std.json \
+    --cmvn_write_path $exp/cmvn.ark \
+    --binary=false
+
+echo "convert json cmvn to kaldi ark."
+
+compute_fbank_main \
+    --num_bins 80 \
+    --wav_rspecifier=scp:$data/wav.scp \
+    --cmvn_file=$exp/cmvn.ark \
+    --feature_wspecifier=ark,t:$exp/fbank.ark
+
+echo "compute fbank feature."
diff --git a/speechx/examples/codelab/u2/local/nnet.sh b/speechx/examples/codelab/u2/local/nnet.sh
new file mode 100755
index 000000000..4419201cf
--- /dev/null
+++ b/speechx/examples/codelab/u2/local/nnet.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -x
+set -e
+
+. path.sh
+
+data=data
+exp=exp
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+u2_nnet_main \
+    --model_path=$model_dir/export.jit \
+    --feature_rspecifier=ark,t:$exp/fbank.ark \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --acoustic_scale=1.0 \
+    --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \
+    --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark
+echo "u2 nnet decode."
+
diff --git a/speechx/examples/codelab/u2/local/recognizer.sh b/speechx/examples/codelab/u2/local/recognizer.sh
new file mode 100755
index 000000000..9f697b459
--- /dev/null
+++ b/speechx/examples/codelab/u2/local/recognizer.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -e
+
+. path.sh
+
+data=data
+exp=exp
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/
+
+u2_recognizer_main \
+    --use_fbank=true \
+    --num_bins=80 \
+    --cmvn_file=$exp/cmvn.ark \
+    --model_path=$model_dir/export.jit \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --vocab_path=$model_dir/unit.txt \
+    --wav_rspecifier=scp:$data/wav.scp \
+    --result_wspecifier=ark,t:$exp/result.ark
diff --git a/speechx/examples/codelab/u2/path.sh b/speechx/examples/codelab/u2/path.sh
new file mode 100755
index 000000000..ec278bd3d
--- /dev/null
+++ b/speechx/examples/codelab/u2/path.sh
@@ -0,0 +1,18 @@
+# This contains the locations of the binaries required for running the examples.
+
+unset GREP_OPTIONS
+
+SPEECHX_ROOT=$PWD/../../../
+SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. Please ensure that the project built successfully"; }
+
+export LC_AL=C
+
+export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer
+
+PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);")
+export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH
diff --git a/speechx/examples/codelab/u2/run.sh b/speechx/examples/codelab/u2/run.sh
new file mode 100755
index 000000000..d314262ba
--- /dev/null
+++ b/speechx/examples/codelab/u2/run.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+set -x
+set -e
+
+. path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_BUILD} ]; then
+    pushd ${SPEECHX_ROOT}
+    bash build.sh
+    popd
+fi
+
+# 2. download model
+if [ ! -f data/model/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz ]; then
+    mkdir -p data/model
+    pushd data/model
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
+    tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model.tar.gz
+    popd
+fi
+
+# produce wav scp
+if [ ! 
-f data/wav.scp ]; then + mkdir -p data + pushd data + wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav + echo "utt1 " $PWD/zh.wav > wav.scp + popd +fi + +data=data +exp=exp +mkdir -p $exp +ckpt_dir=./data/model +model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.1.0.model/ + + +./local/feat.sh + +./local/nnet.sh + +./local/decode.sh diff --git a/speechx/examples/codelab/u2/utils b/speechx/examples/codelab/u2/utils new file mode 120000 index 000000000..23cef9612 --- /dev/null +++ b/speechx/examples/codelab/u2/utils @@ -0,0 +1 @@ +../../../../utils \ No newline at end of file diff --git a/speechx/examples/custom_asr/README.md b/speechx/examples/custom_asr/README.md old mode 100644 new mode 100755 diff --git a/speechx/examples/custom_asr/path.sh b/speechx/examples/custom_asr/path.sh old mode 100644 new mode 100755 diff --git a/speechx/examples/custom_asr/run.sh b/speechx/examples/custom_asr/run.sh old mode 100644 new mode 100755 diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md old mode 100644 new mode 100755 diff --git a/speechx/examples/ds2_ol/aishell/.gitignore b/speechx/examples/ds2_ol/aishell/.gitignore old mode 100644 new mode 100755 diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md old mode 100644 new mode 100755 diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 82e889ce5..794b533ff 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -1,5 +1,5 @@ #!/bin/bash -set +x +set -x set -e . path.sh @@ -11,7 +11,7 @@ stop_stage=100 . utils/parse_options.sh # 1. compile -if [ ! -d ${SPEECHX_EXAMPLES} ]; then +if [ ! -d ${SPEECHX_BUILD} ]; then pushd ${SPEECHX_ROOT} bash build.sh popd @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ @@ -103,7 +103,7 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # decode with lm utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ @@ -135,7 +135,7 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # TLG decoder utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \ - tlg_decoder_main \ + ctc_tlg_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ --model_path=$model_dir/avg_1.jit.pdmodel \ --param_path=$model_dir/avg_1.jit.pdiparams \ diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh index 720728354..1c3c3e010 100755 --- a/speechx/examples/ds2_ol/aishell/run_fbank.sh +++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh @@ -84,7 +84,7 @@ fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # recognizer utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \ - ctc_prefix_beam_search_decoder_main \ + ctc_beam_search_decoder_main \ --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \ --model_path=$model_dir/avg_5.jit.pdmodel \ --param_path=$model_dir/avg_5.jit.pdiparams \ @@ -102,7 +102,7 @@ fi if [ 
${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 # decode with lm
 utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
-    ctc_prefix_beam_search_decoder_main \
+    ctc_beam_search_decoder_main \
     --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
     --model_path=$model_dir/avg_5.jit.pdmodel \
     --param_path=$model_dir/avg_5.jit.pdiparams \
@@ -133,7 +133,7 @@ fi
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
 # TLG decoder
 utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \
-    tlg_decoder_main \
+    ctc_tlg_decoder_main \
     --feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
     --model_path=$model_dir/avg_5.jit.pdmodel \
     --param_path=$model_dir/avg_5.jit.pdiparams \
diff --git a/speechx/examples/ds2_ol/onnx/.gitignore b/speechx/examples/ds2_ol/onnx/.gitignore
old mode 100644
new mode 100755
diff --git a/speechx/examples/ds2_ol/onnx/README.md b/speechx/examples/ds2_ol/onnx/README.md
old mode 100644
new mode 100755
diff --git a/speechx/examples/ds2_ol/websocket/.gitignore b/speechx/examples/ds2_ol/websocket/.gitignore
old mode 100644
new mode 100755
diff --git a/speechx/examples/text_lm/.gitignore b/speechx/examples/text_lm/.gitignore
old mode 100644
new mode 100755
diff --git a/speechx/examples/text_lm/README.md b/speechx/examples/text_lm/README.md
old mode 100644
new mode 100755
diff --git a/speechx/examples/text_lm/local/data/chars.dic b/speechx/examples/text_lm/local/data/chars.dic
old mode 100644
new mode 100755
diff --git a/speechx/examples/text_lm/local/data/words.dic b/speechx/examples/text_lm/local/data/words.dic
old mode 100644
new mode 100755
diff --git a/speechx/examples/text_lm/path.sh b/speechx/examples/text_lm/path.sh
old mode 100644
new mode 100755
diff --git a/speechx/examples/u2pp_ol/README.md b/speechx/examples/u2pp_ol/README.md
new file mode 100755
index 000000000..838db435c
--- /dev/null
+++ b/speechx/examples/u2pp_ol/README.md
@@ -0,0 +1,5 @@
+# U2/U2++ Streaming ASR
+
+## Examples
+
+* `wenetspeech` - streaming decoding with the wenetspeech u2/u2++ model, using the aishell test data.
diff --git a/speechx/examples/u2pp_ol/wenetspeech/.gitignore b/speechx/examples/u2pp_ol/wenetspeech/.gitignore
new file mode 100755
index 000000000..bbd86a25b
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md
new file mode 100755
index 000000000..b90b8e201
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/README.md
@@ -0,0 +1,27 @@
+# u2/u2pp Streaming ASR
+
+## Testing with Aishell Test Data
+
+### Download wav and model
+
+```
+./run.sh --stop_stage 0
+```
+
+### Compute feature
+
+```
+./run.sh --stage 1 --stop_stage 1
+```
+
+### Decoding using feature
+
+```
+./run.sh --stage 2 --stop_stage 2
+```
+
+### Decoding using wav
+
+```
+./run.sh --stage 3 --stop_stage 3
+```
diff --git a/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md b/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md
new file mode 100755
index 000000000..5b33f3641
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/RESULTS.md
@@ -0,0 +1,47 @@
+# aishell test
+
+7176 utts, duration 36108.9 sec.
+
+## U2++ Attention Rescore
+
+> Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, supports `avx512_vnni`.
+> RTF includes both feature extraction and decoding, so it is closer to an end-to-end measurement.
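RTF (real-time factor) is total processing time divided by total audio duration; the totals logged in the sections below reproduce it directly:

```python
# Totals taken from the logs below (aishell test, 36108.9 s of audio).
audio_sec = 36108.9
fp32_rtf = 11169.1 / audio_sec               # ~0.3093
int8_rtf = 9737.63 / audio_sec               # ~0.2697
rel_gain = (fp32_rtf - int8_rtf) / fp32_rtf  # ~0.128, i.e. the 12.8% relative gain
print(f"{fp32_rtf:.4f} {int8_rtf:.4f} {rel_gain:.1%}")
```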
+### FP32
+
+#### CER
+
+```
+Overall -> 5.75 % N=104765 C=99035 S=5587 D=143 I=294
+Mandarin -> 5.75 % N=104762 C=99035 S=5584 D=143 I=294
+English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
+
+#### RTF
+
+```
+I1027 10:52:38.662868 51665 u2_recognizer_main.cc:122] total wav duration is: 36108.9 sec
+I1027 10:52:38.662858 51665 u2_recognizer_main.cc:121] total cost:11169.1 sec
+I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318
+```
+
+### INT8
+
+> RTF improves by 12.8% relative to FP32; the measurement again covers feature extraction plus decoding.
+
+#### CER
+
+```
+Overall -> 5.83 % N=104765 C=98943 S=5675 D=147 I=286
+Mandarin -> 5.83 % N=104762 C=98943 S=5672 D=147 I=286
+English -> 0.00 % N=0 C=0 S=0 D=0 I=0
+Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
+```
+
+#### RTF
+
+```
+I1110 09:59:52.551712 37249 u2_recognizer_main.cc:122] total wav duration is: 36108.9 sec
+I1110 09:59:52.551717 37249 u2_recognizer_main.cc:123] total decode cost:9737.63 sec
+I1110 09:59:52.551723 37249 u2_recognizer_main.cc:124] RTF is: 0.269674
+```
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh
new file mode 100755
index 000000000..544a1f59a
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/local/aishell_train_lms.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+# To be run from one directory above this script.
+. ./path.sh
+
+nj=40
+text=data/local/lm/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+    [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# Check SRILM tools
+if ! which ngram-count > /dev/null; then
+    echo "SRILM tools not found. Please download and install them from:"
+    echo "http://www.speech.sri.com/projects/srilm/download.html"
+    echo "Then add the tools to your PATH."
+    exit 1
+fi
+
+# This script takes no arguments. It assumes you have already run
+# aishell_data_prep.sh.
+# It takes as input the files
+# data/local/lm/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+cleantext=$dir/text.no_oov
+
+# map OOV words to <SPOKEN_NOISE>
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: word0 ... wordn
+text_dir=$(dirname $text)
+split_name=$(basename $text)
+./local/split_data.sh $text_dir $text $split_name $nj
+
+utils/run.pl JOB=1:$nj $text_dir/split${nj}/JOB/${split_name}.no_oov.log \
+    cat ${text_dir}/split${nj}/JOB/${split_name} \| awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+    {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+    \> ${text_dir}/split${nj}/JOB/${split_name}.no_oov || exit 1;
+cat ${text_dir}/split${nj}/*/${split_name}.no_oov > $cleantext
+
+# compute word counts, sort in descending order
+# line: count word
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort --parallel=`nproc` | uniq -c | \
+    sort --parallel=`nproc` -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
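+# (For example, a hypothetical lexicon entry "你好 n i3 h ao3" contributes one
+# extra count for the word "你好", so every lexicon word gets a non-zero
+# unigram count even if it never appears in the transcripts.)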
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+    cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+    sort --parallel=`nproc` | uniq -c | sort --parallel=`nproc` -nr > $dir/unigram.counts || exit 1;
+
+# word list, with <s> and </s> added for the LM
+cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
+
+# hold out a subset to compute perplexity on
+heldout_sent=10000 # Don't change this if you want results to be comparable with kaldi_lm results
+
+mkdir -p $dir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+    head -$heldout_sent > $dir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
+    tail -n +$heldout_sent > $dir/train
+
+ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
+    -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
+ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh
new file mode 100755
index 000000000..059ed1b36
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/local/decode.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -e
+
+. path.sh
+
+data=data
+exp=exp
+nj=20
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model/
+text=$data/test/text
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/decoder.log \
+ctc_prefix_beam_search_decoder_main \
+    --model_path=$model_dir/export.jit \
+    --vocab_path=$model_dir/unit.txt \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --feature_rspecifier=scp:$data/split${nj}/JOB/fbank.scp \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_decode.ark
+
+cat $data/split${nj}/*/result_decode.ark > $exp/aishell.decode.rsl
+utils/compute-wer.py --char=1 --v=1 $text $exp/aishell.decode.rsl > $exp/aishell.decode.err
+tail -n 7 $exp/aishell.decode.err
\ No newline at end of file
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh
new file mode 100755
index 000000000..e181951e3
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/local/feat.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -e
+
+. path.sh
+
+nj=20
+stage=-1
+stop_stage=100
+
+. utils/parse_options.sh
+
+data=data
+exp=exp
+mkdir -p $exp
+
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model/
+aishell_wav_scp=aishell_test.scp
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    cmvn_json2kaldi_main \
+        --json_file $model_dir/mean_std.json \
+        --cmvn_write_path $exp/cmvn.ark \
+        --binary=false
+
+    echo "convert json cmvn to kaldi ark."
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
+    compute_fbank_main \
+        --num_bins 80 \
+        --cmvn_file=$exp/cmvn.ark \
+        --streaming_chunk=36 \
+        --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+        --feature_wspecifier=ark,scp:$data/split${nj}/JOB/fbank.ark,$data/split${nj}/JOB/fbank.scp
+
+    echo "compute fbank feature."
+fi
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh
new file mode 100755
index 000000000..f947e6b17
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/local/nnet.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+set -e
+
+. 
path.sh
+
+nj=20
+data=data
+exp=exp
+
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model/
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/nnet.log \
+u2_nnet_main \
+    --model_path=$model_dir/export.jit \
+    --vocab_path=$model_dir/unit.txt \
+    --feature_rspecifier=ark,t:${data}/split${nj}/JOB/fbank.ark \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --acoustic_scale=1.0 \
+    --nnet_encoder_outs_wspecifier=ark,t:$exp/encoder_outs.ark \
+    --nnet_prob_wspecifier=ark,t:$exp/logprobs.ark
+echo "u2 nnet decode."
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh
new file mode 100755
index 000000000..344fbcbce
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+data=data
+exp=exp
+nj=20
+
+. utils/parse_options.sh
+
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model/
+aishell_wav_scp=aishell_test.scp
+text=$data/test/text
+
+./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
+u2_recognizer_main \
+    --use_fbank=true \
+    --num_bins=80 \
+    --cmvn_file=$exp/cmvn.ark \
+    --model_path=$model_dir/export.jit \
+    --vocab_path=$model_dir/unit.txt \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/result_recognizer.ark
+
+
+cat $data/split${nj}/*/result_recognizer.ark > $exp/aishell_recognizer
+utils/compute-wer.py --char=1 --v=1 $text $exp/aishell_recognizer > $exp/aishell.recognizer.err
+echo "recognizer test finished."
+echo "please check $exp/aishell.recognizer.err"
+tail -n 7 $exp/aishell.recognizer.err
diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
new file mode 100755
index 000000000..1ce403a3c
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/local/recognizer_quant.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+data=data
+exp=exp
+nj=20
+
+. utils/parse_options.sh
+
+mkdir -p $exp
+ckpt_dir=./data/model
+model_dir=$ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model/
+aishell_wav_scp=aishell_test.scp
+text=$data/test/text
+
+./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
+
+utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.quant.log \
+u2_recognizer_main \
+    --use_fbank=true \
+    --num_bins=80 \
+    --cmvn_file=$exp/cmvn.ark \
+    --model_path=$model_dir/export \
+    --vocab_path=$model_dir/unit.txt \
+    --nnet_decoder_chunk=16 \
+    --receptive_field_length=7 \
+    --subsampling_rate=4 \
+    --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
+    --result_wspecifier=ark,t:$data/split${nj}/JOB/recognizer.quant.rsl.ark
+
+
+cat $data/split${nj}/*/recognizer.quant.rsl.ark > $exp/aishell.recognizer.quant.rsl
+utils/compute-wer.py --char=1 --v=1 $text $exp/aishell.recognizer.quant.rsl > $exp/aishell.recognizer.quant.err
+echo "recognizer quant test finished."
+echo "please checkout in $exp/aishell.recognizer.quant.err" +tail -n 7 $exp/aishell.recognizer.quant.err diff --git a/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh new file mode 100755 index 000000000..faa5c42dc --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/local/split_data.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +set -eo pipefail + +data=$1 +scp=$2 +split_name=$3 +numsplit=$4 + +# save in $data/split{n} +# $scp to split +# + +if [[ ! $numsplit -gt 0 ]]; then + echo "$0: Invalid num-split argument"; + exit 1; +fi + +directories=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n; done) +scp_splits=$(for n in `seq $numsplit`; do echo $data/split${numsplit}/$n/${split_name}; done) + +# if this mkdir fails due to argument-list being too long, iterate. +if ! mkdir -p $directories >&/dev/null; then + for n in `seq $numsplit`; do + mkdir -p $data/split${numsplit}/$n + done +fi + +echo "utils/split_scp.pl $scp $scp_splits" +utils/split_scp.pl $scp $scp_splits diff --git a/speechx/examples/u2pp_ol/wenetspeech/path.sh b/speechx/examples/u2pp_ol/wenetspeech/path.sh new file mode 100755 index 000000000..ec278bd3d --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/path.sh @@ -0,0 +1,18 @@ +# This contains the locations of binarys build required for running the examples. + +unset GREP_OPTIONS + +SPEECHX_ROOT=$PWD/../../../ +SPEECHX_BUILD=$SPEECHX_ROOT/build/speechx + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_BUILD ] || { echo "Error: 'build/speechx' directory not found. please ensure that the project build successfully"; } + +export LC_AL=C + +export PATH=$PATH:$TOOLS_BIN:$SPEECHX_BUILD/nnet:$SPEECHX_BUILD/decoder:$SPEECHX_BUILD/frontend/audio:$SPEECHX_BUILD/recognizer + +PADDLE_LIB_PATH=$(python -c "import os; import paddle; include_dir=paddle.sysconfig.get_include(); paddle_dir=os.path.split(include_dir)[0]; libs_dir=os.path.join(paddle_dir, 'libs'); fluid_dir=os.path.join(paddle_dir, 'fluid'); out=':'.join([libs_dir, fluid_dir]); print(out);") +export LD_LIBRARY_PATH=$PADDLE_LIB_PATH:$LD_LIBRARY_PATH diff --git a/speechx/examples/u2pp_ol/wenetspeech/run.sh b/speechx/examples/u2pp_ol/wenetspeech/run.sh new file mode 100755 index 000000000..870c5deeb --- /dev/null +++ b/speechx/examples/u2pp_ol/wenetspeech/run.sh @@ -0,0 +1,84 @@ +#!/bin/bash +set -e + +. path.sh + +nj=40 +stage=-1 +stop_stage=100 + +. utils/parse_options.sh + +# input +data=data +exp=exp +mkdir -p $exp $data +aishell_wav_scp=aishell_test.scp + +# 1. compile +if [ ! -d ${SPEECHX_BUILD} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + + +ckpt_dir=$data/model + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then + # download u2pp model + if [ ! -f $ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model.tar.gz ]; then + mkdir -p $ckpt_dir + pushd $ckpt_dir + + wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model.tar.gz + tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_1.3.0.model.tar.gz + + popd + fi + + # download u2pp quant model + if [ ! 
-f $ckpt_dir/asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model.tar.gz ]; then
+        mkdir -p $ckpt_dir
+        pushd $ckpt_dir
+
+        wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model.tar.gz
+        tar xzfv asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model.tar.gz
+
+        popd
+    fi
+
+    # test wav scp
+    if [ ! -f data/wav.scp ]; then
+        mkdir -p $data
+        pushd $data
+        wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
+        echo "utt1 " $PWD/zh.wav > wav.scp
+        popd
+    fi
+
+    # aishell wav scp
+    if [ ! -d $data/test ]; then
+        pushd $data
+        wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
+        unzip aishell_test.zip
+        popd
+
+        realpath $data/test/*/*.wav > $data/wavlist
+        awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
+        paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
+    fi
+fi
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    ./local/feat.sh
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ./local/decode.sh
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    ./local/recognizer.sh
+fi
diff --git a/speechx/examples/u2pp_ol/wenetspeech/utils b/speechx/examples/u2pp_ol/wenetspeech/utils
new file mode 120000
index 000000000..c2519a9dd
--- /dev/null
+++ b/speechx/examples/u2pp_ol/wenetspeech/utils
@@ -0,0 +1 @@
+../../../../utils/
\ No newline at end of file
diff --git a/speechx/patch/CPPLINT.cfg b/speechx/patch/CPPLINT.cfg
old mode 100644
new mode 100755
diff --git a/speechx/patch/README.md b/speechx/patch/README.md
old mode 100644
new mode 100755
diff --git a/speechx/patch/openfst/src/include/fst/flags.h b/speechx/patch/openfst/src/include/fst/flags.h
old mode 100644
new mode 100755
diff --git a/speechx/patch/openfst/src/include/fst/log.h b/speechx/patch/openfst/src/include/fst/log.h
old mode 100644
new mode 100755
diff --git a/speechx/patch/openfst/src/lib/flags.cc b/speechx/patch/openfst/src/lib/flags.cc
old mode 100644
new mode 100755
diff --git a/speechx/requirement.txt b/speechx/requirement.txt
new file mode 100755
index 000000000..6a6db0960
--- /dev/null
+++ b/speechx/requirement.txt
@@ -0,0 +1 @@
+paddlepaddle>=2.4rc
diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt
old mode 100644
new mode 100755
index c8e21d486..60c183472
--- a/speechx/speechx/CMakeLists.txt
+++ b/speechx/speechx/CMakeLists.txt
@@ -32,6 +32,12 @@
 ${CMAKE_CURRENT_SOURCE_DIR}/decoder
 )
 add_subdirectory(decoder)
 
+include_directories(
+${CMAKE_CURRENT_SOURCE_DIR}
+${CMAKE_CURRENT_SOURCE_DIR}/recognizer
+)
+add_subdirectory(recognizer)
+
 include_directories(
 ${CMAKE_CURRENT_SOURCE_DIR}
 ${CMAKE_CURRENT_SOURCE_DIR}/protocol
diff --git a/speechx/speechx/base/basic_types.h b/speechx/speechx/base/basic_types.h
old mode 100644
new mode 100755
index 206b7be67..2b15a61fe
--- a/speechx/speechx/base/basic_types.h
+++ b/speechx/speechx/base/basic_types.h
@@ -14,47 +14,47 @@
 #pragma once
 
-#include "kaldi/base/kaldi-types.h"
-
 #include <limits>
 
+#include "kaldi/base/kaldi-types.h"
+
 typedef float BaseFloat;
 typedef double double64;
 
 typedef signed char int8;
-typedef short int16;
-typedef int int32;
+typedef short int16;  // NOLINT
+typedef int int32;    // NOLINT
 
 #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
-typedef long int64;
+typedef long int64;  // NOLINT
 #else
-typedef long long int64;
+typedef long long int64;  // NOLINT
 #endif
 
-typedef unsigned char uint8;
-typedef unsigned short uint16;
-typedef unsigned int uint32;
+typedef unsigned char uint8;    // NOLINT
+typedef unsigned short uint16;  // NOLINT
+typedef unsigned int uint32;    // NOLINT
 
 #if defined(__LP64__) && !defined(OS_MACOSX) && !defined(OS_OPENBSD)
-typedef unsigned long uint64;
+typedef unsigned long uint64;  // NOLINT
 #else
-typedef unsigned long long uint64;
+typedef unsigned long long uint64;  // NOLINT
 #endif
 
 typedef signed int char32;
 
-const uint8 kuint8max = ((uint8)0xFF);
-const uint16 kuint16max = ((uint16)0xFFFF);
-const uint32 kuint32max = ((uint32)0xFFFFFFFF);
-const uint64 kuint64max = ((uint64)(0xFFFFFFFFFFFFFFFFLL));
-const int8 kint8min = ((int8)0x80);
-const int8 kint8max = ((int8)0x7F);
-const int16 kint16min = ((int16)0x8000);
-const int16 kint16max = ((int16)0x7FFF);
-const int32 kint32min = ((int32)0x80000000);
-const int32 kint32max = ((int32)0x7FFFFFFF);
-const int64 kint64min = ((int64)(0x8000000000000000LL));
-const int64 kint64max = ((int64)(0x7FFFFFFFFFFFFFFFLL));
+const uint8 kuint8max = static_cast<uint8>(0xFF);
+const uint16 kuint16max = static_cast<uint16>(0xFFFF);
+const uint32 kuint32max = static_cast<uint32>(0xFFFFFFFF);
+const uint64 kuint64max = static_cast<uint64>(0xFFFFFFFFFFFFFFFFLL);
+const int8 kint8min = static_cast<int8>(0x80);
+const int8 kint8max = static_cast<int8>(0x7F);
+const int16 kint16min = static_cast<int16>(0x8000);
+const int16 kint16max = static_cast<int16>(0x7FFF);
+const int32 kint32min = static_cast<int32>(0x80000000);
+const int32 kint32max = static_cast<int32>(0x7FFFFFFF);
+const int64 kint64min = static_cast<int64>(0x8000000000000000LL);
+const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
 
 const BaseFloat kBaseFloatMax = std::numeric_limits<BaseFloat>::max();
 const BaseFloat kBaseFloatMin = std::numeric_limits<BaseFloat>::min();
diff --git a/speechx/speechx/base/common.h b/speechx/speechx/base/common.h
old mode 100644
new mode 100755
index a9303cbbc..97bff9662
--- a/speechx/speechx/base/common.h
+++ b/speechx/speechx/base/common.h
@@ -14,21 +14,30 @@
 #pragma once
 
+#include
+#include
+#include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -38,3 +47,5 @@
 #include "base/flags.h"
 #include "base/log.h"
 #include "base/macros.h"
+#include "utils/file_utils.h"
+#include "utils/math.h"
\ No newline at end of file
diff --git a/speechx/speechx/base/flags.h b/speechx/speechx/base/flags.h
old mode 100644
new mode 100755
diff --git a/speechx/speechx/base/log.h b/speechx/speechx/base/log.h
old mode 100644
new mode 100755
diff --git a/speechx/speechx/base/macros.h b/speechx/speechx/base/macros.h
old mode 100644
new mode 100755
index d7d5a78d1..db9898123
--- a/speechx/speechx/base/macros.h
+++ b/speechx/speechx/base/macros.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include
+#include
+
 namespace ppspeech {
 
@@ -22,4 +25,7 @@ namespace ppspeech {
     void operator=(const TypeName&) = delete
 #endif
 
-}  // namespace pp_speech
\ No newline at end of file
+// kSpaceSymbol in UTF-8 is: ▁
+const char kSpaceSymbo[] = "\xe2\x96\x81";
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/base/thread_pool.h b/speechx/speechx/base/thread_pool.h
old mode 100644
new mode 100755
index ba895f714..6d59dac54
--- a/speechx/speechx/base/thread_pool.h
+++ b/speechx/speechx/base/thread_pool.h
@@ -35,7 +35,7 @@
 class ThreadPool {
   public:
-    ThreadPool(size_t);
+    explicit ThreadPool(size_t);
     template <class F, class... Args>
     auto enqueue(F&& f, Args&&... 
args) -> std::future<typename std::result_of<F(Args...)>::type>;
diff --git a/speechx/speechx/codelab/CMakeLists.txt b/speechx/speechx/codelab/CMakeLists.txt
old mode 100644
new mode 100755
diff --git a/speechx/speechx/codelab/README.md b/speechx/speechx/codelab/README.md
old mode 100644
new mode 100755
diff --git a/speechx/speechx/codelab/glog/CMakeLists.txt b/speechx/speechx/codelab/glog/CMakeLists.txt
old mode 100644
new mode 100755
diff --git a/speechx/speechx/codelab/glog/README.md b/speechx/speechx/codelab/glog/README.md
old mode 100644
new mode 100755
diff --git a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
old mode 100644
new mode 100755
index b0616a7de..c891827a1
--- a/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
+++ b/speechx/speechx/codelab/glog/glog_logtostderr_main.cc
@@ -17,7 +17,7 @@
 int main(int argc, char* argv[]) {
     // Initialize Google’s logging library.
     google::InitGoogleLogging(argv[0]);
-
+    google::InstallFailureSignalHandler();
     FLAGS_logtostderr = 1;
 
     LOG(INFO) << "Found " << 10 << " cookies";
diff --git a/speechx/speechx/codelab/glog/glog_main.cc b/speechx/speechx/codelab/glog/glog_main.cc
old mode 100644
new mode 100755
diff --git a/speechx/speechx/codelab/nnet/CMakeLists.txt b/speechx/speechx/codelab/nnet/CMakeLists.txt
old mode 100644
new mode 100755
diff --git a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
old mode 100644
new mode 100755
index 283466dc1..ab7b2cb58
--- a/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
+++ b/speechx/speechx/codelab/nnet/ds2_model_test_main.cc
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+
 #include "base/flags.h"
 #include "base/log.h"
 #include "paddle_inference_api.h"
@@ -63,8 +64,8 @@ void model_forward_test() {
     ;
     std::string model_graph = FLAGS_model_path;
     std::string model_params = FLAGS_param_path;
-    CHECK(model_graph != "");
-    CHECK(model_params != "");
+    CHECK_NE(model_graph, "");
+    CHECK_NE(model_params, "");
 
     cout << "model path: " << model_graph << endl;
     cout << "model param path : " << model_params << endl;
@@ -195,8 +196,11 @@ void model_forward_test() {
 }
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     model_forward_test();
     return 0;
diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt
old mode 100644
new mode 100755
index 1df935112..f0fd32ba1
--- a/speechx/speechx/decoder/CMakeLists.txt
+++ b/speechx/speechx/decoder/CMakeLists.txt
@@ -1,25 +1,55 @@
-project(decoder)
-
 include_directories(${CMAKE_CURRENT_SOURCE_DIR/ctc_decoders})
-add_library(decoder STATIC
-  ctc_beam_search_decoder.cc
+
+set(srcs)
+
+if (USING_DS2)
+list(APPEND srcs
   ctc_decoders/decoder_utils.cpp
   ctc_decoders/path_trie.cpp
   ctc_decoders/scorer.cpp
+  ctc_beam_search_decoder.cc
   ctc_tlg_decoder.cc
-  recognizer.cc
 )
-target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder)
+endif()
 
-set(BINS
-  ctc_prefix_beam_search_decoder_main
-  nnet_logprob_decoder_main
-  recognizer_main
-  tlg_decoder_main
-)
+if (USING_U2)
+  list(APPEND srcs
+    ctc_prefix_beam_search_decoder.cc
+  )
+endif()
+
+add_library(decoder STATIC ${srcs})
+target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder absl::strings)
+
+# test
+if (USING_DS2)
+  set(BINS
+    ctc_beam_search_decoder_main
+    nnet_logprob_decoder_main
+    
ctc_tlg_decoder_main + ) + + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() + + +if (USING_U2) + set(TEST_BINS + ctc_prefix_beam_search_decoder_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() -foreach(bin_name IN LISTS BINS) - add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) - target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) - target_link_libraries(${bin_name} PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) -endforeach() diff --git a/speechx/speechx/decoder/common.h b/speechx/speechx/decoder/common.h old mode 100644 new mode 100755 index 52deffac9..0ae732771 --- a/speechx/speechx/decoder/common.h +++ b/speechx/speechx/decoder/common.h @@ -1,3 +1,4 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,10 +13,36 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "base/basic_types.h" +#pragma once + +#include "base/common.h" struct DecoderResult { BaseFloat acoustic_score; std::vector words_idx; - std::vector> time_stamp; + std::vector> time_stamp; +}; + + +namespace ppspeech { + +struct WordPiece { + std::string word; + int start = -1; + int end = -1; + + WordPiece(std::string word, int start, int end) + : word(std::move(word)), start(start), end(end) {} }; + +struct DecodeResult { + float score = -kBaseFloatMax; + std::string sentence; + std::vector word_pieces; + + static bool CompareFunc(const DecodeResult& a, const DecodeResult& b) { + return a.score > b.score; + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc old mode 100644 new mode 100755 index 5a12c0b50..6e3a0d136 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
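+// (High-level sketch of the flow in this file: hypotheses live in a PathTrie;
+// each frame SearchOneChar() extends every prefix with the blank and the
+// top-scoring characters, and the optional KenLM scorer (init_ext_scorer_)
+// rescores a prefix when a space/word boundary is emitted.)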
+ #include "decoder/ctc_beam_search_decoder.h" -#include "base/basic_types.h" +#include "base/common.h" #include "decoder/ctc_decoders/decoder_utils.h" #include "utils/file_utils.h" @@ -24,12 +25,7 @@ using std::vector; using FSTMATCH = fst::SortedMatcher; CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) - : opts_(opts), - init_ext_scorer_(nullptr), - blank_id_(-1), - space_id_(-1), - num_frame_decoded_(0), - root_(nullptr) { + : opts_(opts), init_ext_scorer_(nullptr), space_id_(-1), root_(nullptr) { LOG(INFO) << "dict path: " << opts_.dict_file; if (!ReadFileToVector(opts_.dict_file, &vocabulary_)) { LOG(INFO) << "load the dict failed"; @@ -43,12 +39,12 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts) opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_); } - blank_id_ = 0; - auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); + CHECK_EQ(opts_.blank, 0); + auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " "); space_id_ = it - vocabulary_.begin(); // if no space in vocabulary - if ((size_t)space_id_ >= vocabulary_.size()) { + if (static_cast(space_id_) >= vocabulary_.size()) { space_id_ = -2; } } @@ -84,8 +80,6 @@ void CTCBeamSearch::Decode( return; } -int32 CTCBeamSearch::NumFrameDecoded() { return num_frame_decoded_ + 1; } - // todo rename, refactor void CTCBeamSearch::AdvanceDecode( const std::shared_ptr& decodable) { @@ -110,17 +104,21 @@ void CTCBeamSearch::ResetPrefixes() { } int CTCBeamSearch::DecodeLikelihoods(const vector>& probs, - vector& nbest_words) { + const vector& nbest_words) { kaldi::Timer timer; - timer.Reset(); AdvanceDecoding(probs); LOG(INFO) << "ctc decoding elapsed time(s) " << static_cast(timer.Elapsed()) / 1000.0f; return 0; } +vector> CTCBeamSearch::GetNBestPath(int n) { + int beam_size = n == -1 ? opts_.beam_size : std::min(n, opts_.beam_size); + return get_beam_search_result(prefixes_, vocabulary_, beam_size); +} + vector> CTCBeamSearch::GetNBestPath() { - return get_beam_search_result(prefixes_, vocabulary_, opts_.beam_size); + return GetNBestPath(-1); } string CTCBeamSearch::GetBestPath() { @@ -167,7 +165,7 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { continue; } min_cutoff = prefixes_[num_prefixes_ - 1]->score + - std::log(prob[blank_id_]) - + std::log(prob[opts_.blank]) - std::max(0.0, init_ext_scorer_->beta); full_beam = (num_prefixes_ == beam_size); @@ -195,9 +193,9 @@ void CTCBeamSearch::AdvanceDecoding(const vector>& probs) { for (size_t i = beam_size; i < prefixes_.size(); ++i) { prefixes_[i]->remove(); } - } // if + } // end if num_frame_decoded_++; - } // for probs_seq + } // end for probs_seq } int32 CTCBeamSearch::SearchOneChar( @@ -215,7 +213,7 @@ int32 CTCBeamSearch::SearchOneChar( break; } - if (c == blank_id_) { + if (c == opts_.blank) { prefix->log_prob_b_cur = log_sum_exp(prefix->log_prob_b_cur, log_prob_c + prefix->score); continue; diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h old mode 100644 new mode 100755 index 9d0a5d142..f06d88e32 --- a/speechx/speechx/decoder/ctc_beam_search_decoder.h +++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h @@ -12,67 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "base/common.h" -#include "decoder/ctc_decoders/path_trie.h" -#include "decoder/ctc_decoders/scorer.h" -#include "kaldi/decoder/decodable-itf.h" -#include "util/parse-options.h" +// used by deepspeech2 #pragma once -namespace ppspeech { +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_decoders/path_trie.h" +#include "decoder/ctc_decoders/scorer.h" +#include "decoder/decoder_itf.h" -struct CTCBeamSearchOptions { - std::string dict_file; - std::string lm_path; - BaseFloat alpha; - BaseFloat beta; - BaseFloat cutoff_prob; - int beam_size; - int cutoff_top_n; - int num_proc_bsearch; - CTCBeamSearchOptions() - : dict_file("vocab.txt"), - lm_path(""), - alpha(1.9f), - beta(5.0), - beam_size(300), - cutoff_prob(0.99f), - cutoff_top_n(40), - num_proc_bsearch(10) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("dict", &dict_file, "dict file "); - opts->Register("lm-path", &lm_path, "language model file"); - opts->Register("alpha", &alpha, "alpha"); - opts->Register("beta", &beta, "beta"); - opts->Register( - "beam-size", &beam_size, "beam size for beam search method"); - opts->Register("cutoff-prob", &cutoff_prob, "cutoff probs"); - opts->Register("cutoff-top-n", &cutoff_top_n, "cutoff top n"); - opts->Register( - "num-proc-bsearch", &num_proc_bsearch, "num proc bsearch"); - } -}; +namespace ppspeech { -class CTCBeamSearch { +class CTCBeamSearch : public DecoderBase { public: explicit CTCBeamSearch(const CTCBeamSearchOptions& opts); ~CTCBeamSearch() {} + void InitDecoder(); + + void Reset(); + + void AdvanceDecode( + const std::shared_ptr& decodable); + void Decode(std::shared_ptr decodable); + std::string GetBestPath(); std::vector> GetNBestPath(); + std::vector> GetNBestPath(int n); std::string GetFinalBestPath(); - int NumFrameDecoded(); + + std::string GetPartialResult() { + CHECK(false) << "Not implement."; + return {}; + } + int DecodeLikelihoods(const std::vector>& probs, - std::vector& nbest_words); - void AdvanceDecode( - const std::shared_ptr& decodable); - void Reset(); + const std::vector& nbest_words); private: void ResetPrefixes(); + int32 SearchOneChar(const bool& full_beam, const std::pair& log_prob_idx, const BaseFloat& min_cutoff); @@ -83,12 +63,11 @@ class CTCBeamSearch { CTCBeamSearchOptions opts_; std::shared_ptr init_ext_scorer_; // todo separate later std::vector vocabulary_; // todo remove later - size_t blank_id_; int space_id_; std::shared_ptr root_; std::vector prefixes_; - int num_frame_decoded_; + DISALLOW_COPY_AND_ASSIGN(CTCBeamSearch); }; -} // namespace basr \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/decoder/tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc old mode 100644 new mode 100755 similarity index 78% rename from speechx/speechx/decoder/tlg_decoder_main.cc rename to speechx/speechx/decoder/ctc_beam_search_decoder_main.cc index b175ed135..ab0376b6b --- a/speechx/speechx/decoder/tlg_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_beam_search_decoder_main.cc @@ -12,29 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// todo refactor, repalce with gtest +// used by deepspeech2 #include "base/flags.h" #include "base/log.h" -#include "decoder/ctc_tlg_decoder.h" +#include "decoder/ctc_beam_search_decoder.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); -DEFINE_int32(max_active, 7500, "decoder graph"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); +DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); +DEFINE_string(lm_path, "", "language model"); DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_string( @@ -48,59 +45,59 @@ DEFINE_string(model_cache_names, "chunk_state_h_box,chunk_state_c_box", "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); +DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; -// test TLG decoder by feeding speech feature. +// test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - std::string model_graph = FLAGS_model_path; + std::string model_path = FLAGS_model_path; std::string model_params = FLAGS_param_path; - std::string word_symbol_table = FLAGS_word_symbol_table; - std::string graph_path = FLAGS_graph_path; - LOG(INFO) << "model path: " << model_graph; + std::string dict_file = FLAGS_dict_file; + std::string lm_path = FLAGS_lm_path; + LOG(INFO) << "model path: " << model_path; LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "word symbol path: " << word_symbol_table; - LOG(INFO) << "graph path: " << graph_path; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; int32 num_done = 0, num_err = 0; - ppspeech::TLGDecoderOptions opts; - opts.word_symbol_table = word_symbol_table; - opts.fst_path = graph_path; - opts.opts.max_active = FLAGS_max_active; - opts.opts.beam = 15.0; - opts.opts.lattice_beam = 7.5; - ppspeech::TLGDecoder decoder(opts); - - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + + 
ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr raw_data(new ppspeech::DataCache()); std::shared_ptr decodable( - new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); + new ppspeech::Decodable(nnet, raw_data)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; LOG(INFO) << "receptive field (frame): " << receptive_field_length; decoder.InitDecoder(); + kaldi::Timer timer; for (; !feature_reader.Done(); feature_reader.Next()) { string utt = feature_reader.Key(); @@ -132,6 +129,7 @@ int main(int argc, char* argv[]) { if (feature_chunk_size < receptive_field_length) break; int32 start = chunk_idx * chunk_stride; + for (int row_id = 0; row_id < chunk_size; ++row_id) { kaldi::SubVector tmp(feature, start); kaldi::SubVector f_chunk_tmp( @@ -161,10 +159,9 @@ int main(int argc, char* argv[]) { ++num_done; } - double elapsed = timer.Elapsed(); - KALDI_LOG << " cost:" << elapsed << " s"; - KALDI_LOG << "Done " << num_done << " utterances, " << num_err << " with errors."; + double elapsed = timer.Elapsed(); + KALDI_LOG << " cost:" << elapsed << " s"; return (num_done != 0 ? 0 : 1); } diff --git a/speechx/speechx/decoder/ctc_beam_search_opt.h b/speechx/speechx/decoder/ctc_beam_search_opt.h new file mode 100755 index 000000000..f4a81b3a6 --- /dev/null +++ b/speechx/speechx/decoder/ctc_beam_search_opt.h @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
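+
+// A minimal usage sketch, mirroring ctc_prefix_beam_search_decoder_main.cc:
+//
+//   ppspeech::CTCBeamSearchOptions opts;
+//   opts.blank = 0;              // CTC blank id
+//   opts.first_beam_size = 10;   // per-frame top-k token prune
+//   opts.second_beam_size = 10;  // surviving prefixes per frame
+//   ppspeech::CTCPrefixBeamSearch decoder(vocab_path, opts);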
+#pragma once + +#include "base/common.h" +#include "util/parse-options.h" + +namespace ppspeech { + + +struct CTCBeamSearchOptions { + // common + int blank; + + // ds2 + std::string dict_file; + std::string lm_path; + int beam_size; + BaseFloat alpha; + BaseFloat beta; + BaseFloat cutoff_prob; + int cutoff_top_n; + int num_proc_bsearch; + + // u2 + int first_beam_size; + int second_beam_size; + CTCBeamSearchOptions() + : blank(0), + dict_file("vocab.txt"), + lm_path(""), + beam_size(300), + alpha(1.9f), + beta(5.0), + cutoff_prob(0.99f), + cutoff_top_n(40), + num_proc_bsearch(10), + first_beam_size(10), + second_beam_size(10) {} + + void Register(kaldi::OptionsItf* opts) { + std::string module = "Ds2BeamSearchConfig: "; + opts->Register("dict", &dict_file, module + "vocab file path."); + opts->Register( + "lm-path", &lm_path, module + "ngram language model path."); + opts->Register("alpha", &alpha, module + "alpha"); + opts->Register("beta", &beta, module + "beta"); + opts->Register("beam-size", + &beam_size, + module + "beam size for beam search method"); + opts->Register("cutoff-prob", &cutoff_prob, module + "cutoff probs"); + opts->Register("cutoff-top-n", &cutoff_top_n, module + "cutoff top n"); + opts->Register( + "num-proc-bsearch", &num_proc_bsearch, module + "num proc bsearch"); + + opts->Register("blank", &blank, "blank id, default is 0."); + + module = "U2BeamSearchConfig: "; + opts->Register( + "first-beam-size", &first_beam_size, module + "first beam size."); + opts->Register("second-beam-size", + &second_beam_size, + module + "second beam size."); + } +}; + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc new file mode 100755 index 000000000..07e8e5608 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.cc @@ -0,0 +1,380 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang, Di Wu) +// 2022 Binbin Zhang (binbzha@qq.com) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
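+
+// High-level sketch of the search implemented below: every prefix keeps two
+// log-domain scores, b (prefix ends in blank) and nb (ends in non-blank).
+// Per frame: (1) keep only the top `first_beam_size` tokens of the frame
+// posterior (TopK); (2) extend every current prefix with each kept token,
+// merging duplicate prefixes with LogSumExp; (3) keep the best
+// `second_beam_size` prefixes (nth_element + sort) as cur_hyps_.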
+ + +#include "decoder/ctc_prefix_beam_search_decoder.h" + +#include "absl/strings/str_join.h" +#include "base/common.h" +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "utils/math.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif + +namespace ppspeech { + +CTCPrefixBeamSearch::CTCPrefixBeamSearch(const std::string& vocab_path, + const CTCBeamSearchOptions& opts) + : opts_(opts) { + unit_table_ = std::shared_ptr( + fst::SymbolTable::ReadText(vocab_path)); + CHECK(unit_table_ != nullptr); + + Reset(); +} + +void CTCPrefixBeamSearch::Reset() { + num_frame_decoded_ = 0; + + cur_hyps_.clear(); + + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + outputs_.clear(); + + // empty hyp with Score + std::vector empty; + PrefixScore prefix_score; + prefix_score.InitEmpty(); + cur_hyps_[empty] = prefix_score; + + outputs_.emplace_back(empty); + hypotheses_.emplace_back(empty); + likelihood_.emplace_back(prefix_score.TotalScore()); + times_.emplace_back(empty); +} + +void CTCPrefixBeamSearch::InitDecoder() { Reset(); } + + +void CTCPrefixBeamSearch::AdvanceDecode( + const std::shared_ptr& decodable) { + double search_cost = 0.0; + double feat_nnet_cost = 0.0; + while (1) { + // forward frame by frame + kaldi::Timer timer; + std::vector frame_prob; + bool flag = decodable->FrameLikelihood(num_frame_decoded_, &frame_prob); + feat_nnet_cost += timer.Elapsed(); + if (flag == false) { + VLOG(3) << "decoder advance decode exit." << frame_prob.size(); + break; + } + + timer.Reset(); + std::vector> likelihood; + likelihood.push_back(frame_prob); + AdvanceDecoding(likelihood); + search_cost += timer.Elapsed(); + + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + } + VLOG(1) << "AdvanceDecode feat + forward cost: " << feat_nnet_cost + << " sec."; + VLOG(1) << "AdvanceDecode search cost: " << search_cost << " sec."; +} + +static bool PrefixScoreCompare( + const std::pair, PrefixScore>& a, + const std::pair, PrefixScore>& b) { + // log domain + return a.second.TotalScore() > b.second.TotalScore(); +} + + +void CTCPrefixBeamSearch::AdvanceDecoding( + const std::vector>& logp) { +#ifdef USE_PROFILING + RecordEvent event("CtcPrefixBeamSearch::AdvanceDecoding", + TracerEventType::UserDefined, + 1); +#endif + + if (logp.size() == 0) return; + + int first_beam_size = + std::min(static_cast(logp[0].size()), opts_.first_beam_size); + + for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) { + const std::vector& logp_t = logp[t]; + std::unordered_map, PrefixScore, PrefixScoreHash> + next_hyps; + + // 1. first beam prune, only select topk candidates + std::vector topk_score; + std::vector topk_index; + TopK(logp_t, first_beam_size, &topk_score, &topk_index); + VLOG(2) << "topk: " << num_frame_decoded_ << " " + << *std::max_element(logp_t.begin(), logp_t.end()) << " " + << topk_score[0]; + for (int i = 0; i < topk_score.size(); i++) { + VLOG(2) << "topk: " << num_frame_decoded_ << " " << topk_score[i]; + } + + // 2. 
token passing + for (int i = 0; i < topk_index.size(); ++i) { + int id = topk_index[i]; + auto prob = topk_score[i]; + + for (const auto& it : cur_hyps_) { + const std::vector& prefix = it.first; + const PrefixScore& prefix_score = it.second; + + // If prefix doesn't exist in next_hyps, next_hyps[prefix] will + // insert + // PrefixScore(-inf, -inf) by default, since the default + // constructor + // of PrefixScore will set fields b(blank ending Score) and + // nb(none blank ending Score) to -inf, respectively. + + if (id == opts_.blank) { + // case 0: *a + => *a, *a + => *a, + // prefix not + // change + PrefixScore& next_score = next_hyps[prefix]; + next_score.b = + LogSumExp(next_score.b, prefix_score.Score() + prob); + + // timestamp, blank is slince, not effact timestamp + next_score.v_b = prefix_score.ViterbiScore() + prob; + next_score.times_b = prefix_score.Times(); + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score.has_context) { + next_score.CopyContext(prefix_score); + next_score.has_context = true; + } + + } else if (!prefix.empty() && id == prefix.back()) { + // case 1: *a + a => *a, prefix not changed + PrefixScore& next_score1 = next_hyps[prefix]; + next_score1.nb = + LogSumExp(next_score1.nb, prefix_score.nb + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score1.v_nb < prefix_score.v_nb + prob) { + // compute viterbi Score + next_score1.v_nb = prefix_score.v_nb + prob; + if (next_score1.cur_token_prob < prob) { + // store max token prob + next_score1.cur_token_prob = prob; + // update this timestamp as token appeared here. + next_score1.times_nb = prefix_score.times_nb; + assert(next_score1.times_nb.size() > 0); + next_score1.times_nb.back() = num_frame_decoded_; + } + } + + // Prefix not changed, copy the context from pefix + if (context_graph_ && !next_score1.has_context) { + next_score1.CopyContext(prefix_score); + next_score1.has_context = true; + } + + // case 2: *a + a => *aa, prefix changed. + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score2 = next_hyps[new_prefix]; + next_score2.nb = + LogSumExp(next_score2.nb, prefix_score.b + prob); + + // timestamp, non-blank symbol effact timestamp + if (next_score2.v_nb < prefix_score.v_b + prob) { + // compute viterbi Score + next_score2.v_nb = prefix_score.v_b + prob; + // new token added + next_score2.cur_token_prob = prob; + next_score2.times_nb = prefix_score.times_b; + next_score2.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. + if (context_graph_ && !next_score2.has_context) { + next_score2.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score2.has_context = true; + } + + } else { + // id != prefix.back() + // case 3: *a + b => *ab, *a +b => *ab + std::vector new_prefix(prefix); + new_prefix.emplace_back(id); + PrefixScore& next_score = next_hyps[new_prefix]; + next_score.nb = + LogSumExp(next_score.nb, prefix_score.Score() + prob); + + // timetamp, non-blank symbol effact timestamp + if (next_score.v_nb < prefix_score.ViterbiScore() + prob) { + next_score.v_nb = prefix_score.ViterbiScore() + prob; + + next_score.cur_token_prob = prob; + next_score.times_nb = prefix_score.Times(); + next_score.times_nb.emplace_back(num_frame_decoded_); + } + + // Prefix changed, calculate the context Score. 
+ if (context_graph_ && !next_score.has_context) { + next_score.UpdateContext( + context_graph_, prefix_score, id, prefix.size()); + next_score.has_context = true; + } + } + } // end for (const auto& it : cur_hyps_) + } // end for (int i = 0; i < topk_index.size(); ++i) + + // 3. second beam prune, only keep top n best paths + std::vector, PrefixScore>> arr( + next_hyps.begin(), next_hyps.end()); + int second_beam_size = + std::min(static_cast(arr.size()), opts_.second_beam_size); + std::nth_element(arr.begin(), + arr.begin() + second_beam_size, + arr.end(), + PrefixScoreCompare); + arr.resize(second_beam_size); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // 4. update cur_hyps by next_hyps, and get new result + UpdateHypotheses(arr); + } // end for (int t = 0; t < logp.size(); ++t, ++num_frame_decoded_) +} + + +void CTCPrefixBeamSearch::UpdateHypotheses( + const std::vector, PrefixScore>>& hyps) { + cur_hyps_.clear(); + + outputs_.clear(); + hypotheses_.clear(); + likelihood_.clear(); + viterbi_likelihood_.clear(); + times_.clear(); + + for (auto& item : hyps) { + cur_hyps_[item.first] = item.second; + + UpdateOutputs(item); + hypotheses_.emplace_back(std::move(item.first)); + likelihood_.emplace_back(item.second.TotalScore()); + viterbi_likelihood_.emplace_back(item.second.ViterbiScore()); + times_.emplace_back(item.second.Times()); + } +} + +void CTCPrefixBeamSearch::UpdateOutputs( + const std::pair, PrefixScore>& prefix) { + const std::vector& input = prefix.first; + const std::vector& start_boundaries = prefix.second.start_boundaries; + const std::vector& end_boundaries = prefix.second.end_boundaries; + + // add tag + std::vector output; + int s = 0; + int e = 0; + for (int i = 0; i < input.size(); ++i) { + output.emplace_back(input[i]); + } + + outputs_.emplace_back(output); +} + +void CTCPrefixBeamSearch::FinalizeSearch() { + UpdateFinalContext(); + + VLOG(2) << "num_frame_decoded_: " << num_frame_decoded_; + int cnt = 0; + for (int i = 0; i < hypotheses_.size(); i++) { + VLOG(2) << "hyp " << cnt << " len: " << hypotheses_[i].size() + << " ctc score: " << likelihood_[i]; + for (int j = 0; j < hypotheses_[i].size(); j++) { + VLOG(2) << hypotheses_[i][j]; + } + } +} + +void CTCPrefixBeamSearch::UpdateFinalContext() { + if (context_graph_ == nullptr) return; + + CHECK(hypotheses_.size() == cur_hyps_.size()); + CHECK(hypotheses_.size() == likelihood_.size()); + + // We should backoff the context Score/state when the context is + // not fully matched at the last time. + for (const auto& prefix : hypotheses_) { + PrefixScore& prefix_score = cur_hyps_[prefix]; + if (prefix_score.context_score != 0) { + prefix_score.UpdateContext( + context_graph_, prefix_score, 0, prefix.size()); + } + } + std::vector, PrefixScore>> arr(cur_hyps_.begin(), + cur_hyps_.end()); + std::sort(arr.begin(), arr.end(), PrefixScoreCompare); + + // Update cur_hyps_ and get new result + UpdateHypotheses(arr); +} + +std::string CTCPrefixBeamSearch::GetBestPath(int index) { + int n_hyps = Outputs().size(); + CHECK_GT(n_hyps, 0); + CHECK_LT(index, n_hyps); + std::vector one = Outputs()[index]; + std::string sentence; + for (int i = 0; i < one.size(); i++) { + sentence += unit_table_->Find(one[i]); + } + return sentence; +} + +std::string CTCPrefixBeamSearch::GetBestPath() { return GetBestPath(0); } + +std::vector> CTCPrefixBeamSearch::GetNBestPath( + int n) { + int hyps_size = hypotheses_.size(); + CHECK_GT(hyps_size, 0); + + int min_n = n == -1 ? 
hypotheses_.size() : std::min(n, hyps_size); + + std::vector> n_best; + n_best.reserve(min_n); + + for (int i = 0; i < min_n; i++) { + n_best.emplace_back(Likelihood()[i], GetBestPath(i)); + } + return n_best; +} + +std::vector> +CTCPrefixBeamSearch::GetNBestPath() { + return GetNBestPath(-1); +} + +std::string CTCPrefixBeamSearch::GetFinalBestPath() { return GetBestPath(); } + +std::string CTCPrefixBeamSearch::GetPartialResult() { return GetBestPath(); } + + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h new file mode 100755 index 000000000..5013246a4 --- /dev/null +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder.h @@ -0,0 +1,101 @@ +// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.cc + +#pragma once + +#include "decoder/ctc_beam_search_opt.h" +#include "decoder/ctc_prefix_beam_search_score.h" +#include "decoder/decoder_itf.h" +#include "fst/symbol-table.h" + +namespace ppspeech { +class ContextGraph; +class CTCPrefixBeamSearch : public DecoderBase { + public: + CTCPrefixBeamSearch(const std::string& vocab_path, + const CTCBeamSearchOptions& opts); + ~CTCPrefixBeamSearch() {} + + SearchType Type() const { return SearchType::kPrefixBeamSearch; } + + void InitDecoder() override; + + void Reset() override; + + void AdvanceDecode( + const std::shared_ptr& decodable) override; + + std::string GetFinalBestPath() override; + std::string GetPartialResult() override; + + void FinalizeSearch(); + + const std::shared_ptr VocabTable() const { + return unit_table_; + } + + const std::vector>& Inputs() const { return hypotheses_; } + const std::vector>& Outputs() const { return outputs_; } + const std::vector& Likelihood() const { return likelihood_; } + const std::vector& ViterbiLikelihood() const { + return viterbi_likelihood_; + } + const std::vector>& Times() const { return times_; } + + + protected: + std::string GetBestPath() override; + std::vector> GetNBestPath() override; + std::vector> GetNBestPath(int n) override; + + private: + std::string GetBestPath(int index); + + void AdvanceDecoding( + const std::vector>& logp); + + void UpdateOutputs(const std::pair, PrefixScore>& prefix); + void UpdateHypotheses( + const std::vector, PrefixScore>>& prefix); + void UpdateFinalContext(); + + + private: + CTCBeamSearchOptions opts_; + std::shared_ptr unit_table_{nullptr}; + + std::unordered_map, PrefixScore, PrefixScoreHash> + cur_hyps_; + + // n-best list and corresponding likelihood, in sorted order + std::vector> hypotheses_; + std::vector likelihood_; + + std::vector> times_; + std::vector viterbi_likelihood_; + + // Outputs contain the hypotheses_ and tags lik: and + std::vector> outputs_; + + std::shared_ptr context_graph_{nullptr}; + + 
DISALLOW_COPY_AND_ASSIGN(CTCPrefixBeamSearch); +}; + + +} // namespace ppspeech diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc old mode 100644 new mode 100755 index 7cfee06c9..c59b1f2e7 --- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -12,40 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -// todo refactor, repalce with gtest - -#include "base/flags.h" -#include "base/log.h" -#include "decoder/ctc_beam_search_decoder.h" +#include "absl/strings/str_split.h" +#include "base/common.h" +#include "decoder/ctc_prefix_beam_search_decoder.h" #include "frontend/audio/data_cache.h" +#include "fst/symbol-table.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/u2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); -DEFINE_string(lm_path, "", "language model"); +DEFINE_string(vocab_path, "", "vocab path"); + +DEFINE_string(model_path, "", "paddle nnet model"); + DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); + +DEFINE_int32(nnet_decoder_chunk, 16, "paddle nnet forward chunk"); using kaldi::BaseFloat; using kaldi::Matrix; @@ -53,117 +42,138 @@ using std::vector; // test ds2 online decoder by feeding speech feature int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; - CHECK(FLAGS_result_wspecifier != ""); - CHECK(FLAGS_feature_rspecifier != ""); + int32 num_done = 0, num_err = 0; + + CHECK_NE(FLAGS_result_wspecifier, ""); + CHECK_NE(FLAGS_feature_rspecifier, ""); + CHECK_NE(FLAGS_vocab_path, ""); + CHECK_NE(FLAGS_model_path, ""); + LOG(INFO) << "model path: " << FLAGS_model_path; + LOG(INFO) << "Reading vocab table " << FLAGS_vocab_path; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); - std::string model_path = FLAGS_model_path; - std::string model_params = FLAGS_param_path; - std::string dict_file = FLAGS_dict_file; - std::string lm_path = FLAGS_lm_path; - LOG(INFO) << "model path: " << model_path; - LOG(INFO) << "model param: " << model_params; - LOG(INFO) << "dict path: " << dict_file; - LOG(INFO) << "lm path: " << lm_path; - int32 num_done = 0, num_err = 0; + // nnet + 
ppspeech::ModelOptions model_opts;
+    model_opts.model_path = FLAGS_model_path;
+    std::shared_ptr<ppspeech::U2Nnet> nnet =
+        std::make_shared<ppspeech::U2Nnet>(model_opts);
+
+    // decodable
+    std::shared_ptr<ppspeech::DataCache> raw_data =
+        std::make_shared<ppspeech::DataCache>();
+    std::shared_ptr<ppspeech::Decodable> decodable =
+        std::make_shared<ppspeech::Decodable>(nnet, raw_data);
+
+    // decoder
     ppspeech::CTCBeamSearchOptions opts;
-    opts.dict_file = dict_file;
-    opts.lm_path = lm_path;
-    ppspeech::CTCBeamSearch decoder(opts);
+    opts.blank = 0;
+    opts.first_beam_size = 10;
+    opts.second_beam_size = 10;
+    ppspeech::CTCPrefixBeamSearch decoder(FLAGS_vocab_path, opts);
 
-    ppspeech::ModelOptions model_opts;
-    model_opts.model_path = model_path;
-    model_opts.param_path = model_params;
-    model_opts.cache_names = FLAGS_model_cache_names;
-    model_opts.cache_shape = FLAGS_model_cache_shapes;
-    model_opts.input_names = FLAGS_model_input_names;
-    model_opts.output_names = FLAGS_model_output_names;
-    std::shared_ptr<ppspeech::PaddleNnet> nnet(
-        new ppspeech::PaddleNnet(model_opts));
-    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
-    std::shared_ptr<ppspeech::Decodable> decodable(
-        new ppspeech::Decodable(nnet, raw_data));
 
     int32 chunk_size = FLAGS_receptive_field_length +
-                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
-    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate;
+    int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
     LOG(INFO) << "chunk stride (frame): " << chunk_stride;
     LOG(INFO) << "receptive field (frame): " << receptive_field_length;
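+    // Worked example (illustrative): with the flag defaults above --
+    // receptive_field_length = 7, subsampling_rate = 4, nnet_decoder_chunk = 16:
+    //   chunk_size   = 7 + (16 - 1) * 4 = 67 frames per forward pass
+    //   chunk_stride = 4 * 16           = 64 frames advanced per pass
+    // so consecutive chunks overlap by 3 frames, exactly the right context
+    // consumed by the two kernel-3 downsampling CNNs.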
+    decoder.InitDecoder();
     kaldi::Timer timer;
     for (; !feature_reader.Done(); feature_reader.Next()) {
         string utt = feature_reader.Key();
         kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
-        raw_data->SetDim(feature.NumCols());
-        LOG(INFO) << "process utt: " << utt;
-        LOG(INFO) << "rows: " << feature.NumRows();
-        LOG(INFO) << "cols: " << feature.NumCols();
-        int32 row_idx = 0;
-        int32 padding_len = 0;
+        int nframes = feature.NumRows();
+        int feat_dim = feature.NumCols();
+        raw_data->SetDim(feat_dim);
+        LOG(INFO) << "utt: " << utt;
+        LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim;
+
+        raw_data->SetDim(feat_dim);
+
         int32 ori_feature_len = feature.NumRows();
-        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
-            padding_len =
-                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
-            feature.Resize(feature.NumRows() + padding_len,
-                           feature.NumCols(),
-                           kaldi::kCopyData);
-        }
-        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
+        int32 num_chunks = feature.NumRows() / chunk_stride + 1;
+        LOG(INFO) << "num_chunks: " << num_chunks;
+
         for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
-            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
-                                                          feature.NumCols());
-            int32 feature_chunk_size = 0;
+            int32 this_chunk_size = 0;
             if (ori_feature_len > chunk_idx * chunk_stride) {
-                feature_chunk_size = std::min(
+                this_chunk_size = std::min(
                     ori_feature_len - chunk_idx * chunk_stride, chunk_size);
             }
-            if (feature_chunk_size < receptive_field_length) break;
+            if (this_chunk_size < receptive_field_length) {
+                LOG(WARNING)
+                    << "utt: " << utt << " skip last " << this_chunk_size
+                    << " frames, expect at least " << receptive_field_length;
+                break;
+            }
+
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(this_chunk_size *
+                                                          feat_dim);
             int32 start = chunk_idx * chunk_stride;
+            for (int row_id = 0; row_id < this_chunk_size; ++row_id) {
+                kaldi::SubVector<kaldi::BaseFloat> feat_row(feature, start);
+                kaldi::SubVector<kaldi::BaseFloat> feature_chunk_row(
+                    feature_chunk.Data() + row_id * feat_dim, feat_dim);
 
-            for (int row_id = 0; row_id < chunk_size; ++row_id) {
-                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
-                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
-                    feature_chunk.Data() + row_id * feature.NumCols(),
-                    feature.NumCols());
-                f_chunk_tmp.CopyFromVec(tmp);
+                feature_chunk_row.CopyFromVec(feat_row);
                 ++start;
             }
+
+            // feat to frontend pipeline cache
             raw_data->Accept(feature_chunk);
+
+            // send data finish signal
             if (chunk_idx == num_chunks - 1) {
                 raw_data->SetFinished();
             }
+
+            // forward nnet
             decoder.AdvanceDecode(decodable);
+
+            LOG(INFO) << "Partial result: " << decoder.GetPartialResult();
         }
 
-        std::string result;
-        result = decoder.GetFinalBestPath();
+        decoder.FinalizeSearch();
+
+        // get 1-best result
+        std::string result = decoder.GetFinalBestPath();
+
+        // after process one utt, then reset state.
         decodable->Reset();
         decoder.Reset();
+
         if (result.empty()) {
             // the TokenWriter can not write empty string.
             ++num_err;
-            KALDI_LOG << " the result of " << utt << " is empty";
+            LOG(INFO) << " the result of " << utt << " is empty";
             continue;
         }
-        KALDI_LOG << " the result of " << utt << " is " << result;
+
+        LOG(INFO) << " the result of " << utt << " is " << result;
         result_writer.Write(utt, result);
+
+        ++num_done;
     }
-    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
-              << " with errors.";
 
     double elapsed = timer.Elapsed();
-    KALDI_LOG << " cost:" << elapsed << " s";
+    LOG(INFO) << "Program cost: " << elapsed << " sec";
+
+    LOG(INFO) << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
 
     return (num_done != 0 ? 0 : 1);
 }
diff --git a/speechx/speechx/decoder/ctc_prefix_beam_search_score.h b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h
new file mode 100755
index 000000000..76b09e9b8
--- /dev/null
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_score.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// modified from
+// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/ctc_prefix_beam_search.h
+
+#pragma once
+
+#include "base/common.h"
+#include "utils/math.h"
+
+namespace ppspeech {
+
+class ContextGraph;
+
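+// Sketch of the standard CTC prefix beam search update rules that the fields
+// below support (log domain; logp_blank / logp_u are the current frame's
+// scores; illustrative only -- the real update lives in the decoder .cc):
+//   blank keeps the prefix and ends it in blank:
+//     next[prefix].b  = LogSumExp(next[prefix].b, prefix.Score() + logp_blank);
+//   repeating the last unit u extends only the non-blank path (CTC collapse):
+//     next[prefix].nb = LogSumExp(next[prefix].nb, prefix.nb + logp_u);
+//   u after a blank, or any new unit, grows the prefix:
+//     next[prefix+u].nb = LogSumExp(next[prefix+u].nb, prefix.b + logp_u);        // u == last
+//     next[prefix+u].nb = LogSumExp(next[prefix+u].nb, prefix.Score() + logp_u);  // u != last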
+struct PrefixScore {
+    // decoding, unit in log scale
+    float b = -kBaseFloatMax;   // blank ending score
+    float nb = -kBaseFloatMax;  // non-blank ending score
+
+    // decoding score, sum
+    float Score() const { return LogSumExp(b, nb); }
+
+    // timestamp, unit in log scale
+    float v_b = -kBaseFloatMax;             // viterbi blank ending score
+    float v_nb = -kBaseFloatMax;            // viterbi non-blank ending score
+    float cur_token_prob = -kBaseFloatMax;  // prob of current token
+    std::vector<int> times_b;   // times of viterbi blank path
+    std::vector<int> times_nb;  // times of viterbi non-blank path
+
+
+    // timestamp score, max
+    float ViterbiScore() const { return std::max(v_b, v_nb); }
+
+    // get timestamp
+    const std::vector<int>& Times() const {
+        return v_b > v_nb ? times_b : times_nb;
+    }
+
+    // context state
+    bool has_context = false;
+    int context_state = 0;
+    float context_score = 0;
+    std::vector<int> start_boundaries;
+    std::vector<int> end_boundaries;
+
+
+    // decoding score with context bias
+    float TotalScore() const { return Score() + context_score; }
+
+    void CopyContext(const PrefixScore& prefix_score) {
+        context_state = prefix_score.context_state;
+        context_score = prefix_score.context_score;
+        start_boundaries = prefix_score.start_boundaries;
+        end_boundaries = prefix_score.end_boundaries;
+    }
+
+    void UpdateContext(const std::shared_ptr<ContextGraph>& constext_graph,
+                       const PrefixScore& prefix_score,
+                       int word_id,
+                       int prefix_len) {
+        CHECK(false);
+    }
+
+    void InitEmpty() {
+        b = 0.0f;             // log(1)
+        nb = -kBaseFloatMax;  // log(0)
+        v_b = 0.0f;           // log(1)
+        v_nb = 0.0f;          // log(1)
+    }
+};
+
+struct PrefixScoreHash {
+    // https://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector
+    std::size_t operator()(const std::vector<int>& prefix) const {
+        std::size_t seed = prefix.size();
+        for (auto& i : prefix) {
+            seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+        }
+        return seed;
+    }
+};
+
+using PrefixWithScoreType = std::pair<std::vector<int>, PrefixScoreHash>;
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.cc b/speechx/speechx/decoder/ctc_tlg_decoder.cc
old mode 100644
new mode 100755
index 712d27dd4..2c2b6d3c9
--- a/speechx/speechx/decoder/ctc_tlg_decoder.cc
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc
@@ -18,37 +18,38 @@ namespace ppspeech {
 
 TLGDecoder::TLGDecoder(TLGDecoderOptions opts) {
     fst_.reset(fst::Fst<fst::StdArc>::Read(opts.fst_path));
     CHECK(fst_ != nullptr);
+
     word_symbol_table_.reset(
         fst::SymbolTable::ReadText(opts.word_symbol_table));
+
     decoder_.reset(new kaldi::LatticeFasterOnlineDecoder(*fst_, opts.opts));
-    decoder_->InitDecoding();
-    frame_decoded_size_ = 0;
+
+    Reset();
 }
 
-void TLGDecoder::InitDecoder() {
+void TLGDecoder::Reset() {
     decoder_->InitDecoding();
-    frame_decoded_size_ = 0;
+    num_frame_decoded_ = 0;
+    return;
 }
 
+void TLGDecoder::InitDecoder() { Reset(); }
+
 void TLGDecoder::AdvanceDecode(
     const std::shared_ptr<kaldi::DecodableInterface>& decodable) {
-    while (!decodable->IsLastFrame(frame_decoded_size_)) {
+    while (!decodable->IsLastFrame(num_frame_decoded_)) {
         AdvanceDecoding(decodable.get());
     }
 }
 
 void TLGDecoder::AdvanceDecoding(kaldi::DecodableInterface* decodable) {
     decoder_->AdvanceDecoding(decodable, 1);
-    frame_decoded_size_++;
+    num_frame_decoded_++;
 }
 
-void TLGDecoder::Reset() {
-    InitDecoder();
-    return;
-}
 
 std::string TLGDecoder::GetPartialResult() {
-    if (frame_decoded_size_ == 0) {
+    if (num_frame_decoded_ == 0) {
         // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
         // BestPathEnd if no frames were decoded.")
         return std::string("");
@@ -68,7 +69,7 @@ std::string TLGDecoder::GetPartialResult() {
 }
 
 std::string TLGDecoder::GetFinalBestPath() {
-    if (frame_decoded_size_ == 0) {
+    if (num_frame_decoded_ == 0) {
         // Assertion failed: (this->NumFramesDecoded() > 0 && "You cannot call
         // BestPathEnd if no frames were decoded.")
         return std::string("");
@@ -88,4 +89,5 @@ std::string TLGDecoder::GetFinalBestPath() {
     }
     return words;
 }
-}
+
+}  // namespace ppspeech
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder.h b/speechx/speechx/decoder/ctc_tlg_decoder.h
old mode 100644
new mode 100755
index 1ac46ac64..8be69dadd
--- a/speechx/speechx/decoder/ctc_tlg_decoder.h
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.h
@@ -14,37 +14,78 @@
 
 #pragma once
 
-#include "base/basic_types.h"
-#include "kaldi/decoder/decodable-itf.h"
+#include "base/common.h"
+#include "decoder/decoder_itf.h"
 #include "kaldi/decoder/lattice-faster-online-decoder.h"
 #include "util/parse-options.h"
 
+
+DECLARE_string(graph_path);
+DECLARE_string(word_symbol_table);
+DECLARE_int32(max_active);
+DECLARE_double(beam);
+DECLARE_double(lattice_beam);
+
 namespace ppspeech {
 
 struct TLGDecoderOptions {
-    kaldi::LatticeFasterDecoderConfig opts;
+    kaldi::LatticeFasterDecoderConfig opts{};
     // todo remove later, add into decode resource
     std::string word_symbol_table;
     std::string fst_path;
 
-    TLGDecoderOptions() : word_symbol_table(""), fst_path("") {}
+    static TLGDecoderOptions InitFromFlags() {
+        TLGDecoderOptions decoder_opts;
+        decoder_opts.word_symbol_table = FLAGS_word_symbol_table;
+        decoder_opts.fst_path = FLAGS_graph_path;
+        LOG(INFO) << "fst path: " << decoder_opts.fst_path;
+        LOG(INFO) << "fst symbol table: " << decoder_opts.word_symbol_table;
+
+        decoder_opts.opts.max_active = FLAGS_max_active;
+        decoder_opts.opts.beam = FLAGS_beam;
+        decoder_opts.opts.lattice_beam = FLAGS_lattice_beam;
+        LOG(INFO) << "LatticeFasterDecoder max active: "
+                  << decoder_opts.opts.max_active;
+        LOG(INFO) << "LatticeFasterDecoder beam: " << decoder_opts.opts.beam;
+        LOG(INFO) << "LatticeFasterDecoder lattice_beam: "
+                  << decoder_opts.opts.lattice_beam;
+
+        return decoder_opts;
+    }
 };
 
-class TLGDecoder {
+class TLGDecoder : public DecoderBase {
   public:
     explicit TLGDecoder(TLGDecoderOptions opts);
+    ~TLGDecoder() = default;
+
     void InitDecoder();
-    void Decode();
-    std::string GetBestPath();
-    std::vector<std::pair<double, std::string>> GetNBestPath();
-    std::string GetFinalBestPath();
-    std::string GetPartialResult();
-    int NumFrameDecoded();
-    int DecodeLikelihoods(const std::vector<std::vector<kaldi::BaseFloat>>& probs,
-                          std::vector<std::string>& nbest_words);
+    void Reset();
+
     void AdvanceDecode(
         const std::shared_ptr<kaldi::DecodableInterface>& decodable);
-    void Reset();
+
+    void Decode();
+
+    std::string GetFinalBestPath() override;
+    std::string GetPartialResult() override;
+
+    int DecodeLikelihoods(const std::vector<std::vector<kaldi::BaseFloat>>& probs,
+                          const std::vector<std::string>& nbest_words);
+
+  protected:
+    std::string GetBestPath() override {
+        CHECK(false);
+        return {};
+    }
+    std::vector<std::pair<double, std::string>> GetNBestPath() override {
+        CHECK(false);
+        return {};
+    }
+    std::vector<std::pair<double, std::string>> GetNBestPath(int n) override {
+        CHECK(false);
+        return {};
+    }
 
   private:
     void AdvanceDecoding(kaldi::DecodableInterface* decodable);
@@ -52,8 +93,6 @@ class TLGDecoder {
     std::shared_ptr<kaldi::LatticeFasterOnlineDecoder> decoder_;
     std::shared_ptr<fst::Fst<fst::StdArc>> fst_;
    std::shared_ptr<fst::SymbolTable> word_symbol_table_;
-    // the frame size which have decoded starts from 0.
-    int32 frame_decoded_size_;
 };
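The flag-driven options above make the decoder straightforward to wire into a streaming loop. A minimal sketch, assuming flags are already parsed and a `decodable` is built as in the mains below; `GetPartialResult()` is safe to poll mid-utterance because it returns an empty string until at least one frame has been decoded:

    ppspeech::TLGDecoder decoder(ppspeech::TLGDecoderOptions::InitFromFlags());
    decoder.InitDecoder();
    while (more_feature_chunks) {             // hypothetical feed condition
        decoder.AdvanceDecode(decodable);     // consumes all ready frames
        LOG(INFO) << decoder.GetPartialResult();
    }
    std::string text = decoder.GetFinalBestPath();
    decoder.Reset();                          // ready for the next utterance

Note the design choice visible in the header: the `GetBestPath()`/`GetNBestPath()` overrides deliberately `CHECK(false)`, since the WFST decoder only surfaces word sequences through the lattice best path.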
diff --git a/speechx/speechx/decoder/ctc_tlg_decoder_main.cc b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc
new file mode 100755
index 000000000..e9bd8a3f4
--- /dev/null
+++ b/speechx/speechx/decoder/ctc_tlg_decoder_main.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// todo refactor, replace with gtest
+
+#include "base/common.h"
+#include "decoder/ctc_tlg_decoder.h"
+#include "decoder/param.h"
+#include "frontend/audio/data_cache.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+#include "nnet/ds2_nnet.h"
+
+
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(result_wspecifier, "", "test result wspecifier");
+
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+// test TLG decoder by feeding speech feature.
+int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
+
+    kaldi::SequentialBaseFloatMatrixReader feature_reader(
+        FLAGS_feature_rspecifier);
+    kaldi::TokenWriter result_writer(FLAGS_result_wspecifier);
+
+    int32 num_done = 0, num_err = 0;
+
+    ppspeech::TLGDecoderOptions opts =
+        ppspeech::TLGDecoderOptions::InitFromFlags();
+    opts.opts.beam = 15.0;
+    opts.opts.lattice_beam = 7.5;
+    ppspeech::TLGDecoder decoder(opts);
+
+    ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags();
+
+    std::shared_ptr<ppspeech::PaddleNnet> nnet(
+        new ppspeech::PaddleNnet(model_opts));
+    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
+
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate;
+    int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk;
+    int32 receptive_field_length = FLAGS_receptive_field_length;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
+    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
+    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
+
+    decoder.InitDecoder();
+    kaldi::Timer timer;
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+        string utt = feature_reader.Key();
+        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+        raw_data->SetDim(feature.NumCols());
+        LOG(INFO) << "process utt: " << utt;
+        LOG(INFO) << "rows: " << feature.NumRows();
+        LOG(INFO) << "cols: " << feature.NumCols();
+
+        int32 row_idx = 0;
+        int32 padding_len = 0;
+        int32 ori_feature_len = feature.NumRows();
+        if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
+            padding_len =
+                chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
+            feature.Resize(feature.NumRows() + padding_len,
+                           feature.NumCols(),
+                           kaldi::kCopyData);
+        }
+        int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
+        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(chunk_size *
+                                                          feature.NumCols());
+            int32 feature_chunk_size = 0;
+            if (ori_feature_len > chunk_idx * chunk_stride) {
+                feature_chunk_size = std::min(
+                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
+            }
+            if (feature_chunk_size < receptive_field_length) break;
+
+            int32 start = chunk_idx * chunk_stride;
+            for (int row_id = 0; row_id < chunk_size; ++row_id) {
+                kaldi::SubVector<kaldi::BaseFloat> tmp(feature, start);
+                kaldi::SubVector<kaldi::BaseFloat> f_chunk_tmp(
+                    feature_chunk.Data() + row_id * feature.NumCols(),
+                    feature.NumCols());
+                f_chunk_tmp.CopyFromVec(tmp);
+                ++start;
+            }
+            raw_data->Accept(feature_chunk);
+            if (chunk_idx == num_chunks - 1) {
+                raw_data->SetFinished();
+            }
+            decoder.AdvanceDecode(decodable);
+        }
+        std::string result;
+        result = decoder.GetFinalBestPath();
+        decodable->Reset();
+        decoder.Reset();
+        if (result.empty()) {
+            // the TokenWriter can not write empty string.
+            ++num_err;
+            KALDI_LOG << " the result of " << utt << " is empty";
+            continue;
+        }
+        KALDI_LOG << " the result of " << utt << " is " << result;
+        result_writer.Write(utt, result);
+        ++num_done;
+    }
+
+    double elapsed = timer.Elapsed();
+    KALDI_LOG << " cost:" << elapsed << " s";
+
+    KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+}
diff --git a/speechx/speechx/decoder/decoder_itf.h b/speechx/speechx/decoder/decoder_itf.h
new file mode 100755
index 000000000..2289b3173
--- /dev/null
+++ b/speechx/speechx/decoder/decoder_itf.h
@@ -0,0 +1,66 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "kaldi/decoder/decodable-itf.h"
+
+namespace ppspeech {
+
+enum SearchType {
+    kPrefixBeamSearch = 0,
+    kWfstBeamSearch = 1,
+};
+class DecoderInterface {
+  public:
+    virtual ~DecoderInterface() {}
+
+    virtual void InitDecoder() = 0;
+
+    virtual void Reset() = 0;
+
+    // call AdvanceDecoding
+    virtual void AdvanceDecode(
+        const std::shared_ptr<kaldi::DecodableInterface>& decodable) = 0;
+
+    // call GetBestPath
+    virtual std::string GetFinalBestPath() = 0;
+
+    virtual std::string GetPartialResult() = 0;
+
+  protected:
+    // virtual void AdvanceDecoding(kaldi::DecodableInterface* decodable) = 0;
+
+    // virtual void Decode() = 0;
+
+    virtual std::string GetBestPath() = 0;
+
+    virtual std::vector<std::pair<double, std::string>> GetNBestPath() = 0;
+
+    virtual std::vector<std::pair<double, std::string>> GetNBestPath(int n) = 0;
+};
+
+class DecoderBase : public DecoderInterface {
+  protected:
+    // start from one
+    int NumFrameDecoded() { return num_frame_decoded_ + 1; }
+
+  protected:
+    // current decoding frame number, abs_time_step_
+    int32 num_frame_decoded_;
+};
+
+}  // namespace ppspeech
\ No newline at end of file
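Since both concrete decoders derive from `DecoderBase`, a recognizer can pick the search type at runtime behind this common interface. A minimal sketch under that assumption (the `use_wfst` switch and surrounding wiring are hypothetical; the flags come from decoder/param.h):

    std::unique_ptr<ppspeech::DecoderInterface> MakeDecoder(bool use_wfst) {
        if (use_wfst) {
            return std::unique_ptr<ppspeech::DecoderInterface>(
                new ppspeech::TLGDecoder(
                    ppspeech::TLGDecoderOptions::InitFromFlags()));
        }
        ppspeech::CTCBeamSearchOptions opts;
        opts.blank = FLAGS_blank;  // FLAGS_blank / FLAGS_vocab_path from param.h
        return std::unique_ptr<ppspeech::DecoderInterface>(
            new ppspeech::CTCPrefixBeamSearch(FLAGS_vocab_path, opts));
    }

A caller then drives either implementation identically: `InitDecoder()`, `AdvanceDecode(decodable)` while feeding data, then `GetFinalBestPath()` and `Reset()`.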
diff --git a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
old mode 100644
new mode 100755
index 0e249cc6b..e0acbe77b
--- a/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
+++ b/speechx/speechx/decoder/nnet_logprob_decoder_main.cc
@@ -30,8 +30,11 @@ using std::vector;
 
 // test decoder by feeding nnet posterior probability
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
         FLAGS_nnet_prob_respecifier);
diff --git a/speechx/speechx/decoder/param.h b/speechx/speechx/decoder/param.h
old mode 100644
new mode 100755
index d6ee27058..ebdd71197
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@@ -17,23 +17,29 @@
 #include "base/common.h"
 #include "decoder/ctc_beam_search_decoder.h"
 #include "decoder/ctc_tlg_decoder.h"
-#include "frontend/audio/feature_pipeline.h"
 
 // feature
 DEFINE_bool(use_fbank, false, "False for fbank; or linear feature");
+DEFINE_bool(fill_zero,
+            false,
+            "fill zero at last chunk, when chunk < chunk_size");
 // DEFINE_bool(to_float32, true, "audio convert to pcm32. 
True for linear // feature, or fbank"); DEFINE_int32(num_bins, 161, "num bins of mel"); DEFINE_string(cmvn_file, "", "read cmvn"); + // feature sliding window DEFINE_int32(receptive_field_length, 7, "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, +DEFINE_int32(subsampling_rate, 4, "two CNN(kernel=3) module downsampling rate."); DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); + + // nnet +DEFINE_string(vocab_path, "", "nnet vocab path."); DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); DEFINE_string( @@ -48,71 +54,30 @@ DEFINE_string(model_cache_names, "model cache names"); DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); + // decoder -DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); -DEFINE_string(graph_path, "TLG", "decoder graph"); DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); + +DEFINE_string(graph_path, "TLG", "decoder graph"); +DEFINE_string(word_symbol_table, "words.txt", "word symbol table"); DEFINE_int32(max_active, 7500, "max active"); DEFINE_double(beam, 15.0, "decoder beam"); DEFINE_double(lattice_beam, 7.5, "decoder beam"); -namespace ppspeech { -// todo refactor later -FeaturePipelineOptions InitFeaturePipelineOptions() { - FeaturePipelineOptions opts; - opts.cmvn_file = FLAGS_cmvn_file; - kaldi::FrameExtractionOptions frame_opts; - frame_opts.dither = 0.0; - frame_opts.frame_shift_ms = 10; - opts.use_fbank = FLAGS_use_fbank; - if (opts.use_fbank) { - opts.to_float32 = false; - frame_opts.window_type = "povey"; - frame_opts.frame_length_ms = 25; - opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; - opts.fbank_opts.frame_opts = frame_opts; - } else { - opts.to_float32 = true; - frame_opts.remove_dc_offset = false; - frame_opts.frame_length_ms = 20; - frame_opts.window_type = "hanning"; - frame_opts.preemph_coeff = 0.0; - opts.linear_spectrogram_opts.frame_opts = frame_opts; - } - opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate; - opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length; - opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; - - return opts; -} - -ModelOptions InitModelOptions() { - ModelOptions model_opts; - model_opts.model_path = FLAGS_model_path; - model_opts.param_path = FLAGS_param_path; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; - return model_opts; -} - -TLGDecoderOptions InitDecoderOptions() { - TLGDecoderOptions decoder_opts; - decoder_opts.word_symbol_table = FLAGS_word_symbol_table; - decoder_opts.fst_path = FLAGS_graph_path; - decoder_opts.opts.max_active = FLAGS_max_active; - decoder_opts.opts.beam = FLAGS_beam; - decoder_opts.opts.lattice_beam = FLAGS_lattice_beam; - return decoder_opts; -} -RecognizerResource InitRecognizerResoure() { - RecognizerResource resource; - resource.acoustic_scale = FLAGS_acoustic_scale; - resource.feature_pipeline_opts = InitFeaturePipelineOptions(); - resource.model_opts = InitModelOptions(); - resource.tlg_opts = InitDecoderOptions(); - return resource; -} -} +// DecodeOptions flags +// DEFINE_int32(chunk_size, -1, "decoding chunk size"); +DEFINE_int32(num_left_chunks, -1, "left chunks in decoding"); +DEFINE_double(ctc_weight, + 0.5, + "ctc weight when combining ctc score and rescoring 
score"); +DEFINE_double(rescoring_weight, + 1.0, + "rescoring weight when combining ctc score and rescoring score"); +DEFINE_double(reverse_weight, + 0.3, + "used for bitransformer rescoring. it must be 0.0 if decoder is" + "conventional transformer decoder, and only reverse_weight > 0.0" + "dose the right to left decoder will be calculated and used"); +DEFINE_int32(nbest, 10, "nbest for ctc wfst or prefix search"); +DEFINE_int32(blank, 0, "blank id in vocab"); diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt old mode 100644 new mode 100755 index 8ae63256a..050d78bea --- a/speechx/speechx/frontend/audio/CMakeLists.txt +++ b/speechx/speechx/frontend/audio/CMakeLists.txt @@ -1,5 +1,3 @@ -project(frontend) - add_library(frontend STATIC cmvn.cc db_norm.cc diff --git a/speechx/speechx/frontend/audio/assembler.cc b/speechx/speechx/frontend/audio/assembler.cc old mode 100644 new mode 100755 index 37eeec80f..9d5fc4036 --- a/speechx/speechx/frontend/audio/assembler.cc +++ b/speechx/speechx/frontend/audio/assembler.cc @@ -16,16 +16,18 @@ namespace ppspeech { +using kaldi::BaseFloat; using kaldi::Vector; using kaldi::VectorBase; -using kaldi::BaseFloat; using std::unique_ptr; Assembler::Assembler(AssemblerOptions opts, unique_ptr base_extractor) { + fill_zero_ = opts.fill_zero; frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk; frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length; + cache_size_ = frame_chunk_size_ - frame_chunk_stride_; receptive_filed_length_ = opts.receptive_filed_length; base_extractor_ = std::move(base_extractor); dim_ = base_extractor_->Dim(); @@ -38,49 +40,85 @@ void Assembler::Accept(const kaldi::VectorBase& inputs) { // pop feature chunk bool Assembler::Read(kaldi::Vector* feats) { - feats->Resize(dim_ * frame_chunk_size_); + kaldi::Timer timer; bool result = Compute(feats); + VLOG(1) << "Assembler::Read cost: " << timer.Elapsed() << " sec."; return result; } -// read all data from base_feature_extractor_ into cache_ +// read frame by frame from base_feature_extractor_ into cache_ bool Assembler::Compute(Vector* feats) { - // compute and feed - bool result = false; + // compute and feed frame by frame while (feature_cache_.size() < frame_chunk_size_) { Vector feature; - result = base_extractor_->Read(&feature); + bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) { - if (IsFinished() == false) return false; - break; + VLOG(3) << "result: " << result + << " feature dim: " << feature.Dim(); + if (IsFinished() == false) { + VLOG(3) << "finished reading feature. cache size: " + << feature_cache_.size(); + return false; + } else { + VLOG(3) << "break"; + break; + } } + + CHECK(feature.Dim() == dim_); feature_cache_.push(feature); + + nframes_ += 1; + VLOG(3) << "nframes: " << nframes_; } if (feature_cache_.size() < receptive_filed_length_) { + VLOG(3) << "feature_cache less than receptive_filed_lenght. 
" + << feature_cache_.size() << ": " << receptive_filed_length_; return false; } - while (feature_cache_.size() < frame_chunk_size_) { - Vector feature(dim_, kaldi::kSetZero); - feature_cache_.push(feature); + if (fill_zero_) { + while (feature_cache_.size() < frame_chunk_size_) { + Vector feature(dim_, kaldi::kSetZero); + nframes_ += 1; + feature_cache_.push(feature); + } } + int32 this_chunk_size = + std::min(static_cast(feature_cache_.size()), frame_chunk_size_); + feats->Resize(dim_ * this_chunk_size); + VLOG(3) << "read " << this_chunk_size << " feat."; + int32 counter = 0; - int32 cache_size = frame_chunk_size_ - frame_chunk_stride_; - int32 elem_dim = base_extractor_->Dim(); - while (counter < frame_chunk_size_) { + while (counter < this_chunk_size) { Vector& val = feature_cache_.front(); - int32 start = counter * elem_dim; - feats->Range(start, elem_dim).CopyFromVec(val); - if (frame_chunk_size_ - counter <= cache_size) { + CHECK(val.Dim() == dim_) << val.Dim(); + + int32 start = counter * dim_; + feats->Range(start, dim_).CopyFromVec(val); + + if (this_chunk_size - counter <= cache_size_) { feature_cache_.push(val); } + + // val is reference, so we should pop here feature_cache_.pop(); + counter++; } + CHECK(feature_cache_.size() == cache_size_); - return result; + return true; +} + + +void Assembler::Reset() { + std::queue> empty; + std::swap(feature_cache_, empty); + nframes_ = 0; + base_extractor_->Reset(); } } // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/assembler.h b/speechx/speechx/frontend/audio/assembler.h old mode 100644 new mode 100755 index 258e61f2b..72e6f6352 --- a/speechx/speechx/frontend/audio/assembler.h +++ b/speechx/speechx/frontend/audio/assembler.h @@ -22,14 +22,11 @@ namespace ppspeech { struct AssemblerOptions { // refer:https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/paddlespeech/s2t/exps/deepspeech2/model.py // the nnet batch forward - int32 receptive_filed_length; - int32 subsampling_rate; - int32 nnet_decoder_chunk; - - AssemblerOptions() - : receptive_filed_length(1), - subsampling_rate(1), - nnet_decoder_chunk(1) {} + int32 receptive_filed_length{1}; + int32 subsampling_rate{1}; + int32 nnet_decoder_chunk{1}; + bool fill_zero{false}; // whether fill zero when last chunk is not equal to + // frame_chunk_size_ }; class Assembler : public FrontendInterface { @@ -39,29 +36,34 @@ class Assembler : public FrontendInterface { std::unique_ptr base_extractor = NULL); // Feed feats or waves - virtual void Accept(const kaldi::VectorBase& inputs); + void Accept(const kaldi::VectorBase& inputs) override; // feats size = num_frames * feat_dim - virtual bool Read(kaldi::Vector* feats); + bool Read(kaldi::Vector* feats) override; // feat dim - virtual size_t Dim() const { return dim_; } + size_t Dim() const override { return dim_; } - virtual void SetFinished() { base_extractor_->SetFinished(); } + void SetFinished() override { base_extractor_->SetFinished(); } - virtual bool IsFinished() const { return base_extractor_->IsFinished(); } + bool IsFinished() const override { return base_extractor_->IsFinished(); } - virtual void Reset() { base_extractor_->Reset(); } + void Reset() override; private: bool Compute(kaldi::Vector* feats); - int32 dim_; + bool fill_zero_{false}; + + int32 dim_; // feat dim int32 frame_chunk_size_; // window int32 frame_chunk_stride_; // stride + int32 cache_size_; // window - stride int32 receptive_filed_length_; std::queue> feature_cache_; std::unique_ptr base_extractor_; + + int32 nframes_; // num frame 
 
     DISALLOW_COPY_AND_ASSIGN(Assembler);
 };
diff --git a/speechx/speechx/frontend/audio/audio_cache.cc b/speechx/speechx/frontend/audio/audio_cache.cc
old mode 100644
new mode 100755
index b7a15acd7..c6a91f4b3
--- a/speechx/speechx/frontend/audio/audio_cache.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@@ -13,13 +13,14 @@
 // limitations under the License.
 
 #include "frontend/audio/audio_cache.h"
+
 #include "kaldi/base/timer.h"
 
 namespace ppspeech {
 
 using kaldi::BaseFloat;
-using kaldi::VectorBase;
 using kaldi::Vector;
+using kaldi::VectorBase;
 
 AudioCache::AudioCache(int buffer_size, bool to_float32)
     : finished_(false),
@@ -37,6 +38,7 @@ BaseFloat AudioCache::Convert2PCM32(BaseFloat val) {
 }
 
 void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
+    kaldi::Timer timer;
     std::unique_lock<std::mutex> lock(mutex_);
     while (size_ + waves.Dim() > ring_buffer_.size()) {
         ready_feed_condition_.wait(lock);
@@ -47,11 +49,13 @@ void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
         if (to_float32_) ring_buffer_[buffer_idx] = Convert2PCM32(waves(idx));
     }
     size_ += waves.Dim();
+    VLOG(1) << "AudioCache::Accept cost: " << timer.Elapsed() << " sec. "
+            << waves.Dim() << " samples.";
 }
 
 bool AudioCache::Read(Vector<BaseFloat>* waves) {
-    size_t chunk_size = waves->Dim();
     kaldi::Timer timer;
+    size_t chunk_size = waves->Dim();
     std::unique_lock<std::mutex> lock(mutex_);
     while (chunk_size > size_) {
         // when audio is empty and no more data feed
@@ -83,7 +87,13 @@ bool AudioCache::Read(Vector<BaseFloat>* waves) {
     }
     size_ -= chunk_size;
     offset_ = (offset_ + chunk_size) % ring_buffer_.size();
+
+    nsamples_ += chunk_size;
+    VLOG(3) << "nsamples read: " << nsamples_;
+
     ready_feed_condition_.notify_one();
+    VLOG(1) << "AudioCache::Read cost: " << timer.Elapsed() << " sec. "
+            << chunk_size << " samples.";
     return true;
 }
 
diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h
old mode 100644
new mode 100755
index fc07d4bab..4708a6e0f
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@@ -41,10 +41,11 @@ class AudioCache : public FrontendInterface {
 
     virtual bool IsFinished() const { return finished_; }
 
-    virtual void Reset() {
+    void Reset() override {
         offset_ = 0;
         size_ = 0;
         finished_ = false;
+        nsamples_ = 0;
     }
 
  private:
@@ -61,6 +62,7 @@ class AudioCache : public FrontendInterface {
     kaldi::int32 timeout_;  // millisecond
     bool to_float32_;       // int16 -> float32. used in linear_spectrogram
 
+    int32 nsamples_;  // number of samples read.
 
     DISALLOW_COPY_AND_ASSIGN(AudioCache);
 };
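AudioCache is a bounded ring buffer guarded by a mutex and two condition variables: Accept() blocks while the buffer is full, and Read() waits (up to timeout_) until enough samples arrive. A hypothetical producer loop under those semantics (`source.NextBlock` is illustrative and not part of the codebase):

    ppspeech::AudioCache cache(1000 * kint16max, /*to_float32=*/false);
    kaldi::Vector<kaldi::BaseFloat> block(512);
    while (source.NextBlock(&block)) {  // hypothetical audio source
        cache.Accept(block);            // may wait for the reader to drain space
    }
    cache.SetFinished();                // lets a pending Read() return what is left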
diff --git a/speechx/speechx/frontend/audio/cmvn.cc b/speechx/speechx/frontend/audio/cmvn.cc
old mode 100644
new mode 100755
index 1ea83aba5..a4d861d2d
--- a/speechx/speechx/frontend/audio/cmvn.cc
+++ b/speechx/speechx/frontend/audio/cmvn.cc
@@ -14,22 +14,25 @@
 
 #include "frontend/audio/cmvn.h"
+
 #include "kaldi/feat/cmvn.h"
 #include "kaldi/util/kaldi-io.h"
 
 namespace ppspeech {
 
-using kaldi::Vector;
-using kaldi::VectorBase;
 using kaldi::BaseFloat;
-using std::vector;
 using kaldi::SubVector;
+using kaldi::Vector;
+using kaldi::VectorBase;
 using std::unique_ptr;
+using std::vector;
 
 CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
     : var_norm_(true) {
+    CHECK_NE(cmvn_file, "");
     base_extractor_ = std::move(base_extractor);
+
     bool binary;
     kaldi::Input ki(cmvn_file, &binary);
     stats_.Read(ki.Stream(), binary);
@@ -47,19 +50,22 @@ bool CMVN::Read(kaldi::Vector<BaseFloat>* feats) {
     if (base_extractor_->Read(feats) == false || feats->Dim() == 0) {
         return false;
     }
+
+    // apply cmvn
+    kaldi::Timer timer;
     Compute(feats);
+    VLOG(1) << "CMVN::Read cost: " << timer.Elapsed() << " sec.";
     return true;
 }
 
 // feats contain num_frames feature.
 void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
     KALDI_ASSERT(feats != NULL);
-    int32 dim = stats_.NumCols() - 1;
+
     if (stats_.NumRows() > 2 || stats_.NumRows() < 1 ||
-        feats->Dim() % dim != 0) {
-        KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << 'x'
-                  << stats_.NumCols() << ", feats " << feats->Dim() << 'x';
+        feats->Dim() % dim_ != 0) {
+        KALDI_ERR << "Dim mismatch: cmvn " << stats_.NumRows() << ','
+                  << stats_.NumCols() - 1 << ", feats " << feats->Dim() << 'x';
     }
     if (stats_.NumRows() == 1 && var_norm_) {
         KALDI_ERR
@@ -67,7 +73,7 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
             << "are supplied.";
     }
 
-    double count = stats_(0, dim);
+    double count = stats_(0, dim_);
     // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when
     // computing an offset and representing it as stats_, we use a count of one.
     if (count < 1.0)
@@ -77,14 +83,14 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
 
     if (!var_norm_) {
         Vector<BaseFloat> offset(feats->Dim());
-        SubVector<double> mean_stats(stats_.RowData(0), dim);
+        SubVector<double> mean_stats(stats_.RowData(0), dim_);
         Vector<BaseFloat> mean_stats_apply(feats->Dim());
-        // fill the datat of mean_stats in mean_stats_appy whose dim is equal
-        // with the dim of feature.
-        // the dim of feats = dim * num_frames;
-        for (int32 idx = 0; idx < feats->Dim() / dim; ++idx) {
-            SubVector<BaseFloat> stats_tmp(mean_stats_apply.Data() + dim * idx,
-                                           dim);
+        // fill the data of mean_stats into mean_stats_apply, whose dim_
+        // equals the feature dim_.
+        // the dim_ of feats = dim_ * num_frames;
+        for (int32 idx = 0; idx < feats->Dim() / dim_; ++idx) {
+            SubVector<BaseFloat> stats_tmp(mean_stats_apply.Data() + dim_ * idx,
+                                           dim_);
             stats_tmp.CopyFromVec(mean_stats);
         }
         offset.AddVec(-1.0 / count, mean_stats_apply);
@@ -94,7 +100,7 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
     // norm(0, d) = mean offset;
     // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d).
     kaldi::Matrix<BaseFloat> norm(2, feats->Dim());
-    for (int32 d = 0; d < dim; d++) {
+    for (int32 d = 0; d < dim_; d++) {
         double mean, offset, scale;
         mean = stats_(0, d) / count;
         double var = (stats_(1, d) / count) - mean * mean, floor = 1.0e-20;
@@ -111,7 +117,7 @@ void CMVN::Compute(VectorBase<BaseFloat>* feats) const {
         for (int32 d_skip = d; d_skip < feats->Dim();) {
             norm(0, d_skip) = offset;
             norm(1, d_skip) = scale;
-            d_skip = d_skip + dim;
+            d_skip = d_skip + dim_;
         }
     }
     // Apply the normalization.
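For reference, the arithmetic Compute() applies per dimension d when variance normalization is on (this restates the code above, not a new algorithm; `count` is the number of frames accumulated in the stats):

    mean   = stats_(0, d) / count
    var    = stats_(1, d) / count - mean * mean   // floored at 1.0e-20
    scale  = 1 / sqrt(var)
    offset = -mean * scale
    x'(d)  = x(d) * scale + offset                // i.e. (x(d) - mean) / stddev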
diff --git a/speechx/speechx/frontend/audio/cmvn.h b/speechx/speechx/frontend/audio/cmvn.h
old mode 100644
new mode 100755
diff --git a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
old mode 100644
new mode 100755
index 0def14660..713c9ef1e
--- a/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
+++ b/speechx/speechx/frontend/audio/cmvn_json2kaldi_main.cc
@@ -30,8 +30,11 @@ DEFINE_bool(binary, true, "write cmvn in binary (true) or text(false)");
 using namespace boost::json;  // from <boost/json.hpp>
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
 
     LOG(INFO) << "cmvn josn path: " << FLAGS_json_file;
 
@@ -44,13 +47,13 @@ int main(int argc, char* argv[]) {
 
     for (auto obj : value.as_object()) {
         if (obj.key() == "mean_stat") {
-            LOG(INFO) << "mean_stat:" << obj.value();
+            VLOG(2) << "mean_stat:" << obj.value();
         }
         if (obj.key() == "var_stat") {
-            LOG(INFO) << "var_stat: " << obj.value();
+            VLOG(2) << "var_stat: " << obj.value();
         }
         if (obj.key() == "frame_num") {
-            LOG(INFO) << "frame_num: " << obj.value();
+            VLOG(2) << "frame_num: " << obj.value();
         }
     }
 
@@ -76,7 +79,7 @@ int main(int argc, char* argv[]) {
         cmvn_stats(1, idx) = var_stat_vec[idx];
     }
     cmvn_stats(0, mean_size) = frame_num;
-    LOG(INFO) << cmvn_stats;
+    VLOG(2) << cmvn_stats;
     kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, FLAGS_binary);
     LOG(INFO) << "cmvn stats have write into: " << FLAGS_cmvn_write_path;
 
diff --git a/speechx/speechx/frontend/audio/compute_fbank_main.cc b/speechx/speechx/frontend/audio/compute_fbank_main.cc
old mode 100644
new mode 100755
index f7a42315f..e2b54a8a6
--- a/speechx/speechx/frontend/audio/compute_fbank_main.cc
+++ b/speechx/speechx/frontend/audio/compute_fbank_main.cc
@@ -16,29 +16,36 @@
 
 #include "base/flags.h"
 #include "base/log.h"
-#include "kaldi/feat/wave-reader.h"
-#include "kaldi/util/kaldi-io.h"
-#include "kaldi/util/table-types.h"
-
 #include "frontend/audio/audio_cache.h"
 #include "frontend/audio/data_cache.h"
 #include "frontend/audio/fbank.h"
 #include "frontend/audio/feature_cache.h"
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/normalizer.h"
+#include "kaldi/feat/wave-reader.h"
+#include "kaldi/util/kaldi-io.h"
+#include "kaldi/util/table-types.h"
 
 DEFINE_string(wav_rspecifier, "", "test wav scp path");
 DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
 DEFINE_string(cmvn_file, "", "read cmvn");
 DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size");
 DEFINE_int32(num_bins, 161, "fbank num bins");
+DEFINE_int32(sample_rate, 16000, "sample rate: 16k, 8k.");
 
 int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
     gflags::ParseCommandLineFlags(&argc, &argv, false);
     google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
+
+    CHECK_GT(FLAGS_wav_rspecifier.size(), 0);
+    CHECK_GT(FLAGS_feature_wspecifier.size(), 0);
 
     kaldi::SequentialTableReader<kaldi::WaveHolder> wav_reader(
         FLAGS_wav_rspecifier);
+    kaldi::SequentialTableReader<kaldi::WaveInfoHolder> wav_info_reader(
+        FLAGS_wav_rspecifier);
     kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
 
     int32 num_done = 0, num_err = 0;
@@ -54,6 +61,10 @@ int main(int argc, char* argv[]) {
     opt.frame_opts.frame_shift_ms = 10;
     opt.mel_opts.num_bins = FLAGS_num_bins;
     opt.frame_opts.dither = 0.0;
+    LOG(INFO) << "frame_length_ms: " << opt.frame_opts.frame_length_ms;
+    LOG(INFO) << "frame_shift_ms: " << opt.frame_opts.frame_shift_ms;
+    LOG(INFO) << "num_bins: " << opt.mel_opts.num_bins;
+    LOG(INFO) << "dither: " << opt.frame_opts.dither;
 
     std::unique_ptr<ppspeech::FrontendInterface> fbank(
         new ppspeech::Fbank(opt, std::move(data_source)));
 
@@ -61,53 +72,76 @@ int main(int argc, char* argv[]) {
     std::unique_ptr<ppspeech::FrontendInterface> cmvn(
         new ppspeech::CMVN(FLAGS_cmvn_file, std::move(fbank)));
 
-    ppspeech::FeatureCacheOptions feat_cache_opts;
     // the feature cache output feature chunk by chunk.
+    ppspeech::FeatureCacheOptions feat_cache_opts;
     ppspeech::FeatureCache feature_cache(feat_cache_opts, std::move(cmvn));
     LOG(INFO) << "fbank: " << true;
     LOG(INFO) << "feat dim: " << feature_cache.Dim();
 
-    int sample_rate = 16000;
+
     float streaming_chunk = FLAGS_streaming_chunk;
-    int chunk_sample_size = streaming_chunk * sample_rate;
-    LOG(INFO) << "sr: " << sample_rate;
-    LOG(INFO) << "chunk size (s): " << streaming_chunk;
+    int chunk_sample_size = streaming_chunk * FLAGS_sample_rate;
+    LOG(INFO) << "sr: " << FLAGS_sample_rate;
+    LOG(INFO) << "chunk size (sec): " << streaming_chunk;
     LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
 
-    for (; !wav_reader.Done(); wav_reader.Next()) {
-        std::string utt = wav_reader.Key();
+    for (; !wav_reader.Done() && !wav_info_reader.Done();
+         wav_reader.Next(), wav_info_reader.Next()) {
+        const std::string& utt = wav_reader.Key();
         const kaldi::WaveData& wave_data = wav_reader.Value();
-        LOG(INFO) << "process utt: " << utt;
+
+        const std::string& utt2 = wav_info_reader.Key();
+        const kaldi::WaveInfo& wave_info = wav_info_reader.Value();
+
+        CHECK(utt == utt2)
+            << "wav reader and wav info reader using diff rspecifier!!!";
+        LOG(INFO) << "utt: " << utt;
+        LOG(INFO) << "samples: " << wave_info.SampleCount();
+        LOG(INFO) << "dur: " << wave_info.Duration() << " sec";
+        CHECK(wave_info.SampFreq() == FLAGS_sample_rate)
+            << "need " << FLAGS_sample_rate << " get " << wave_info.SampFreq();
 
+        // load first channel wav
         int32 this_channel = 0;
         kaldi::SubVector<BaseFloat> waveform(wave_data.Data(),
                                              this_channel);
-        int tot_samples = waveform.Dim();
-        LOG(INFO) << "wav len (sample): " << tot_samples;
 
+        // compute feat chunk by chunk
+        int tot_samples = waveform.Dim();
         int sample_offset = 0;
         std::vector<kaldi::Vector<BaseFloat>> feats;
         int feature_rows = 0;
         while (sample_offset < tot_samples) {
+            // cur chunk size
             int cur_chunk_size =
                 std::min(chunk_sample_size, tot_samples - sample_offset);
 
+            // get chunk wav
             kaldi::Vector<BaseFloat> wav_chunk(cur_chunk_size);
             for (int i = 0; i < cur_chunk_size; ++i) {
                 wav_chunk(i) = waveform(sample_offset + i);
             }
-            kaldi::Vector<BaseFloat> features;
 
+            // compute feat
             feature_cache.Accept(wav_chunk);
+
+            // send finish signal
             if (cur_chunk_size < chunk_sample_size) {
                 feature_cache.SetFinished();
             }
+
+            // read feat
+            kaldi::Vector<BaseFloat> features;
             bool flag = true;
             do {
                 flag = feature_cache.Read(&features);
-                feats.push_back(features);
-                feature_rows += features.Dim() / feature_cache.Dim();
+                if (flag && features.Dim() != 0) {
+                    feats.push_back(features);
+                    feature_rows += features.Dim() / feature_cache.Dim();
+                }
             } while (flag == true && features.Dim() != 0);
+
+            // forward offset
             sample_offset += cur_chunk_size;
         }
 
@@ -125,14 +159,20 @@ int main(int argc, char* argv[]) {
             ++cur_idx;
         }
     }
+    LOG(INFO) << "feat shape: " << features.NumRows() << " , "
+              << features.NumCols();
     feat_writer.Write(utt, features);
+
+    // reset frontend pipeline state
     feature_cache.Reset();
 
     if (num_done % 50 == 0 && num_done != 0)
-        KALDI_VLOG(2) << "Processed " << num_done << " utterances";
+        VLOG(2) << "Processed " << num_done << " utterances";
" << num_done << " utterances"; + num_done++; } - KALDI_LOG << "Done " << num_done << " utterances, " << num_err + + LOG(INFO) << "Done " << num_done << " utterances, " << num_err << " with errors."; return (num_done != 0 ? 0 : 1); } diff --git a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc old mode 100644 new mode 100755 index 162c3529d..42693c0c6 --- a/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc +++ b/speechx/speechx/frontend/audio/compute_linear_spectrogram_main.cc @@ -14,16 +14,15 @@ #include "base/flags.h" #include "base/log.h" -#include "kaldi/feat/wave-reader.h" -#include "kaldi/util/kaldi-io.h" -#include "kaldi/util/table-types.h" - #include "frontend/audio/audio_cache.h" #include "frontend/audio/data_cache.h" #include "frontend/audio/feature_cache.h" #include "frontend/audio/frontend_itf.h" #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" DEFINE_string(wav_rspecifier, "", "test wav scp path"); DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); @@ -31,8 +30,11 @@ DEFINE_string(cmvn_file, "./cmvn.ark", "read cmvn"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialTableReader wav_reader( FLAGS_wav_rspecifier); diff --git a/speechx/speechx/frontend/audio/data_cache.h b/speechx/speechx/frontend/audio/data_cache.h old mode 100644 new mode 100755 index 64e9db860..5fe5e4fe0 --- a/speechx/speechx/frontend/audio/data_cache.h +++ b/speechx/speechx/frontend/audio/data_cache.h @@ -27,14 +27,14 @@ namespace ppspeech { // pre-recorded audio/feature class DataCache : public FrontendInterface { public: - explicit DataCache() { finished_ = false; } + DataCache() : finished_{false}, dim_{0} {} // accept waves/feats - virtual void Accept(const kaldi::VectorBase& inputs) { + void Accept(const kaldi::VectorBase& inputs) override { data_ = inputs; } - virtual bool Read(kaldi::Vector* feats) { + bool Read(kaldi::Vector* feats) override { if (data_.Dim() == 0) { return false; } @@ -43,11 +43,14 @@ class DataCache : public FrontendInterface { return true; } - virtual void SetFinished() { finished_ = true; } - virtual bool IsFinished() const { return finished_; } - virtual size_t Dim() const { return dim_; } + void SetFinished() override { finished_ = true; } + bool IsFinished() const override { return finished_; } + size_t Dim() const override { return dim_; } void SetDim(int32 dim) { dim_ = dim; } - virtual void Reset() { finished_ = true; } + void Reset() override { + finished_ = true; + dim_ = 0; + } private: kaldi::Vector data_; @@ -56,4 +59,4 @@ class DataCache : public FrontendInterface { DISALLOW_COPY_AND_ASSIGN(DataCache); }; -} \ No newline at end of file +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc old mode 100644 new mode 100755 index 931e932d6..ad79fcc3a --- a/speechx/speechx/frontend/audio/db_norm.cc +++ b/speechx/speechx/frontend/audio/db_norm.cc @@ -14,17 +14,18 @@ #include "frontend/audio/db_norm.h" + #include "kaldi/feat/cmvn.h" #include 
"kaldi/util/kaldi-io.h" namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; DecibelNormalizer::DecibelNormalizer( const DecibelNormalizerOptions& opts, diff --git a/speechx/speechx/frontend/audio/db_norm.h b/speechx/speechx/frontend/audio/db_norm.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/audio/fbank.cc b/speechx/speechx/frontend/audio/fbank.cc old mode 100644 new mode 100755 index 059abbbd1..deabe8764 --- a/speechx/speechx/frontend/audio/fbank.cc +++ b/speechx/speechx/frontend/audio/fbank.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "frontend/audio/fbank.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; FbankComputer::FbankComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/fbank.h b/speechx/speechx/frontend/audio/fbank.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/audio/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc old mode 100644 new mode 100755 index 509a98c3b..5110d7046 --- a/speechx/speechx/frontend/audio/feature_cache.cc +++ b/speechx/speechx/frontend/audio/feature_cache.cc @@ -16,12 +16,12 @@ namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; FeatureCache::FeatureCache(FeatureCacheOptions opts, unique_ptr base_extractor) { @@ -34,6 +34,7 @@ FeatureCache::FeatureCache(FeatureCacheOptions opts, void FeatureCache::Accept(const kaldi::VectorBase& inputs) { // read inputs base_extractor_->Accept(inputs); + // feed current data bool result = false; do { @@ -62,6 +63,7 @@ bool FeatureCache::Read(kaldi::Vector* feats) { feats->CopyFromVec(cache_.front()); cache_.pop(); ready_feed_condition_.notify_one(); + VLOG(1) << "FeatureCache::Read cost: " << timer.Elapsed() << " sec."; return true; } @@ -72,7 +74,12 @@ bool FeatureCache::Compute() { bool result = base_extractor_->Read(&feature); if (result == false || feature.Dim() == 0) return false; + kaldi::Timer timer; + int32 num_chunk = feature.Dim() / dim_; + nframe_ += num_chunk; + VLOG(3) << "nframe computed: " << nframe_; + for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) { int32 start = chunk_idx * dim_; Vector feature_chunk(dim_); @@ -89,7 +96,10 @@ bool FeatureCache::Compute() { cache_.push(feature_chunk); ready_read_condition_.notify_one(); } - return result; + + VLOG(1) << "FeatureCache::Compute cost: " << timer.Elapsed() << " sec. 
" + << num_chunk << " feats."; + return true; } } // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/frontend/audio/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h old mode 100644 new mode 100755 index b922de12c..a4ebd6047 --- a/speechx/speechx/frontend/audio/feature_cache.h +++ b/speechx/speechx/frontend/audio/feature_cache.h @@ -41,21 +41,24 @@ class FeatureCache : public FrontendInterface { virtual size_t Dim() const { return dim_; } virtual void SetFinished() { + LOG(INFO) << "set finished"; // std::unique_lock lock(mutex_); base_extractor_->SetFinished(); - LOG(INFO) << "set finished"; + // read the last chunk data Compute(); // ready_feed_condition_.notify_one(); + LOG(INFO) << "compute last feats done."; } virtual bool IsFinished() const { return base_extractor_->IsFinished(); } - virtual void Reset() { + void Reset() override { + std::queue> empty; + std::swap(cache_, empty); + nframe_ = 0; base_extractor_->Reset(); - while (!cache_.empty()) { - cache_.pop(); - } + VLOG(3) << "feature cache reset: cache size: " << cache_.size(); } private: @@ -74,6 +77,7 @@ class FeatureCache : public FrontendInterface { std::condition_variable ready_feed_condition_; std::condition_variable ready_read_condition_; + int32 nframe_; // num of feature computed DISALLOW_COPY_AND_ASSIGN(FeatureCache); }; diff --git a/speechx/speechx/frontend/audio/feature_common.h b/speechx/speechx/frontend/audio/feature_common.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/audio/feature_common_inl.h b/speechx/speechx/frontend/audio/feature_common_inl.h old mode 100644 new mode 100755 index b86f79918..dcf44ef61 --- a/speechx/speechx/frontend/audio/feature_common_inl.h +++ b/speechx/speechx/frontend/audio/feature_common_inl.h @@ -34,6 +34,7 @@ bool StreamingFeatureTpl::Read(kaldi::Vector* feats) { bool flag = base_extractor_->Read(&wav); if (flag == false || wav.Dim() == 0) return false; + kaldi::Timer timer; // append remaned waves int32 wav_len = wav.Dim(); int32 left_len = remained_wav_.Dim(); @@ -52,6 +53,8 @@ bool StreamingFeatureTpl::Read(kaldi::Vector* feats) { remained_wav_.Resize(left_samples); remained_wav_.CopyFromVec( waves.Range(frame_shift * num_frames, left_samples)); + VLOG(1) << "StreamingFeatureTpl::Read cost: " << timer.Elapsed() + << " sec."; return true; } diff --git a/speechx/speechx/frontend/audio/feature_pipeline.cc b/speechx/speechx/frontend/audio/feature_pipeline.cc old mode 100644 new mode 100755 index 9cacff9f7..2931b96b9 --- a/speechx/speechx/frontend/audio/feature_pipeline.cc +++ b/speechx/speechx/frontend/audio/feature_pipeline.cc @@ -18,7 +18,8 @@ namespace ppspeech { using std::unique_ptr; -FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { +FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) + : opts_(opts) { unique_ptr data_source( new ppspeech::AudioCache(1000 * kint16max, opts.to_float32)); @@ -32,6 +33,7 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { opts.linear_spectrogram_opts, std::move(data_source))); } + CHECK_NE(opts.cmvn_file, ""); unique_ptr cmvn( new ppspeech::CMVN(opts.cmvn_file, std::move(base_feature))); @@ -42,4 +44,4 @@ FeaturePipeline::FeaturePipeline(const FeaturePipelineOptions& opts) { new ppspeech::Assembler(opts.assembler_opts, std::move(cache))); } -} // ppspeech +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/feature_pipeline.h b/speechx/speechx/frontend/audio/feature_pipeline.h old mode 100644 new mode 100755 
index 48f95e3f3..e83a3f316 --- a/speechx/speechx/frontend/audio/feature_pipeline.h +++ b/speechx/speechx/frontend/audio/feature_pipeline.h @@ -25,27 +25,78 @@ #include "frontend/audio/linear_spectrogram.h" #include "frontend/audio/normalizer.h" +// feature +DECLARE_bool(use_fbank); +DECLARE_bool(fill_zero); +DECLARE_int32(num_bins); +DECLARE_string(cmvn_file); + +// feature sliding window +DECLARE_int32(receptive_field_length); +DECLARE_int32(subsampling_rate); +DECLARE_int32(nnet_decoder_chunk); + namespace ppspeech { struct FeaturePipelineOptions { - std::string cmvn_file; - bool to_float32; // true, only for linear feature - bool use_fbank; - LinearSpectrogramOptions linear_spectrogram_opts; - kaldi::FbankOptions fbank_opts; - FeatureCacheOptions feature_cache_opts; - AssemblerOptions assembler_opts; - - FeaturePipelineOptions() - : cmvn_file(""), - to_float32(false), // true, only for linear feature - use_fbank(true), - linear_spectrogram_opts(), - fbank_opts(), - feature_cache_opts(), - assembler_opts() {} + std::string cmvn_file{}; + bool to_float32{false}; // true, only for linear feature + bool use_fbank{true}; + LinearSpectrogramOptions linear_spectrogram_opts{}; + kaldi::FbankOptions fbank_opts{}; + FeatureCacheOptions feature_cache_opts{}; + AssemblerOptions assembler_opts{}; + + static FeaturePipelineOptions InitFromFlags() { + FeaturePipelineOptions opts; + opts.cmvn_file = FLAGS_cmvn_file; + LOG(INFO) << "cmvn file: " << opts.cmvn_file; + + // frame options + kaldi::FrameExtractionOptions frame_opts; + frame_opts.dither = 0.0; + LOG(INFO) << "dither: " << frame_opts.dither; + frame_opts.frame_shift_ms = 10; + LOG(INFO) << "frame shift ms: " << frame_opts.frame_shift_ms; + opts.use_fbank = FLAGS_use_fbank; + LOG(INFO) << "feature type: " << (opts.use_fbank ? 
"fbank" : "linear"); + if (opts.use_fbank) { + opts.to_float32 = false; + frame_opts.window_type = "povey"; + frame_opts.frame_length_ms = 25; + opts.fbank_opts.mel_opts.num_bins = FLAGS_num_bins; + LOG(INFO) << "num bins: " << opts.fbank_opts.mel_opts.num_bins; + + opts.fbank_opts.frame_opts = frame_opts; + } else { + opts.to_float32 = true; + frame_opts.remove_dc_offset = false; + frame_opts.frame_length_ms = 20; + frame_opts.window_type = "hanning"; + frame_opts.preemph_coeff = 0.0; + + opts.linear_spectrogram_opts.frame_opts = frame_opts; + } + LOG(INFO) << "frame length ms: " << frame_opts.frame_length_ms; + + // assembler opts + opts.assembler_opts.subsampling_rate = FLAGS_subsampling_rate; + opts.assembler_opts.receptive_filed_length = + FLAGS_receptive_field_length; + opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk; + opts.assembler_opts.fill_zero = FLAGS_fill_zero; + LOG(INFO) << "subsampling rate: " + << opts.assembler_opts.subsampling_rate; + LOG(INFO) << "nnet receptive filed length: " + << opts.assembler_opts.receptive_filed_length; + LOG(INFO) << "nnet chunk size: " + << opts.assembler_opts.nnet_decoder_chunk; + LOG(INFO) << "frontend fill zeros: " << opts.assembler_opts.fill_zero; + return opts; + } }; + class FeaturePipeline : public FrontendInterface { public: explicit FeaturePipeline(const FeaturePipelineOptions& opts); @@ -60,7 +111,21 @@ class FeaturePipeline : public FrontendInterface { virtual bool IsFinished() const { return base_extractor_->IsFinished(); } virtual void Reset() { base_extractor_->Reset(); } + const FeaturePipelineOptions& Config() { return opts_; } + + const BaseFloat FrameShift() const { + return opts_.fbank_opts.frame_opts.frame_shift_ms; + } + const BaseFloat FrameLength() const { + return opts_.fbank_opts.frame_opts.frame_length_ms; + } + const BaseFloat SampleRate() const { + return opts_.fbank_opts.frame_opts.samp_freq; + } + private: + FeaturePipelineOptions opts_; std::unique_ptr base_extractor_; }; -} + +} // namespace ppspeech diff --git a/speechx/speechx/frontend/audio/frontend_itf.h b/speechx/speechx/frontend/audio/frontend_itf.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc old mode 100644 new mode 100755 index 55c039787..d4a2fcc6c --- a/speechx/speechx/frontend/audio/linear_spectrogram.cc +++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "frontend/audio/linear_spectrogram.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -20,12 +21,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts) diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/audio/mfcc.cc b/speechx/speechx/frontend/audio/mfcc.cc old mode 100644 new mode 100755 index bda1f96d7..15f8cb0fa --- a/speechx/speechx/frontend/audio/mfcc.cc +++ b/speechx/speechx/frontend/audio/mfcc.cc @@ -14,6 +14,7 @@ #include "frontend/audio/mfcc.h" + #include "kaldi/base/kaldi-math.h" #include "kaldi/feat/feature-common.h" #include "kaldi/feat/feature-functions.h" @@ -21,12 +22,12 @@ namespace ppspeech { -using kaldi::int32; using kaldi::BaseFloat; -using kaldi::Vector; +using kaldi::int32; +using kaldi::Matrix; using kaldi::SubVector; +using kaldi::Vector; using kaldi::VectorBase; -using kaldi::Matrix; using std::vector; Mfcc::Mfcc(const MfccOptions& opts, diff --git a/speechx/speechx/frontend/audio/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h old mode 100644 new mode 100755 index 62b0078c7..6c1c2f7df --- a/speechx/speechx/frontend/audio/mfcc.h +++ b/speechx/speechx/frontend/audio/mfcc.h @@ -14,7 +14,6 @@ #pragma once -#include "kaldi/feat/feature-mfcc.h" #include "kaldi/feat/feature-mfcc.h" #include "kaldi/matrix/kaldi-vector.h" diff --git a/speechx/speechx/frontend/audio/normalizer.h b/speechx/speechx/frontend/audio/normalizer.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/frontend/text/CMakeLists.txt b/speechx/speechx/frontend/text/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/.gitkeep b/speechx/speechx/kaldi/.gitkeep old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/CMakeLists.txt b/speechx/speechx/kaldi/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/CMakeLists.txt b/speechx/speechx/kaldi/base/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/io-funcs-inl.h b/speechx/speechx/kaldi/base/io-funcs-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/io-funcs.cc b/speechx/speechx/kaldi/base/io-funcs.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/io-funcs.h b/speechx/speechx/kaldi/base/io-funcs.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-common.h b/speechx/speechx/kaldi/base/kaldi-common.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-error.cc b/speechx/speechx/kaldi/base/kaldi-error.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-error.h b/speechx/speechx/kaldi/base/kaldi-error.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-math.cc b/speechx/speechx/kaldi/base/kaldi-math.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-math.h b/speechx/speechx/kaldi/base/kaldi-math.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h old mode 100644 new mode 100755 diff --git 
a/speechx/speechx/kaldi/base/kaldi-utils.cc b/speechx/speechx/kaldi/base/kaldi-utils.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/kaldi-utils.h b/speechx/speechx/kaldi/base/kaldi-utils.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/timer.cc b/speechx/speechx/kaldi/base/timer.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/timer.h b/speechx/speechx/kaldi/base/timer.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/base/version.h b/speechx/speechx/kaldi/base/version.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/decoder/CMakeLists.txt b/speechx/speechx/kaldi/decoder/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/decoder/decodable-itf.h b/speechx/speechx/kaldi/decoder/decodable-itf.h old mode 100644 new mode 100755 index b8ce9143e..a7c12588b --- a/speechx/speechx/kaldi/decoder/decodable-itf.h +++ b/speechx/speechx/kaldi/decoder/decodable-itf.h @@ -101,7 +101,9 @@ namespace kaldi { */ class DecodableInterface { public: - /// Returns the log likelihood, which will be negated in the decoder. + virtual ~DecodableInterface() {} + + /// Returns the log likelihood (logprob), which will be negated in the decoder. /// The "frame" starts from zero. You should verify that NumFramesReady() > /// frame /// before calling this. @@ -143,11 +145,12 @@ class DecodableInterface { /// this is for compatibility with OpenFst). virtual int32 NumIndices() const = 0; + /// Returns the likelihood (prob), which will be positive in the decoder. + /// The "frame" starts from zero. You should verify that NumFramesReady() > + /// frame + /// before calling this. virtual bool FrameLikelihood( int32 frame, std::vector<BaseFloat>* likelihood) = 0; - - - virtual ~DecodableInterface() {} }; /// @} } // namespace kaldi diff --git a/speechx/speechx/kaldi/decoder/lattice-faster-decoder.cc b/speechx/speechx/kaldi/decoder/lattice-faster-decoder.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/decoder/lattice-faster-decoder.h b/speechx/speechx/kaldi/decoder/lattice-faster-decoder.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/decoder/lattice-faster-online-decoder.cc b/speechx/speechx/kaldi/decoder/lattice-faster-online-decoder.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/decoder/lattice-faster-online-decoder.h b/speechx/speechx/kaldi/decoder/lattice-faster-online-decoder.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/CMakeLists.txt b/speechx/speechx/kaldi/feat/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/cmvn.cc b/speechx/speechx/kaldi/feat/cmvn.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/cmvn.h b/speechx/speechx/kaldi/feat/cmvn.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-common-inl.h b/speechx/speechx/kaldi/feat/feature-common-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-common.h b/speechx/speechx/kaldi/feat/feature-common.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-fbank.cc b/speechx/speechx/kaldi/feat/feature-fbank.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-fbank.h b/speechx/speechx/kaldi/feat/feature-fbank.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-functions.cc b/speechx/speechx/kaldi/feat/feature-functions.cc old mode 100644
new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-functions.h b/speechx/speechx/kaldi/feat/feature-functions.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-mfcc.cc b/speechx/speechx/kaldi/feat/feature-mfcc.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-mfcc.h b/speechx/speechx/kaldi/feat/feature-mfcc.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-plp.cc b/speechx/speechx/kaldi/feat/feature-plp.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-plp.h b/speechx/speechx/kaldi/feat/feature-plp.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-spectrogram.cc b/speechx/speechx/kaldi/feat/feature-spectrogram.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-spectrogram.h b/speechx/speechx/kaldi/feat/feature-spectrogram.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-window.cc b/speechx/speechx/kaldi/feat/feature-window.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/feature-window.h b/speechx/speechx/kaldi/feat/feature-window.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/mel-computations.cc b/speechx/speechx/kaldi/feat/mel-computations.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/mel-computations.h b/speechx/speechx/kaldi/feat/mel-computations.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/online-feature.cc b/speechx/speechx/kaldi/feat/online-feature.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/online-feature.h b/speechx/speechx/kaldi/feat/online-feature.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/pitch-functions.cc b/speechx/speechx/kaldi/feat/pitch-functions.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/pitch-functions.h b/speechx/speechx/kaldi/feat/pitch-functions.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/resample.cc b/speechx/speechx/kaldi/feat/resample.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/resample.h b/speechx/speechx/kaldi/feat/resample.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/signal.cc b/speechx/speechx/kaldi/feat/signal.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/signal.h b/speechx/speechx/kaldi/feat/signal.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/wave-reader.cc b/speechx/speechx/kaldi/feat/wave-reader.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/feat/wave-reader.h b/speechx/speechx/kaldi/feat/wave-reader.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstbin/CMakeLists.txt b/speechx/speechx/kaldi/fstbin/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstbin/fstaddselfloops.cc b/speechx/speechx/kaldi/fstbin/fstaddselfloops.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstbin/fstdeterminizestar.cc b/speechx/speechx/kaldi/fstbin/fstdeterminizestar.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstbin/fstisstochastic.cc b/speechx/speechx/kaldi/fstbin/fstisstochastic.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstbin/fstminimizeencoded.cc b/speechx/speechx/kaldi/fstbin/fstminimizeencoded.cc old mode 100644 new mode 100755 diff --git 
a/speechx/speechx/kaldi/fstbin/fsttablecompose.cc b/speechx/speechx/kaldi/fstbin/fsttablecompose.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/CMakeLists.txt b/speechx/speechx/kaldi/fstext/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/determinize-lattice-inl.h b/speechx/speechx/kaldi/fstext/determinize-lattice-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/determinize-lattice.h b/speechx/speechx/kaldi/fstext/determinize-lattice.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/determinize-star-inl.h b/speechx/speechx/kaldi/fstext/determinize-star-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/determinize-star.h b/speechx/speechx/kaldi/fstext/determinize-star.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/fstext-lib.h b/speechx/speechx/kaldi/fstext/fstext-lib.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/fstext-utils-inl.h b/speechx/speechx/kaldi/fstext/fstext-utils-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/fstext-utils.h b/speechx/speechx/kaldi/fstext/fstext-utils.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/kaldi-fst-io-inl.h b/speechx/speechx/kaldi/fstext/kaldi-fst-io-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/kaldi-fst-io.cc b/speechx/speechx/kaldi/fstext/kaldi-fst-io.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/kaldi-fst-io.h b/speechx/speechx/kaldi/fstext/kaldi-fst-io.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/lattice-utils-inl.h b/speechx/speechx/kaldi/fstext/lattice-utils-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/lattice-utils.h b/speechx/speechx/kaldi/fstext/lattice-utils.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/lattice-weight.h b/speechx/speechx/kaldi/fstext/lattice-weight.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/pre-determinize-inl.h b/speechx/speechx/kaldi/fstext/pre-determinize-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/pre-determinize.h b/speechx/speechx/kaldi/fstext/pre-determinize.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/remove-eps-local-inl.h b/speechx/speechx/kaldi/fstext/remove-eps-local-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/remove-eps-local.h b/speechx/speechx/kaldi/fstext/remove-eps-local.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/fstext/table-matcher.h b/speechx/speechx/kaldi/fstext/table-matcher.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lat/CMakeLists.txt b/speechx/speechx/kaldi/lat/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lat/determinize-lattice-pruned.cc b/speechx/speechx/kaldi/lat/determinize-lattice-pruned.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lat/determinize-lattice-pruned.h b/speechx/speechx/kaldi/lat/determinize-lattice-pruned.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lat/kaldi-lattice.cc b/speechx/speechx/kaldi/lat/kaldi-lattice.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lat/kaldi-lattice.h b/speechx/speechx/kaldi/lat/kaldi-lattice.h old mode 100644 new mode 100755 diff --git 
a/speechx/speechx/kaldi/lat/lattice-functions.cc b/speechx/speechx/kaldi/lat/lattice-functions.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lat/lattice-functions.h b/speechx/speechx/kaldi/lat/lattice-functions.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lm/CMakeLists.txt b/speechx/speechx/kaldi/lm/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lm/arpa-file-parser.cc b/speechx/speechx/kaldi/lm/arpa-file-parser.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lm/arpa-file-parser.h b/speechx/speechx/kaldi/lm/arpa-file-parser.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lm/arpa-lm-compiler.cc b/speechx/speechx/kaldi/lm/arpa-lm-compiler.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lm/arpa-lm-compiler.h b/speechx/speechx/kaldi/lm/arpa-lm-compiler.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lmbin/CMakeLists.txt b/speechx/speechx/kaldi/lmbin/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/lmbin/arpa2fst.cc b/speechx/speechx/kaldi/lmbin/arpa2fst.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/CMakeLists.txt b/speechx/speechx/kaldi/matrix/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/cblas-wrappers.h b/speechx/speechx/kaldi/matrix/cblas-wrappers.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/compressed-matrix.cc b/speechx/speechx/kaldi/matrix/compressed-matrix.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/compressed-matrix.h b/speechx/speechx/kaldi/matrix/compressed-matrix.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/jama-eig.h b/speechx/speechx/kaldi/matrix/jama-eig.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/jama-svd.h b/speechx/speechx/kaldi/matrix/jama-svd.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-blas.h b/speechx/speechx/kaldi/matrix/kaldi-blas.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix-inl.h b/speechx/speechx/kaldi/matrix/kaldi-matrix-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.h b/speechx/speechx/kaldi/matrix/kaldi-matrix.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-vector-inl.h b/speechx/speechx/kaldi/matrix/kaldi-vector-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-vector.cc b/speechx/speechx/kaldi/matrix/kaldi-vector.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/kaldi-vector.h b/speechx/speechx/kaldi/matrix/kaldi-vector.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/matrix-common.h b/speechx/speechx/kaldi/matrix/matrix-common.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/matrix-functions-inl.h b/speechx/speechx/kaldi/matrix/matrix-functions-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/matrix-functions.cc b/speechx/speechx/kaldi/matrix/matrix-functions.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/matrix-functions.h b/speechx/speechx/kaldi/matrix/matrix-functions.h old mode 100644 new mode 
100755 diff --git a/speechx/speechx/kaldi/matrix/matrix-lib.h b/speechx/speechx/kaldi/matrix/matrix-lib.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/optimization.cc b/speechx/speechx/kaldi/matrix/optimization.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/optimization.h b/speechx/speechx/kaldi/matrix/optimization.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/packed-matrix.cc b/speechx/speechx/kaldi/matrix/packed-matrix.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/packed-matrix.h b/speechx/speechx/kaldi/matrix/packed-matrix.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/qr.cc b/speechx/speechx/kaldi/matrix/qr.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/sp-matrix-inl.h b/speechx/speechx/kaldi/matrix/sp-matrix-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/sp-matrix.cc b/speechx/speechx/kaldi/matrix/sp-matrix.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/sp-matrix.h b/speechx/speechx/kaldi/matrix/sp-matrix.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.cc b/speechx/speechx/kaldi/matrix/sparse-matrix.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.h b/speechx/speechx/kaldi/matrix/sparse-matrix.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/srfft.cc b/speechx/speechx/kaldi/matrix/srfft.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/srfft.h b/speechx/speechx/kaldi/matrix/srfft.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/tp-matrix.cc b/speechx/speechx/kaldi/matrix/tp-matrix.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/matrix/tp-matrix.h b/speechx/speechx/kaldi/matrix/tp-matrix.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/CMakeLists.txt b/speechx/speechx/kaldi/util/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/basic-filebuf.h b/speechx/speechx/kaldi/util/basic-filebuf.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/common-utils.h b/speechx/speechx/kaldi/util/common-utils.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/const-integer-set-inl.h b/speechx/speechx/kaldi/util/const-integer-set-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/const-integer-set.h b/speechx/speechx/kaldi/util/const-integer-set.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/edit-distance-inl.h b/speechx/speechx/kaldi/util/edit-distance-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/edit-distance.h b/speechx/speechx/kaldi/util/edit-distance.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/hash-list-inl.h b/speechx/speechx/kaldi/util/hash-list-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/hash-list.h b/speechx/speechx/kaldi/util/hash-list.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-cygwin-io-inl.h b/speechx/speechx/kaldi/util/kaldi-cygwin-io-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-holder-inl.h b/speechx/speechx/kaldi/util/kaldi-holder-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-holder.cc 
b/speechx/speechx/kaldi/util/kaldi-holder.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-holder.h b/speechx/speechx/kaldi/util/kaldi-holder.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-io-inl.h b/speechx/speechx/kaldi/util/kaldi-io-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-io.cc b/speechx/speechx/kaldi/util/kaldi-io.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-io.h b/speechx/speechx/kaldi/util/kaldi-io.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-pipebuf.h b/speechx/speechx/kaldi/util/kaldi-pipebuf.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-semaphore.cc b/speechx/speechx/kaldi/util/kaldi-semaphore.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-semaphore.h b/speechx/speechx/kaldi/util/kaldi-semaphore.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-table-inl.h b/speechx/speechx/kaldi/util/kaldi-table-inl.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-table.cc b/speechx/speechx/kaldi/util/kaldi-table.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-table.h b/speechx/speechx/kaldi/util/kaldi-table.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-thread.cc b/speechx/speechx/kaldi/util/kaldi-thread.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/kaldi-thread.h b/speechx/speechx/kaldi/util/kaldi-thread.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/options-itf.h b/speechx/speechx/kaldi/util/options-itf.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/parse-options.cc b/speechx/speechx/kaldi/util/parse-options.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/parse-options.h b/speechx/speechx/kaldi/util/parse-options.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/simple-io-funcs.cc b/speechx/speechx/kaldi/util/simple-io-funcs.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/simple-io-funcs.h b/speechx/speechx/kaldi/util/simple-io-funcs.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/simple-options.cc b/speechx/speechx/kaldi/util/simple-options.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/simple-options.h b/speechx/speechx/kaldi/util/simple-options.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/stl-utils.h b/speechx/speechx/kaldi/util/stl-utils.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/table-types.h b/speechx/speechx/kaldi/util/table-types.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/text-utils.cc b/speechx/speechx/kaldi/util/text-utils.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/kaldi/util/text-utils.h b/speechx/speechx/kaldi/util/text-utils.h old mode 100644 new mode 100755 diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/speechx/speechx/nnet/CMakeLists.txt b/speechx/speechx/nnet/CMakeLists.txt old mode 100644 new mode 100755 index c325ce755..435666163 --- a/speechx/speechx/nnet/CMakeLists.txt +++ b/speechx/speechx/nnet/CMakeLists.txt @@ -1,14 +1,39 @@ -project(nnet) +set(srcs decodable.cc) 
-add_library(nnet STATIC - decodable.cc - paddle_nnet.cc -) +if(USING_DS2) + list(APPEND srcs ds2_nnet.cc) +endif() + +if(USING_U2) + list(APPEND srcs u2_nnet.cc) +endif() + +add_library(nnet STATIC ${srcs}) target_link_libraries(nnet absl::strings) -set(bin_name nnet_forward_main) -add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) -target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet ${DEPS}) +if(USING_U2) + target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) + target_include_directories(nnet PUBLIC ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) +endif() + + +if(USING_DS2) + set(bin_name ds2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + + target_link_libraries(${bin_name} ${DEPS}) +endif() +# test bin +if(USING_U2) + set(bin_name u2_nnet_main) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} utils kaldi-util kaldi-matrix gflags glog nnet) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) +endif() diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc old mode 100644 new mode 100755 index 465f64a94..5fe2b9842 --- a/speechx/speechx/nnet/decodable.cc +++ b/speechx/speechx/nnet/decodable.cc @@ -18,10 +18,10 @@ namespace ppspeech { using kaldi::BaseFloat; using kaldi::Matrix; -using std::vector; using kaldi::Vector; +using std::vector; -Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet, +Decodable::Decodable(const std::shared_ptr<NnetBase>& nnet, const std::shared_ptr<FrontendInterface>& frontend, kaldi::BaseFloat acoustic_scale) : frontend_(frontend), @@ -30,17 +30,17 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet, frames_ready_(0), acoustic_scale_(acoustic_scale) {} +// for debug void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) { - nnet_cache_ = likelihood; + nnet_out_cache_ = likelihood; frames_ready_ += likelihood.NumRows(); } -// Decodable::Init(DecodableConfig config) { -//} // return the number of frames already computed. int32 Decodable::NumFramesReady() const { return frames_ready_; } + // frame idx is from 0 to frames_ready_ - 1; bool Decodable::IsLastFrame(int32 frame) { bool flag = EnsureFrameHaveComputed(frame); @@ -53,18 +53,9 @@ int32 Decodable::NumIndices() const { return 0; } // id.
int32 Decodable::TokenId2NnetId(int32 token_id) { return token_id - 1; } -BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { - CHECK_LE(index, nnet_cache_.NumCols()); - CHECK_LE(frame, frames_ready_); - int32 frame_idx = frame - frame_offset_; - // the nnet output is prob ranther than log prob - // the index - 1, because the ilabel - return acoustic_scale_ * - std::log(nnet_cache_(frame_idx, TokenId2NnetId(index)) + - std::numeric_limits<BaseFloat>::min()); -} bool Decodable::EnsureFrameHaveComputed(int32 frame) { + // decoding frame if (frame >= frames_ready_) { return AdvanceChunk(); } @@ -72,38 +63,117 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) { } bool Decodable::AdvanceChunk() { + kaldi::Timer timer; + // read feats Vector<BaseFloat> features; if (frontend_ == NULL || frontend_->Read(&features) == false) { + // no feats, or frontend_ is not initialized. + VLOG(3) << "decodable exit;"; return false; } - int32 nnet_dim = 0; - Vector<BaseFloat> inferences; - nnet_->FeedForward(features, frontend_->Dim(), &inferences, &nnet_dim); - nnet_cache_.Resize(inferences.Dim() / nnet_dim, nnet_dim); - nnet_cache_.CopyRowsFromVec(inferences); - + CHECK_GE(frontend_->Dim(), 0); + VLOG(1) << "AdvanceChunk feat cost: " << timer.Elapsed() << " sec."; + VLOG(2) << "Forward in " << features.Dim() / frontend_->Dim() << " feats."; + + // forward feats + NnetOut out; + nnet_->FeedForward(features, frontend_->Dim(), &out); + int32& vocab_dim = out.vocab_dim; + Vector<BaseFloat>& logprobs = out.logprobs; + + VLOG(2) << "Forward out " << logprobs.Dim() / vocab_dim + << " decoder frames."; + // cache nnet outputs + nnet_out_cache_.Resize(logprobs.Dim() / vocab_dim, vocab_dim); + nnet_out_cache_.CopyRowsFromVec(logprobs); + + // update state, decoding frame. frame_offset_ = frames_ready_; frames_ready_ += nnet_out_cache_.NumRows(); + VLOG(1) << "AdvanceChunk feat + forward cost: " << timer.Elapsed() + << " sec."; return true; } + +bool Decodable::AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs, + int* vocab_dim) { + if (AdvanceChunk() == false) { + return false; + } + + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); + if (nrows <= 0) { + LOG(WARNING) << "No new nnet out in cache."; + return false; + } + + logprobs->Resize(nnet_out_cache_.NumRows() * nnet_out_cache_.NumCols()); + logprobs->CopyRowsFromMat(nnet_out_cache_); + + *vocab_dim = nnet_out_cache_.NumCols(); return true; } +// read one frame likelihood bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) { - std::vector<BaseFloat> result; - if (EnsureFrameHaveComputed(frame) == false) return false; - likelihood->resize(nnet_cache_.NumCols()); - for (int32 idx = 0; idx < nnet_cache_.NumCols(); ++idx) { + if (EnsureFrameHaveComputed(frame) == false) { + VLOG(3) << "frame likelihood exit."; + return false; + } + + int nrows = nnet_out_cache_.NumRows(); + CHECK(nrows == (frames_ready_ - frame_offset_)); + int vocab_size = nnet_out_cache_.NumCols(); + likelihood->resize(vocab_size); + + for (int32 idx = 0; idx < vocab_size; ++idx) { (*likelihood)[idx] = - nnet_cache_(frame - frame_offset_, idx) * acoustic_scale_; + nnet_out_cache_(frame - frame_offset_, idx) * acoustic_scale_; + + VLOG(4) << "nnet out: " << frame << " offset:" << frame_offset_ << " " + << nnet_out_cache_.NumRows() + << " logprob: " << nnet_out_cache_(frame - frame_offset_, idx); } return true; } +BaseFloat Decodable::LogLikelihood(int32 frame, int32 index) { + if (EnsureFrameHaveComputed(frame) == false) { + return false; + } + + CHECK_LE(index, nnet_out_cache_.NumCols()); + CHECK_LE(frame, frames_ready_); + + // the nnet output is prob rather than log prob + // index - 1: the ilabel is the nnet output id plus one (see TokenId2NnetId) + BaseFloat logprob = 0.0; + int32 frame_idx = frame - frame_offset_; + BaseFloat nnet_out = nnet_out_cache_(frame_idx, TokenId2NnetId(index)); + if (nnet_->IsLogProb()) { + logprob = nnet_out; + } else { + logprob = std::log(nnet_out + std::numeric_limits<BaseFloat>::epsilon()); + } + CHECK(!std::isnan(logprob) && !std::isinf(logprob)); + return acoustic_scale_ * logprob; +} + void Decodable::Reset() { if (frontend_ != nullptr) frontend_->Reset(); if (nnet_ != nullptr) nnet_->Reset(); frame_offset_ = 0; frames_ready_ = 0; - nnet_cache_.Resize(0, 0); + nnet_out_cache_.Resize(0, 0); +} + +void Decodable::AttentionRescoring(const std::vector<std::vector<int>>& hyps, + float reverse_weight, + std::vector<float>* rescoring_score) { + kaldi::Timer timer; + nnet_->AttentionRescoring(hyps, reverse_weight, rescoring_score); + VLOG(1) << "Attention Rescoring cost: " << timer.Elapsed() << " sec."; } } // namespace ppspeech \ No newline at end of file
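Note: the rewritten `LogLikelihood` is where the DS2/U2 difference (softmax prob vs. log-softmax output) is reconciled for the WFST decoder. A self-contained sketch of that conversion, with illustrative values:

```cpp
#include <cassert>
#include <cmath>
#include <limits>

// Mirrors Decodable::LogLikelihood above. `is_log_prob` stands in for
// nnet_->IsLogProb(): true for U2's ctc_activation (log-softmax),
// false for DS2's softmax probabilities.
float ScaledLogLikelihood(float nnet_out, bool is_log_prob, float acoustic_scale) {
    float logprob = is_log_prob
                        ? nnet_out
                        : std::log(nnet_out + std::numeric_limits<float>::epsilon());
    assert(!std::isnan(logprob) && !std::isinf(logprob));
    return acoustic_scale * logprob;  // the decoder negates this into a cost
}
```

The epsilon guard replaces the old `std::numeric_limits<BaseFloat>::min()` term, so a zero probability still yields a finite (if very negative) logprob.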
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h old mode 100644 new mode 100755 index 9555fea79..dd7b329e5 --- a/speechx/speechx/nnet/decodable.h +++ b/speechx/speechx/nnet/decodable.h @@ -24,38 +24,68 @@ struct DecodableOpts; class Decodable : public kaldi::DecodableInterface { public: - explicit Decodable(const std::shared_ptr<NnetInterface>& nnet, + explicit Decodable(const std::shared_ptr<NnetBase>& nnet, const std::shared_ptr<FrontendInterface>& frontend, kaldi::BaseFloat acoustic_scale = 1.0); + // void Init(DecodableOpts config); + + // nnet logprob output, used by wfst virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index); - virtual bool IsLastFrame(int32 frame); - virtual int32 NumIndices() const; - // not logprob + + // nnet output virtual bool FrameLikelihood(int32 frame, std::vector<kaldi::BaseFloat>* likelihood); + + // forward nnet with feats + bool AdvanceChunk(); + + // forward nnet with feats, and get nnet output + bool AdvanceChunk(kaldi::Vector<kaldi::BaseFloat>* logprobs, + int* vocab_dim); + + void AttentionRescoring(const std::vector<std::vector<int>>& hyps, + float reverse_weight, + std::vector<float>* rescoring_score); + + virtual bool IsLastFrame(int32 frame); + + // nnet output dim, e.g.
vocab size + virtual int32 NumIndices() const; + virtual int32 NumFramesReady() const; - // for offline test - void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood); + void Reset(); + bool IsInputFinished() const { return frontend_->IsFinished(); } + bool EnsureFrameHaveComputed(int32 frame); + int32 TokenId2NnetId(int32 token_id); + std::shared_ptr<NnetBase> Nnet() { return nnet_; } + + // for offline test + void Acceptlikelihood(const kaldi::Matrix<kaldi::BaseFloat>& likelihood); + private: - bool AdvanceChunk(); std::shared_ptr<FrontendInterface> frontend_; - std::shared_ptr<NnetInterface> nnet_; - kaldi::Matrix<kaldi::BaseFloat> nnet_cache_; + std::shared_ptr<NnetBase> nnet_; + + // nnet outputs' cache + kaldi::Matrix<kaldi::BaseFloat> nnet_out_cache_; + // the frame is nnet prob frame rather than audio feature frame // nnet frame subsample the feature frame // eg: 35 frame features output 8 frame inferences int32 frame_offset_; int32 frames_ready_; + // TODO: the feature frame count can mismatch the nnet inference frame count, + // so use subsampled_frame int32 current_log_post_subsampled_offset_; int32 num_chunk_computed_; + kaldi::BaseFloat acoustic_scale_; }; diff --git a/speechx/speechx/nnet/paddle_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc old mode 100644 new mode 100755 similarity index 94% rename from speechx/speechx/nnet/paddle_nnet.cc rename to speechx/speechx/nnet/ds2_nnet.cc index 881a82f51..22c7f61b8 --- a/speechx/speechx/nnet/paddle_nnet.cc +++ b/speechx/speechx/nnet/ds2_nnet.cc @@ -12,16 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" + #include "absl/strings/str_split.h" namespace ppspeech { -using std::vector; -using std::string; -using std::shared_ptr; using kaldi::Matrix; using kaldi::Vector; +using std::shared_ptr; +using std::string; +using std::vector; void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { std::vector<std::string> cache_names; @@ -48,6 +49,7 @@ void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { } PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { + subsampling_rate_ = opts.subsample_rate; paddle_infer::Config config; config.SetModel(opts.model_path, opts.param_path); if (opts.use_gpu) { @@ -143,9 +145,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) { } void PaddleNnet::FeedForward(const Vector<BaseFloat>& features, - int32 feature_dim, - Vector<BaseFloat>* inferences, - int32* inference_dim) { + const int32& feature_dim, + NnetOut* out) { paddle_infer::Predictor* predictor = GetPredictor(); int feat_row = features.Dim() / feature_dim; @@ -203,9 +204,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features, std::vector<int> output_shape = output_tensor->shape(); int32 row = output_shape[1]; int32 col = output_shape[2]; - inferences->Resize(row * col); - *inference_dim = col; - output_tensor->CopyToCpu(inferences->Data()); + + + // inferences->Resize(row * col); + // *inference_dim = col; + out->logprobs.Resize(row * col); + out->vocab_dim = col; + output_tensor->CopyToCpu(out->logprobs.Data()); ReleasePredictor(predictor); }
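Note: with the `NnetOut` struct, `FeedForward` no longer needs paired out-parameters. A hedged caller sketch (the feature sizes are illustrative; `nnet` can be any `NnetBase`, here `PaddleNnet`):

```cpp
// Sketch: drive the new FeedForward API directly.
ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags();
std::shared_ptr<ppspeech::NnetBase> nnet =
    std::make_shared<ppspeech::PaddleNnet>(model_opts);

const int feat_dim = 80;                               // illustrative
kaldi::Vector<kaldi::BaseFloat> feats(35 * feat_dim);  // 35 feature frames, zeros
ppspeech::NnetOut out;
nnet->FeedForward(feats, feat_dim, &out);

int decoder_frames = out.logprobs.Dim() / out.vocab_dim;
// out.logprobs is the flattened (decoder_frames x vocab_dim) block that
// Decodable::AdvanceChunk() copies into nnet_out_cache_.
```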
diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/ds2_nnet.h old mode 100644 new mode 100755 similarity index 50% rename from speechx/speechx/nnet/paddle_nnet.h rename to speechx/speechx/nnet/ds2_nnet.h index e2b3d5bc4..420fa1771 --- a/speechx/speechx/nnet/paddle_nnet.h +++ b/speechx/speechx/nnet/ds2_nnet.h @@ -13,64 +13,20 @@ // limitations under the License. #pragma once #include <numeric> + #include "base/common.h" #include "kaldi/matrix/kaldi-matrix.h" -#include "kaldi/util/options-itf.h" #include "nnet/nnet_itf.h" #include "paddle_inference_api.h" namespace ppspeech { -struct ModelOptions { - std::string model_path; - std::string param_path; - int thread_num; // predictor thread pool size - bool use_gpu; - bool switch_ir_optim; - std::string input_names; - std::string output_names; - std::string cache_names; - std::string cache_shape; - bool enable_fc_padding; - bool enable_profile; - ModelOptions() - : model_path(""), - param_path(""), - thread_num(2), - use_gpu(false), - input_names(""), - output_names(""), - cache_names(""), - cache_shape(""), - switch_ir_optim(false), - enable_fc_padding(false), - enable_profile(false) {} - - void Register(kaldi::OptionsItf* opts) { - opts->Register("model-path", &model_path, "model file path"); - opts->Register("model-param", &param_path, "params model file path"); - opts->Register("thread-num", &thread_num, "thread num"); - opts->Register("use-gpu", &use_gpu, "if use gpu"); - opts->Register("input-names", &input_names, "paddle input names"); - opts->Register("output-names", &output_names, "paddle output names"); - opts->Register("cache-names", &cache_names, "cache names"); - opts->Register("cache-shape", &cache_shape, "cache shape"); - opts->Register("switch-ir-optiom", - &switch_ir_optim, - "paddle SwitchIrOptim option"); - opts->Register("enable-fc-padding", - &enable_fc_padding, - "paddle EnableFCPadding option"); - opts->Register( - "enable-profile", &enable_profile, "paddle EnableProfile option"); - } -}; template <typename T> class Tensor { public: Tensor() {} - Tensor(const std::vector<int>& shape) : _shape(shape) { + explicit Tensor(const std::vector<int>& shape) : _shape(shape) { int neml = std::accumulate( _shape.begin(), _shape.end(), 1, std::multiplies<int>()); LOG(INFO) << "Tensor neml: " << neml; @@ -92,21 +48,35 @@ class Tensor { std::vector<T> _data; }; -class PaddleNnet : public NnetInterface { +class PaddleNnet : public NnetBase { public: - PaddleNnet(const ModelOptions& opts); + explicit PaddleNnet(const ModelOptions& opts); - virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, - int32 feature_dim, - kaldi::Vector<kaldi::BaseFloat>* inferences, - int32* inference_dim); + void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, + const int32& feature_dim, + NnetOut* out) override; + + void AttentionRescoring(const std::vector<std::vector<int>>& hyps, + float reverse_weight, + std::vector<float>* rescoring_score) override { + VLOG(2) << "deepspeech2 does not have AttentionRescoring."; + } void Dim(); - virtual void Reset(); + + void Reset() override; + + bool IsLogProb() override { return false; } + + + std::shared_ptr<Tensor<BaseFloat>> GetCacheEncoder( const std::string& name); + void InitCacheEncouts(const ModelOptions& opts); + void EncoderOuts(std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out) + const override {} + private: paddle_infer::Predictor* GetPredictor(); int ReleasePredictor(paddle_infer::Predictor* predictor); @@ -117,6 +87,7 @@ class PaddleNnet : public NnetInterface { std::map<paddle_infer::Predictor*, int> predictor_to_thread_id; std::map<std::string, int> cache_names_idx_; std::vector<std::shared_ptr<Tensor<BaseFloat>>> cache_encouts_; + ModelOptions opts_; public: diff --git a/speechx/speechx/nnet/nnet_forward_main.cc b/speechx/speechx/nnet/ds2_nnet_main.cc old mode 100644 new mode 100755 similarity index 77% rename from speechx/speechx/nnet/nnet_forward_main.cc rename to speechx/speechx/nnet/ds2_nnet_main.cc index 0d4ea8ff7..6092b8a4c --- a/speechx/speechx/nnet/nnet_forward_main.cc +++ b/speechx/speechx/nnet/ds2_nnet_main.cc @@ -12,45 +12,27 @@ // See the License for the specific
language governing permissions and // limitations under the License. -#include "base/flags.h" -#include "base/log.h" +#include "base/common.h" +#include "decoder/param.h" #include "frontend/audio/assembler.h" #include "frontend/audio/data_cache.h" #include "kaldi/util/table-types.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" DEFINE_string(feature_rspecifier, "", "test feature rspecifier"); DEFINE_string(nnet_prob_wspecifier, "", "nnet prob wspecifier"); -DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); -DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); -DEFINE_int32(nnet_decoder_chunk, 1, "paddle nnet forward chunk"); -DEFINE_int32(receptive_field_length, - 7, - "receptive field of two CNN(kernel=3) downsampling module."); -DEFINE_int32(downsampling_rate, - 4, - "two CNN(kernel=3) module downsampling rate."); -DEFINE_string( - model_input_names, - "audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box", - "model input names"); -DEFINE_string(model_output_names, - "softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0", - "model output names"); -DEFINE_string(model_cache_names, - "chunk_state_h_box,chunk_state_c_box", - "model cache names"); -DEFINE_string(model_cache_shapes, "5-1-1024,5-1-1024", "model cache shapes"); -DEFINE_double(acoustic_scale, 1.0, "acoustic scale"); using kaldi::BaseFloat; using kaldi::Matrix; using std::vector; int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; kaldi::SequentialBaseFloatMatrixReader feature_reader( FLAGS_feature_rspecifier); @@ -62,13 +44,8 @@ int main(int argc, char* argv[]) { int32 num_done = 0, num_err = 0; - ppspeech::ModelOptions model_opts; - model_opts.model_path = model_graph; - model_opts.param_path = model_params; - model_opts.cache_names = FLAGS_model_cache_names; - model_opts.cache_shape = FLAGS_model_cache_shapes; - model_opts.input_names = FLAGS_model_input_names; - model_opts.output_names = FLAGS_model_output_names; + ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags(); + std::shared_ptr<ppspeech::PaddleNnet> nnet( new ppspeech::PaddleNnet(model_opts)); std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache()); @@ -76,8 +53,8 @@ int main(int argc, char* argv[]) { new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale)); int32 chunk_size = FLAGS_receptive_field_length + - (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate; - int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk; + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate; + int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk; int32 receptive_field_length = FLAGS_receptive_field_length; LOG(INFO) << "chunk size (frame): " << chunk_size; LOG(INFO) << "chunk stride (frame): " << chunk_stride; @@ -146,7 +123,7 @@ int main(int argc, char* argv[]) { } kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(), prob_vec[0].Dim()); - for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { + for (int row_idx = 0; row_idx < prob_vec.size(); ++row_idx) { for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) { result(row_idx, col_idx) = prob_vec[row_idx](col_idx); }
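Note: the renamed `--subsampling_rate` flag feeds the chunk arithmetic above. With the defaults from the deleted DEFINEs (receptive field 7, subsampling 4) and an illustrative decoder chunk of 8, it reproduces the "35 frame features output 8 frame inferences" relationship documented in decodable.h:

```cpp
#include <cstdio>

int main() {
    int receptive_field_length = 7;  // default of the (removed) flag above
    int subsampling_rate = 4;        // two CNN(kernel=3) downsampling modules
    int nnet_decoder_chunk = 8;      // illustrative; the flag defaults to 1

    int chunk_size = receptive_field_length +
                     (nnet_decoder_chunk - 1) * subsampling_rate;
    int chunk_stride = subsampling_rate * nnet_decoder_chunk;

    std::printf("chunk_size=%d chunk_stride=%d\n", chunk_size, chunk_stride);
    // prints: chunk_size=35 chunk_stride=32
    return 0;
}
```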
diff --git a/speechx/speechx/nnet/nnet_itf.h b/speechx/speechx/nnet/nnet_itf.h old mode 100644 new mode 100755 index ac040fbaa..a504cce51 --- a/speechx/speechx/nnet/nnet_itf.h +++ b/speechx/speechx/nnet/nnet_itf.h @@ -11,24 +11,110 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - - #pragma once #include "base/basic_types.h" #include "kaldi/base/kaldi-types.h" #include "kaldi/matrix/kaldi-matrix.h" +#include "kaldi/util/options-itf.h" + +DECLARE_int32(subsampling_rate); +DECLARE_string(model_path); +DECLARE_string(param_path); +DECLARE_string(model_input_names); +DECLARE_string(model_output_names); +DECLARE_string(model_cache_names); +DECLARE_string(model_cache_shapes); namespace ppspeech { +struct ModelOptions { + // common + int subsample_rate{1}; + int thread_num{1}; // predictor thread pool size for ds2; + bool use_gpu{false}; + std::string model_path; + + std::string param_path; + + // ds2 inference options + std::string input_names{}; + std::string output_names{}; + std::string cache_names{}; + std::string cache_shape{}; + bool switch_ir_optim{false}; + bool enable_fc_padding{false}; + bool enable_profile{false}; + + static ModelOptions InitFromFlags() { + ModelOptions opts; + opts.subsample_rate = FLAGS_subsampling_rate; + LOG(INFO) << "subsampling rate: " << opts.subsample_rate; + opts.model_path = FLAGS_model_path; + LOG(INFO) << "model path: " << opts.model_path; + + opts.param_path = FLAGS_param_path; + LOG(INFO) << "param path: " << opts.param_path; + + LOG(INFO) << "DS2 param: "; + opts.cache_names = FLAGS_model_cache_names; + LOG(INFO) << " cache names: " << opts.cache_names; + opts.cache_shape = FLAGS_model_cache_shapes; + LOG(INFO) << " cache shape: " << opts.cache_shape; + opts.input_names = FLAGS_model_input_names; + LOG(INFO) << " input names: " << opts.input_names; + opts.output_names = FLAGS_model_output_names; + LOG(INFO) << " output names: " << opts.output_names; + return opts; + } +}; + +struct NnetOut { + // nnet output, either logprob or prob; most of the time this is logprob. + kaldi::Vector<kaldi::BaseFloat> logprobs; + int32 vocab_dim; + + // nnet state. Only used in attention models. + std::vector<kaldi::Vector<kaldi::BaseFloat>> encoder_outs; + + NnetOut() : logprobs({}), vocab_dim(-1), encoder_outs({}) {} +}; + + class NnetInterface { public: + virtual ~NnetInterface() {} + + // forward feats with nnet. + // nnet does not cache feats; feats are cached by the frontend. + // nnet caches model state, i.e. encoder_outs, att_cache, cnn_cache, + // frame_offset. virtual void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, - int32 feature_dim, - kaldi::Vector<kaldi::BaseFloat>* inferences, - int32* inference_dim) = 0; + const int32& feature_dim, + NnetOut* out) = 0; + + virtual void AttentionRescoring(const std::vector<std::vector<int>>& hyps, + float reverse_weight, + std::vector<float>* rescoring_score) = 0; + + // reset nnet state, e.g. nnet_logprob_cache_, offset_, encoder_outs_. virtual void Reset() = 0; - virtual ~NnetInterface() {} + + // true if nnet output is logprob; otherwise it is prob. + virtual bool IsLogProb() = 0; + + // used to get encoder outs, e.g. seq2seq with attention model. + virtual void EncoderOuts( + std::vector<kaldi::Vector<kaldi::BaseFloat>>* encoder_out) const = 0; +}; + + +class NnetBase : public NnetInterface { public: + int SubsamplingRate() const { return subsampling_rate_; } + + protected: + int subsampling_rate_{1}; }; } // namespace ppspeech
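Note: to make the `NnetBase` contract concrete, here is a minimal hypothetical implementation (not part of this PR) that emits a uniform log-distribution. It shows which members a new backend must provide and how `NnetOut` is filled:

```cpp
#include <cmath>
// Assumes "nnet/nnet_itf.h" is included.

class UniformNnet : public ppspeech::NnetBase {
  public:
    void FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features,
                     const int32& feature_dim,
                     ppspeech::NnetOut* out) override {
        const int vocab = 10;                       // illustrative vocab size
        int frames = features.Dim() / feature_dim;  // no subsampling here
        out->vocab_dim = vocab;
        out->logprobs.Resize(frames * vocab);
        out->logprobs.Set(std::log(1.0f / vocab));  // uniform logprob per frame
    }
    void AttentionRescoring(const std::vector<std::vector<int>>& hyps,
                            float reverse_weight,
                            std::vector<float>* rescoring_score) override {
        rescoring_score->assign(hyps.size(), 0.0f);  // no decoder to rescore with
    }
    void Reset() override {}  // stateless
    bool IsLogProb() override { return true; }
    void EncoderOuts(std::vector<kaldi::Vector<kaldi::BaseFloat>>*)
        const override {}  // no encoder states
};
```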
diff --git a/speechx/speechx/nnet/u2_nnet.cc b/speechx/speechx/nnet/u2_nnet.cc new file mode 100755 index 000000000..7707406a1 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.cc @@ -0,0 +1,667 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.cc + +#include "nnet/u2_nnet.h" + +#ifdef USE_PROFILING +#include "paddle/fluid/platform/profiler.h" +using paddle::platform::RecordEvent; +using paddle::platform::TracerEventType; +#endif // end USE_PROFILING + +namespace ppspeech { + + +void U2Nnet::LoadModel(const std::string& model_path_w_prefix) { + paddle::jit::utils::InitKernelSignatureMap(); + +#ifdef USE_GPU + dev_ = phi::GPUPlace(); +#else + dev_ = phi::CPUPlace(); +#endif + paddle::jit::Layer model = paddle::jit::Load(model_path_w_prefix, dev_); + model_ = std::make_shared<paddle::jit::Layer>(std::move(model)); + + subsampling_rate_ = model_->Attribute<int>("subsampling_rate"); + right_context_ = model_->Attribute<int>("right_context"); + sos_ = model_->Attribute<int>("sos_symbol"); + eos_ = model_->Attribute<int>("eos_symbol"); + is_bidecoder_ = model_->Attribute<int>("is_bidirectional_decoder"); + + forward_encoder_chunk_ = model_->Function("forward_encoder_chunk"); + forward_attention_decoder_ = model_->Function("forward_attention_decoder"); + ctc_activation_ = model_->Function("ctc_activation"); + CHECK(forward_encoder_chunk_.IsValid()); + CHECK(forward_attention_decoder_.IsValid()); + CHECK(ctc_activation_.IsValid()); + + LOG(INFO) << "Paddle Model Info: "; + LOG(INFO) << "\tsubsampling_rate " << subsampling_rate_; + LOG(INFO) << "\tright context " << right_context_; + LOG(INFO) << "\tsos " << sos_; + LOG(INFO) << "\teos " << eos_; + LOG(INFO) << "\tis bidecoder " << is_bidecoder_ << std::endl; + + Warmup(); +} + +void U2Nnet::Warmup() { +#ifdef USE_PROFILING + RecordEvent event("warmup", TracerEventType::UserDefined, 1); +#endif + + { +#ifdef USE_PROFILING + RecordEvent event( + "warmup-encoder-ctc", TracerEventType::UserDefined, 1); +#endif + int feat_dim = 80; + int frame_num = 16 * 4 + 3; // chunk_size * downsample_rate + + // (receptive_field - downsample_rate) + paddle::Tensor feats = paddle::full( + {1, frame_num, feat_dim}, 0.12f, paddle::DataType::FLOAT32); + paddle::Tensor offset = paddle::zeros({1}, paddle::DataType::INT32); + paddle::Tensor att_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + paddle::Tensor cnn_cache = + paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32); + std::vector<paddle::Tensor> inputs = { + feats, offset, /*required_cache_size, */ att_cache, cnn_cache}; + std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs); + + auto chunk_out = outputs[0]; + inputs = std::move(std::vector<paddle::Tensor>({chunk_out})); + outputs = ctc_activation_(inputs); + } + + { +#ifdef USE_PROFILING + RecordEvent event("warmup-decoder", TracerEventType::UserDefined, 1); +#endif + auto hyps = + paddle::full({10, 8}, 10, paddle::DataType::INT64, phi::CPUPlace()); + auto hyps_lens = + paddle::full({10}, 8, paddle::DataType::INT64, phi::CPUPlace()); + auto encoder_out = paddle::ones( + {1, 20, 512}, paddle::DataType::FLOAT32, phi::CPUPlace()); + 
std::vector<paddle::Tensor> inputs{ + hyps, hyps_lens, encoder_out}; + + std::vector<paddle::Tensor> outputs = + forward_attention_decoder_(inputs); + } + + Reset(); +} + +U2Nnet::U2Nnet(const ModelOptions& opts) : opts_(opts) { + LoadModel(opts_.model_path); +} + +// shallow copy +U2Nnet::U2Nnet(const U2Nnet& other) { + // copy meta + right_context_ = other.right_context_; + subsampling_rate_ = other.subsampling_rate_; + sos_ = other.sos_; + eos_ = other.eos_; + is_bidecoder_ = other.is_bidecoder_; + chunk_size_ = other.chunk_size_; + num_left_chunks_ = other.num_left_chunks_; + + forward_encoder_chunk_ = other.forward_encoder_chunk_; + forward_attention_decoder_ = other.forward_attention_decoder_; + ctc_activation_ = other.ctc_activation_; + + offset_ = other.offset_; + + // copy model ptr + model_ = other.model_; + + // ignore inner states +} + +std::shared_ptr<NnetBase> U2Nnet::Copy() const { + auto asr_model = std::make_shared<U2Nnet>(*this); + // reset inner state for new decoding + asr_model->Reset(); + return asr_model; +} + +void U2Nnet::Reset() { + offset_ = 0; + + att_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + cnn_cache_ = + std::move(paddle::zeros({0, 0, 0, 0}, paddle::DataType::FLOAT32)); + + encoder_outs_.clear(); + VLOG(3) << "u2nnet reset"; +} + +// Debug API +void U2Nnet::FeedEncoderOuts(const paddle::Tensor& encoder_out) { + // encoder_out (T,D) + encoder_outs_.clear(); + encoder_outs_.push_back(encoder_out); +} + + +void U2Nnet::FeedForward(const kaldi::Vector<kaldi::BaseFloat>& features, + const int32& feature_dim, + NnetOut* out) { + kaldi::Timer timer; + std::vector<kaldi::BaseFloat> chunk_feats(features.Data(), + features.Data() + features.Dim()); + + std::vector<kaldi::BaseFloat> ctc_probs; + ForwardEncoderChunkImpl( + chunk_feats, feature_dim, &ctc_probs, &out->vocab_dim); + + out->logprobs.Resize(ctc_probs.size(), kaldi::kSetZero); + std::memcpy(out->logprobs.Data(), + ctc_probs.data(), + ctc_probs.size() * sizeof(kaldi::BaseFloat)); + VLOG(1) << "FeedForward cost: " << timer.Elapsed() << " sec. " + << chunk_feats.size() / feature_dim << " frames."; +}
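Note: `Copy()` is the intended way to fan one loaded model out to many decoding streams: the `paddle::jit::Layer` is shared, while offset and caches start from a reset state per clone. A short hedged sketch (the `opts` variable is assumed to carry a valid `model_path`):

```cpp
// Sketch: one warm model, one cheap clone per recognizer instance.
std::shared_ptr<ppspeech::U2Nnet> warm =
    std::make_shared<ppspeech::U2Nnet>(opts);  // LoadModel + Warmup once
std::shared_ptr<ppspeech::NnetBase> per_stream = warm->Copy();
// per_stream shares model_ but begins from Reset(): offset_ == 0 and
// empty att_cache_/cnn_cache_/encoder_outs_.
```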
 + + +void U2Nnet::ForwardEncoderChunkImpl( + const std::vector<kaldi::BaseFloat>& chunk_feats, + const int32& feat_dim, + std::vector<kaldi::BaseFloat>* out_prob, + int32* vocab_dim) { +#ifdef USE_PROFILING + RecordEvent event( + "ForwardEncoderChunkImpl", TracerEventType::UserDefined, 1); +#endif + + // 1. splice cached_feature, and chunk_feats + // First dimension is B, which is 1. + // int num_frames = cached_feats_.size() + chunk_feats.size(); + int num_frames = chunk_feats.size() / feat_dim; + VLOG(3) << "num_frames: " << num_frames; + VLOG(3) << "feat_dim: " << feat_dim; + + // feats (B=1,T,D) + paddle::Tensor feats = + paddle::zeros({1, num_frames, feat_dim}, paddle::DataType::FLOAT32); + float* feats_ptr = feats.mutable_data<float>(); + + // do not cache features in nnet + CHECK_EQ(cached_feats_.size(), 0); + // CHECK_EQ(std::is_same<float, kaldi::BaseFloat>::value, true); + std::memcpy(feats_ptr, + chunk_feats.data(), + chunk_feats.size() * sizeof(kaldi::BaseFloat)); + + VLOG(3) << "feats shape: " << feats.shape()[0] << ", " << feats.shape()[1] + << ", " << feats.shape()[2]; + +#ifdef TEST_DEBUG + { + std::stringstream path("feat", std::ios_base::app | std::ios_base::out); + path << offset_; + std::ofstream feat_fobj(path.str().c_str(), std::ios::out); + CHECK(feat_fobj.is_open()); + // feat_fobj << feats.shape()[0] << " " << feats.shape()[1] << " " + // << feats.shape()[2] << "\n"; + for (int i = 0; i < feats.numel(); i++) { + feat_fobj << std::setprecision(18) << feats_ptr[i] << " "; + if ((i + 1) % feat_dim == 0) { + feat_fobj << "\n"; + } + } + feat_fobj << "\n"; + } +#endif + +// Encoder chunk forward #ifdef USE_GPU + feats = feats.copy_to(paddle::GPUPlace(), /*blocking*/ false); + att_cache_ = att_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false); + cnn_cache_ = cnn_cache_.copy_to(paddle::GPUPlace(), /*blocking*/ false); +#endif + + int required_cache_size = num_left_chunks_ * chunk_size_; // -1 * 16 + // must be scalar, but paddle does not have scalar. + paddle::Tensor offset = paddle::full({1}, offset_, paddle::DataType::INT32); + // freeze `required_cache_size` in graph, so it is not specified in the + // function call. + std::vector<paddle::Tensor> inputs = { + feats, offset, /*required_cache_size, */ att_cache_, cnn_cache_}; + CHECK_EQ(inputs.size(), 4); + std::vector<paddle::Tensor> outputs = forward_encoder_chunk_(inputs); + CHECK_EQ(outputs.size(), 3); + +#ifdef USE_GPU + paddle::Tensor chunk_out = outputs[0].copy_to(paddle::CPUPlace()); + att_cache_ = outputs[1].copy_to(paddle::CPUPlace()); + cnn_cache_ = outputs[2].copy_to(paddle::CPUPlace()); +#else + paddle::Tensor chunk_out = outputs[0]; + att_cache_ = outputs[1]; + cnn_cache_ = outputs[2]; +#endif + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits", + std::ios_base::app | std::ios_base::out); + auto i = offset_ - chunk_out.shape()[1]; + path << std::max(i, 0L); + std::ofstream logits_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_fobj.is_open()); + logits_fobj << chunk_out.shape()[0] << " " << chunk_out.shape()[1] + << " " << chunk_out.shape()[2] << "\n"; + const float* chunk_out_ptr = chunk_out.data<float>(); + logits_fobj << chunk_out_ptr << std::endl; + for (int i = 0; i < chunk_out.numel(); i++) { + logits_fobj << chunk_out_ptr[i] << " "; + } + logits_fobj << "\n"; + } +#endif // end TEST_DEBUG + + // current offset in decoder frame + // not used in nnet + offset_ += chunk_out.shape()[1]; + VLOG(2) << "encoder out chunk size: " << chunk_out.shape()[1] + << " total: " << offset_; + + + // collects encoder outs.
 + encoder_outs_.push_back(chunk_out); + VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data<float>(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#ifdef USE_GPU + +#error "Not implemented." + +#else + // compute ctc_activation == log_softmax + inputs.clear(); + outputs.clear(); + inputs.push_back(chunk_out); + CHECK_EQ(inputs.size(), 1); + outputs = ctc_activation_(inputs); + CHECK_EQ(outputs.size(), 1); + paddle::Tensor ctc_log_probs = outputs[0]; + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logprob", + std::ios_base::app | std::ios_base::out); + path << offset_ - chunk_out.shape()[1]; + + std::ofstream logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(logprob_fobj.is_open()); + logprob_fobj << ctc_log_probs.shape()[0] << " " + << ctc_log_probs.shape()[1] << " " + << ctc_log_probs.shape()[2] << "\n"; + const float* logprob_ptr = ctc_log_probs.data<float>(); + for (int i = 0; i < ctc_log_probs.numel(); i++) { + logprob_fobj << logprob_ptr[i] << " "; + if ((i + 1) % ctc_log_probs.shape()[2] == 0) { + logprob_fobj << "\n"; + } + } + logprob_fobj << "\n"; + } +#endif // end TEST_DEBUG + +#endif // end USE_GPU + + // Copy to output, (B=1,T,D) + std::vector<int64_t> ctc_log_probs_shape = ctc_log_probs.shape(); + CHECK_EQ(ctc_log_probs_shape.size(), 3); + int B = ctc_log_probs_shape[0]; + CHECK_EQ(B, 1); + int T = ctc_log_probs_shape[1]; + int D = ctc_log_probs_shape[2]; + *vocab_dim = D; + + float* ctc_log_probs_ptr = ctc_log_probs.data<float>(); + + out_prob->resize(T * D); + std::memcpy( + out_prob->data(), ctc_log_probs_ptr, T * D * sizeof(kaldi::BaseFloat)); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_list_ctc", + std::ios_base::app | std::ios_base::out); + path << offset_ - encoder_outs_[0].shape()[1]; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[0].data<float>(); + logits_out_fobj << encoder_outs_ptr << std::endl; + for (int i = 0; i < encoder_outs_[0].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } +#endif // end TEST_DEBUG + + return; +} + +float U2Nnet::ComputePathScore(const paddle::Tensor& prob, + const std::vector<int>& hyp, + int eos) { + // sum `hyp` path scores in `prob` + // prob (1, Umax, V) + // hyp (U,) + float score = 0.0f; + std::vector<int64_t> dims = prob.shape(); + CHECK_EQ(dims.size(), 3); + VLOG(2) << "prob shape: " << dims[0] << ", " << dims[1] << ", " << dims[2]; + CHECK_EQ(dims[0], 1); + int vocab_dim = static_cast<int>(dims[2]); + + const float* prob_ptr = prob.data<float>(); + for (size_t i = 0; i < hyp.size(); ++i) { + const float* row = prob_ptr + i * vocab_dim; + score += row[hyp[i]]; + } + const float* row = prob_ptr + hyp.size() * vocab_dim; + score += row[eos]; + return score; +}
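Note: `ComputePathScore` walks one hypothesis through a (Umax, V) score matrix and adds the terminating eos entry; `AttentionRescoring` below then blends forward and reverse decoder scores. A toy, self-contained re-implementation with illustrative types and numbers:

```cpp
#include <cstddef>
#include <vector>

// prob: flattened (steps x vocab) decoder scores for one hypothesis.
float PathScore(const std::vector<float>& prob, int vocab,
                const std::vector<int>& hyp, int eos) {
    float score = 0.0f;
    for (size_t i = 0; i < hyp.size(); ++i) {
        score += prob[i * vocab + hyp[i]];  // score of token hyp[i] at step i
    }
    return score + prob[hyp.size() * vocab + eos];  // terminating eos score
}

// Final blend used by AttentionRescoring below:
float CombineScores(float score, float r_score, float reverse_weight) {
    return score * (1 - reverse_weight) + r_score * reverse_weight;
}
```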
+= row[eos]; + return score; +} + + +void U2Nnet::AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) { +#ifdef USE_PROFILING + RecordEvent event("AttentionRescoring", TracerEventType::UserDefined, 1); +#endif + CHECK(rescoring_score != nullptr); + + int num_hyps = hyps.size(); + rescoring_score->resize(num_hyps, 0.0f); + + if (num_hyps == 0) return; + VLOG(2) << "num hyps: " << num_hyps; + + if (encoder_outs_.size() == 0) { + // no encoder outs + std::cerr << "encoder_outs_.size() is zero. Please check it." + << std::endl; + return; + } + + // prepare input + paddle::Tensor hyps_lens = + paddle::zeros({num_hyps}, paddle::DataType::INT64); + int64_t* hyps_len_ptr = hyps_lens.mutable_data(); + int max_hyps_len = 0; + for (size_t i = 0; i < num_hyps; ++i) { + int len = hyps[i].size() + 1; // eos + max_hyps_len = std::max(max_hyps_len, len); + hyps_len_ptr[i] = static_cast(len); + } + VLOG(2) << "max_hyps_len: " << max_hyps_len; + + paddle::Tensor hyps_tensor = + paddle::full({num_hyps, max_hyps_len}, eos_, paddle::DataType::INT64); + int64_t* hyps_ptr = hyps_tensor.mutable_data(); + for (size_t i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + int64_t* row = hyps_ptr + max_hyps_len * i; + row[0] = sos_; + for (size_t j = 0; j < hyp.size(); ++j) { + row[j + 1] = hyp[j]; + } + } + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_logits_concat", + std::ios_base::app | std::ios_base::out); + for (int j = 0; j < encoder_outs_.size(); j++) { + path << j; + std::ofstream logits_out_fobj(path.str().c_str(), std::ios::out); + CHECK(logits_out_fobj.is_open()); + logits_out_fobj << encoder_outs_[j].shape()[0] << " " + << encoder_outs_[j].shape()[1] << " " + << encoder_outs_[j].shape()[2] << "\n"; + const float* encoder_outs_ptr = encoder_outs_[j].data(); + for (int i = 0; i < encoder_outs_[j].numel(); i++) { + logits_out_fobj << encoder_outs_ptr[i] << " "; + } + logits_out_fobj << "\n"; + } + } +#endif // end TEST_DEBUG + + // forward attention decoder by hyps and correspoinding encoder_outs_ + paddle::Tensor encoder_out = paddle::concat(encoder_outs_, 1); + VLOG(2) << "encoder_outs_ size: " << encoder_outs_.size(); + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out0", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_outs_[0].shape()[0] << " " + << encoder_outs_[0].shape()[1] << " " + << encoder_outs_[0].shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_outs_[0].data(); + + size_t size = encoder_outs_[0].numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("encoder_out", + std::ios_base::app | std::ios_base::out); + std::ofstream encoder_out_fobj(path.str().c_str(), std::ios::out); + CHECK(encoder_out_fobj.is_open()); + + encoder_out_fobj << encoder_out.shape()[0] << " " + << encoder_out.shape()[1] << " " + << encoder_out.shape()[2] << "\n"; + const float* enc_logprob_ptr = encoder_out.data(); + + size_t size = encoder_out.numel(); + for (int i = 0; i < size; i++) { + encoder_out_fobj << enc_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + std::vector inputs{ + hyps_tensor, hyps_lens, encoder_out}; + std::vector outputs = forward_attention_decoder_(inputs); + CHECK_EQ(outputs.size(), 2); + + // (B, Umax, V) + paddle::Tensor probs = 
outputs[0]; + std::vector probs_shape = probs.shape(); + CHECK_EQ(probs_shape.size(), 3); + CHECK_EQ(probs_shape[0], num_hyps); + CHECK_EQ(probs_shape[1], max_hyps_len); + +#ifdef TEST_DEBUG + { + std::stringstream path("decoder_logprob", + std::ios_base::app | std::ios_base::out); + std::ofstream dec_logprob_fobj(path.str().c_str(), std::ios::out); + CHECK(dec_logprob_fobj.is_open()); + + dec_logprob_fobj << probs.shape()[0] << " " << probs.shape()[1] << " " + << probs.shape()[2] << "\n"; + const float* dec_logprob_ptr = probs.data(); + + size_t size = probs.numel(); + for (int i = 0; i < size; i++) { + dec_logprob_fobj << dec_logprob_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_lens", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_len_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_len_fobj.is_open()); + + const int64_t* hyps_lens_ptr = hyps_lens.data(); + + size_t size = hyps_lens.numel(); + for (int i = 0; i < size; i++) { + hyps_len_fobj << hyps_lens_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + +#ifdef TEST_DEBUG + { + std::stringstream path("hyps_tensor", + std::ios_base::app | std::ios_base::out); + std::ofstream hyps_tensor_fobj(path.str().c_str(), std::ios::out); + CHECK(hyps_tensor_fobj.is_open()); + + const int64_t* hyps_tensor_ptr = hyps_tensor.data(); + + size_t size = hyps_tensor.numel(); + for (int i = 0; i < size; i++) { + hyps_tensor_fobj << hyps_tensor_ptr[i] << "\n"; + } + } +#endif // end TEST_DEBUG + + paddle::Tensor r_probs = outputs[1]; + std::vector r_probs_shape = r_probs.shape(); + if (is_bidecoder_ && reverse_weight > 0) { + CHECK_EQ(r_probs_shape.size(), 3); + CHECK_EQ(r_probs_shape[0], num_hyps); + CHECK_EQ(r_probs_shape[1], max_hyps_len); + } else { + // dump r_probs + CHECK_EQ(r_probs_shape.size(), 1); + CHECK_EQ(r_probs_shape[0], 1) << r_probs_shape[0]; + } + + // compute rescoring score + using IntArray = paddle::experimental::IntArray; + std::vector probs_v = + paddle::experimental::split_with_num(probs, num_hyps, 0); + VLOG(2) << "split prob: " << probs_v.size() << " " + << probs_v[0].shape().size() << " 0: " << probs_v[0].shape()[0] + << ", " << probs_v[0].shape()[1] << ", " << probs_v[0].shape()[2]; + CHECK(static_cast(probs_v.size()) == num_hyps) + << ": is " << probs_v.size() << " expect: " << num_hyps; + + std::vector r_probs_v; + if (is_bidecoder_ && reverse_weight > 0) { + r_probs_v = paddle::experimental::split_with_num(r_probs, num_hyps, 0); + CHECK(static_cast(r_probs_v.size()) == num_hyps) + << "r_probs_v size: is " << r_probs_v.size() + << " expect: " << num_hyps; + } + + for (int i = 0; i < num_hyps; ++i) { + const std::vector& hyp = hyps[i]; + + // left-to-right decoder score + float score = 0.0f; + score = ComputePathScore(probs_v[i], hyp, eos_); + + // right-to-left decoder score + float r_score = 0.0f; + if (is_bidecoder_ && reverse_weight > 0) { + std::vector r_hyp(hyp.size()); + std::reverse_copy(hyp.begin(), hyp.end(), r_hyp.begin()); + r_score = ComputePathScore(r_probs_v[i], r_hyp, eos_); + } + + // combinded left-to-right and right-to-lfet score + (*rescoring_score)[i] = + score * (1 - reverse_weight) + r_score * reverse_weight; + VLOG(3) << "hyp " << i << " " << hyp.size() << " score: " << score + << " r_score: " << r_score + << " reverse_weight: " << reverse_weight + << " final score: " << (*rescoring_score)[i]; + } +} + + +void U2Nnet::EncoderOuts( + std::vector>* encoder_out) const { + // list of (B=1,T,D) + int size = 
encoder_outs_.size(); + VLOG(3) << "encoder_outs_ size: " << size; + + for (int i = 0; i < size; i++) { + const paddle::Tensor& item = encoder_outs_[i]; + const std::vector shape = item.shape(); + CHECK_EQ(shape.size(), 3); + const int& B = shape[0]; + const int& T = shape[1]; + const int& D = shape[2]; + CHECK(B == 1) << "Only support batch one."; + VLOG(3) << "encoder out " << i << " shape: (" << B << "," << T << "," + << D << ")"; + + const float* this_tensor_ptr = item.data(); + for (int j = 0; j < T; j++) { + const float* cur = this_tensor_ptr + j * D; + kaldi::Vector out(D); + std::memcpy(out.Data(), cur, D * sizeof(kaldi::BaseFloat)); + encoder_out->emplace_back(out); + } + } +} + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet.h b/speechx/speechx/nnet/u2_nnet.h new file mode 100755 index 000000000..23cc0ea3b --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet.h @@ -0,0 +1,132 @@ +// Copyright 2022 Horizon Robotics. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// modified from +// https://github.com/wenet-e2e/wenet/blob/main/runtime/core/decoder/asr_model.h +#pragma once + +#include "base/common.h" +#include "kaldi/matrix/kaldi-matrix.h" +#include "nnet/nnet_itf.h" +#include "paddle/extension.h" +#include "paddle/jit/all.h" +#include "paddle/phi/api/all.h" + +namespace ppspeech { + + +class U2NnetBase : public NnetBase { + public: + virtual int Context() const { return right_context_ + 1; } + virtual int RightContext() const { return right_context_; } + + virtual int EOS() const { return eos_; } + virtual int SOS() const { return sos_; } + virtual int IsBidecoder() const { return is_bidecoder_; } + // current offset in decoder frame + virtual int Offset() const { return offset_; } + virtual void SetChunkSize(int chunk_size) { chunk_size_ = chunk_size; } + virtual void SetNumLeftChunks(int num_left_chunks) { + num_left_chunks_ = num_left_chunks; + } + + virtual std::shared_ptr Copy() const = 0; + + protected: + virtual void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + const int32& feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) = 0; + + protected: + // model specification + int right_context_{0}; + + int sos_{0}; + int eos_{0}; + + bool is_bidecoder_{false}; + + int chunk_size_{16}; // num of decoder frames. If chunk_size > 0, streaming + // case. Otherwise, none streaming case + int num_left_chunks_{-1}; // -1 means all left chunks + + // asr decoder state, not used in nnet + int offset_{0}; // current offset in encoder output time stamp. Used by + // position embedding. 
+ std::vector> cached_feats_{}; // features cache +}; + + +class U2Nnet : public U2NnetBase { + public: + explicit U2Nnet(const ModelOptions& opts); + U2Nnet(const U2Nnet& other); + + void FeedForward(const kaldi::Vector& features, + const int32& feature_dim, + NnetOut* out) override; + + void Reset() override; + + bool IsLogProb() override { return true; } + + void Dim(); + + void LoadModel(const std::string& model_path_w_prefix); + void Warmup(); + + std::shared_ptr model() const { return model_; } + + std::shared_ptr Copy() const override; + + void ForwardEncoderChunkImpl( + const std::vector& chunk_feats, + const int32& feat_dim, + std::vector* ctc_probs, + int32* vocab_dim) override; + + float ComputePathScore(const paddle::Tensor& prob, + const std::vector& hyp, + int eos); + + void AttentionRescoring(const std::vector>& hyps, + float reverse_weight, + std::vector* rescoring_score) override; + + // debug + void FeedEncoderOuts(const paddle::Tensor& encoder_out); + + void EncoderOuts( + std::vector>* encoder_out) const; + + private: + ModelOptions opts_; + + phi::Place dev_; + std::shared_ptr model_{nullptr}; + std::vector encoder_outs_; + // transformer/conformer attention cache + paddle::Tensor att_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + // conformer-only conv_module cache + paddle::Tensor cnn_cache_ = paddle::full({0, 0, 0, 0}, 0.0); + + paddle::jit::Function forward_encoder_chunk_; + paddle::jit::Function forward_attention_decoder_; + paddle::jit::Function ctc_activation_; +}; + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/nnet/u2_nnet_main.cc b/speechx/speechx/nnet/u2_nnet_main.cc new file mode 100755 index 000000000..53fc55546 --- /dev/null +++ b/speechx/speechx/nnet/u2_nnet_main.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
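For orientation: the driver that follows derives its chunk geometry from three flags, so that one chunk of `chunk_size` feature frames yields `nnet_decoder_chunk` decoder frames and consecutive chunks advance by `chunk_stride` frames. A minimal, self-contained sketch of that arithmetic, assuming illustrative values for the flags (subsampling rate 4, receptive field 7, 16 decoder frames per chunk); the numbers are not part of this patch:

#include <cstdio>

int main() {
    int subsampling_rate = 4;        // encoder subsampling factor
    int receptive_field_length = 7;  // frames needed for the 1st output
    int nnet_decoder_chunk = 16;     // decoder frames per chunk

    // feature frames consumed to produce `nnet_decoder_chunk` decoder frames
    int chunk_size = (nnet_decoder_chunk - 1) * subsampling_rate +
                     receptive_field_length;                    // 67
    // feature frames to advance between chunks (no decoder-frame overlap)
    int chunk_stride = subsampling_rate * nnet_decoder_chunk;   // 64

    std::printf("chunk=%d stride=%d\n", chunk_size, chunk_stride);
    return 0;
}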
+
+
+#include "base/common.h"
+#include "decoder/param.h"
+#include "frontend/audio/assembler.h"
+#include "frontend/audio/data_cache.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+#include "nnet/u2_nnet.h"
+
+
+DEFINE_string(feature_rspecifier, "", "test feature rspecifier");
+DEFINE_string(nnet_prob_wspecifier, "", "nnet prob wspecifier");
+DEFINE_string(nnet_encoder_outs_wspecifier, "", "nnet encoder outs wspecifier");
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+int main(int argc, char* argv[]) {
+    gflags::SetUsageMessage("Usage:");
+    gflags::ParseCommandLineFlags(&argc, &argv, false);
+    google::InitGoogleLogging(argv[0]);
+    google::InstallFailureSignalHandler();
+    FLAGS_logtostderr = 1;
+
+    int32 num_done = 0, num_err = 0;
+
+    CHECK_GT(FLAGS_feature_rspecifier.size(), 0);
+    CHECK_GT(FLAGS_nnet_prob_wspecifier.size(), 0);
+    CHECK_GT(FLAGS_model_path.size(), 0);
+    LOG(INFO) << "input rspecifier: " << FLAGS_feature_rspecifier;
+    LOG(INFO) << "output wspecifier: " << FLAGS_nnet_prob_wspecifier;
+    LOG(INFO) << "model path: " << FLAGS_model_path;
+
+    kaldi::SequentialBaseFloatMatrixReader feature_reader(
+        FLAGS_feature_rspecifier);
+    kaldi::BaseFloatMatrixWriter nnet_out_writer(FLAGS_nnet_prob_wspecifier);
+    kaldi::BaseFloatMatrixWriter nnet_encoder_outs_writer(
+        FLAGS_nnet_encoder_outs_wspecifier);
+
+    ppspeech::ModelOptions model_opts = ppspeech::ModelOptions::InitFromFlags();
+
+    int32 chunk_size = (FLAGS_nnet_decoder_chunk - 1) * FLAGS_subsampling_rate +
+                       FLAGS_receptive_field_length;
+    int32 chunk_stride = FLAGS_subsampling_rate * FLAGS_nnet_decoder_chunk;
+    int32 receptive_field_length = FLAGS_receptive_field_length;
+    LOG(INFO) << "chunk size (frame): " << chunk_size;
+    LOG(INFO) << "chunk stride (frame): " << chunk_stride;
+    LOG(INFO) << "receptive field (frame): " << receptive_field_length;
+
+    std::shared_ptr<ppspeech::U2Nnet> nnet(new ppspeech::U2Nnet(model_opts));
+    std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));
+    kaldi::Timer timer;
+
+    for (; !feature_reader.Done(); feature_reader.Next()) {
+        string utt = feature_reader.Key();
+        kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+
+        int nframes = feature.NumRows();
+        int feat_dim = feature.NumCols();
+        raw_data->SetDim(feat_dim);
+        LOG(INFO) << "utt: " << utt;
+        LOG(INFO) << "feat shape: " << nframes << ", " << feat_dim;
+
+        int32 frame_idx = 0;
+        int vocab_dim = 0;
+        std::vector<kaldi::Vector<kaldi::BaseFloat>> prob_vec;
+        std::vector<kaldi::Vector<kaldi::BaseFloat>> encoder_out_vec;
+        int32 ori_feature_len = feature.NumRows();
+        int32 num_chunks = feature.NumRows() / chunk_stride + 1;
+        LOG(INFO) << "num_chunks: " << num_chunks;
+
+        for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+            int32 this_chunk_size = 0;
+            if (ori_feature_len > chunk_idx * chunk_stride) {
+                this_chunk_size = std::min(
+                    ori_feature_len - chunk_idx * chunk_stride, chunk_size);
+            }
+            if (this_chunk_size < receptive_field_length) {
+                LOG(WARNING)
+                    << "utt: " << utt << " skip last " << this_chunk_size
+                    << " frames; expected at least " << receptive_field_length;
+                break;
+            }
+
+            kaldi::Vector<kaldi::BaseFloat> feature_chunk(this_chunk_size *
+                                                          feat_dim);
+            int32 start = chunk_idx * chunk_stride;
+            for (int row_id = 0; row_id < this_chunk_size; ++row_id) {
+                kaldi::SubVector<kaldi::BaseFloat> feat_row(feature, start);
+                kaldi::SubVector<kaldi::BaseFloat> feature_chunk_row(
+                    feature_chunk.Data() + row_id * feat_dim, feat_dim);
+
+                feature_chunk_row.CopyFromVec(feat_row);
+                ++start;
+            }
+
+            // feat to frontend pipeline cache
+            raw_data->Accept(feature_chunk);
+
+            // send data finish signal
+            if (chunk_idx == num_chunks - 1) {
+                raw_data->SetFinished();
+            }
+
+            // get nnet outputs
+            kaldi::Timer timer;
+            kaldi::Vector<kaldi::BaseFloat> logprobs;
+            bool isok = decodable->AdvanceChunk(&logprobs, &vocab_dim);
+            CHECK(isok == true);
+            for (int row_idx = 0; row_idx < logprobs.Dim() / vocab_dim;
+                 row_idx++) {
+                kaldi::Vector<kaldi::BaseFloat> vec_tmp(vocab_dim);
+                std::memcpy(vec_tmp.Data(),
+                            logprobs.Data() + row_idx * vocab_dim,
+                            sizeof(kaldi::BaseFloat) * vocab_dim);
+                prob_vec.push_back(vec_tmp);
+            }
+
+            VLOG(2) << "frame_idx: " << frame_idx
+                    << " elapsed: " << timer.Elapsed() << " sec.";
+        }
+
+        // get encoder out
+        decodable->Nnet()->EncoderOuts(&encoder_out_vec);
+
+        // after processing one utt, reset the decoder state.
+        decodable->Reset();
+
+        if (prob_vec.size() == 0 || encoder_out_vec.size() == 0) {
+            // the TokenWriter can not write an empty string.
+            ++num_err;
+            LOG(WARNING) << "the nnet prob/encoder_out of " << utt
+                         << " is empty";
+            continue;
+        }
+
+        {
+            // write nnet output
+            kaldi::MatrixIndexT nrow = prob_vec.size();
+            kaldi::MatrixIndexT ncol = prob_vec[0].Dim();
+            LOG(INFO) << "nnet out shape: " << nrow << ", " << ncol;
+            kaldi::Matrix<kaldi::BaseFloat> nnet_out(nrow, ncol);
+            for (int32 row_idx = 0; row_idx < nrow; ++row_idx) {
+                for (int32 col_idx = 0; col_idx < ncol; ++col_idx) {
+                    nnet_out(row_idx, col_idx) = prob_vec[row_idx](col_idx);
+                }
+            }
+            nnet_out_writer.Write(utt, nnet_out);
+        }
+
+
+        {
+            // write nnet encoder outs
+            kaldi::MatrixIndexT nrow = encoder_out_vec.size();
+            kaldi::MatrixIndexT ncol = encoder_out_vec[0].Dim();
+            LOG(INFO) << "nnet encoder outs shape: " << nrow << ", " << ncol;
+            kaldi::Matrix<kaldi::BaseFloat> encoder_outs(nrow, ncol);
+            for (int32 row_idx = 0; row_idx < nrow; ++row_idx) {
+                for (int32 col_idx = 0; col_idx < ncol; ++col_idx) {
+                    encoder_outs(row_idx, col_idx) =
+                        encoder_out_vec[row_idx](col_idx);
+                }
+            }
+            nnet_encoder_outs_writer.Write(utt, encoder_outs);
+        }
+
+        ++num_done;
+    }
+
+
+    double elapsed = timer.Elapsed();
+    LOG(INFO) << "Program cost: " << elapsed << " sec";
+
+    LOG(INFO) << "Done " << num_done << " utterances, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+}
diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt
old mode 100644
new mode 100755
index 98b2f38b4..71b33daa9
--- a/speechx/speechx/protocol/CMakeLists.txt
+++ b/speechx/speechx/protocol/CMakeLists.txt
@@ -1,3 +1 @@
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
 add_subdirectory(websocket)
diff --git a/speechx/speechx/protocol/websocket/CMakeLists.txt b/speechx/speechx/protocol/websocket/CMakeLists.txt
old mode 100644
new mode 100755
index c3454c399..cafbbec73
--- a/speechx/speechx/protocol/websocket/CMakeLists.txt
+++ b/speechx/speechx/protocol/websocket/CMakeLists.txt
@@ -1,10 +1,8 @@
-project(websocket)
-
 add_library(websocket STATIC
     websocket_server.cc
     websocket_client.cc
 )
-target_link_libraries(websocket PUBLIC frontend decoder nnet)
+target_link_libraries(websocket PUBLIC frontend nnet decoder recognizer)
 
 add_executable(websocket_server_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_server_main.cc)
 target_include_directories(websocket_server_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
diff --git a/speechx/speechx/protocol/websocket/websocket_client.cc b/speechx/speechx/protocol/websocket/websocket_client.cc
old mode 100644
new mode 100755
diff --git a/speechx/speechx/protocol/websocket/websocket_client.h b/speechx/speechx/protocol/websocket/websocket_client.h
old mode 100644
new mode 100755
index 886da2929..7ae6d98d5
--- a/speechx/speechx/protocol/websocket/websocket_client.h
+++ b/speechx/speechx/protocol/websocket/websocket_client.h
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "base/common.h"
-
 #include "boost/asio/connect.hpp"
 #include "boost/asio/ip/tcp.hpp"
 #include "boost/beast/core.hpp"
@@ -54,4 +53,4 @@ class WebSocketClient {
     websocket::stream<tcp::socket> ws_{ioc_};
     std::unique_ptr<std::thread> t_{nullptr};
 };
-}
\ No newline at end of file
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/protocol/websocket/websocket_client_main.cc b/speechx/speechx/protocol/websocket/websocket_client_main.cc
old mode 100644
new mode 100755
index 7ad36e3a5..7c5a4f2f7
--- a/speechx/speechx/protocol/websocket/websocket_client_main.cc
+++ b/speechx/speechx/protocol/websocket/websocket_client_main.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "websocket/websocket_client.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/kaldi-io.h" #include "kaldi/util/table-types.h" +#include "websocket/websocket_client.h" DEFINE_string(host, "127.0.0.1", "host of websocket server"); DEFINE_int32(port, 8082, "port of websocket server"); diff --git a/speechx/speechx/protocol/websocket/websocket_server.cc b/speechx/speechx/protocol/websocket/websocket_server.cc old mode 100644 new mode 100755 diff --git a/speechx/speechx/protocol/websocket/websocket_server.h b/speechx/speechx/protocol/websocket/websocket_server.h old mode 100644 new mode 100755 index 009fc42ed..b0dcb3e37 --- a/speechx/speechx/protocol/websocket/websocket_server.h +++ b/speechx/speechx/protocol/websocket/websocket_server.h @@ -15,14 +15,12 @@ #pragma once #include "base/common.h" - #include "boost/asio/connect.hpp" #include "boost/asio/ip/tcp.hpp" #include "boost/beast/core.hpp" #include "boost/beast/websocket.hpp" - -#include "decoder/recognizer.h" #include "frontend/audio/feature_pipeline.h" +#include "recognizer/recognizer.h" namespace beast = boost::beast; // from namespace http = beast::http; // from diff --git a/speechx/speechx/protocol/websocket/websocket_server_main.cc b/speechx/speechx/protocol/websocket/websocket_server_main.cc old mode 100644 new mode 100755 index 109da96b6..5c32caf27 --- a/speechx/speechx/protocol/websocket/websocket_server_main.cc +++ b/speechx/speechx/protocol/websocket/websocket_server_main.cc @@ -12,16 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "websocket/websocket_server.h" #include "decoder/param.h" +#include "websocket/websocket_server.h" DEFINE_int32(port, 8082, "websocket listening port"); +ppspeech::RecognizerResource InitRecognizerResoure() { + ppspeech::RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = + ppspeech::FeaturePipelineOptions::InitFromFlags(); + resource.model_opts = ppspeech::ModelOptions::InitFromFlags(); + resource.tlg_opts = ppspeech::TLGDecoderOptions::InitFromFlags(); + return resource; +} + int main(int argc, char *argv[]) { gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = InitRecognizerResoure(); ppspeech::WebSocketServer server(FLAGS_port, resource); LOG(INFO) << "Listening at port " << FLAGS_port; diff --git a/speechx/speechx/recognizer/CMakeLists.txt b/speechx/speechx/recognizer/CMakeLists.txt new file mode 100755 index 000000000..050788739 --- /dev/null +++ b/speechx/speechx/recognizer/CMakeLists.txt @@ -0,0 +1,45 @@ +set(srcs) + +if (USING_DS2) +list(APPEND srcs +recognizer.cc +) +endif() + +if (USING_U2) + list(APPEND srcs + u2_recognizer.cc + ) +endif() + +add_library(recognizer STATIC ${srcs}) +target_link_libraries(recognizer PUBLIC decoder) + +# test +if (USING_DS2) + set(BINS recognizer_main) + + foreach(bin_name IN LISTS BINS) + add_executable(${bin_name} ${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} PUBLIC recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + endforeach() +endif() + + +if (USING_U2) + set(TEST_BINS + u2_recognizer_main + ) + + foreach(bin_name IN LISTS TEST_BINS) + add_executable(${bin_name} 
${CMAKE_CURRENT_SOURCE_DIR}/${bin_name}.cc) + target_include_directories(${bin_name} PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) + target_link_libraries(${bin_name} recognizer nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util) + target_compile_options(${bin_name} PRIVATE ${PADDLE_COMPILE_FLAGS}) + target_include_directories(${bin_name} PRIVATE ${pybind11_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}) + target_link_libraries(${bin_name} ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS}) + endforeach() + +endif() + diff --git a/speechx/speechx/decoder/recognizer.cc b/speechx/speechx/recognizer/recognizer.cc old mode 100644 new mode 100755 similarity index 96% rename from speechx/speechx/decoder/recognizer.cc rename to speechx/speechx/recognizer/recognizer.cc index 44c3911c9..c66318131 --- a/speechx/speechx/decoder/recognizer.cc +++ b/speechx/speechx/recognizer/recognizer.cc @@ -12,25 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/recognizer.h" +#include "recognizer/recognizer.h" + namespace ppspeech { -using kaldi::Vector; -using kaldi::VectorBase; using kaldi::BaseFloat; -using std::vector; using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; using std::unique_ptr; +using std::vector; + Recognizer::Recognizer(const RecognizerResource& resource) { // resource_ = resource; const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + std::shared_ptr nnet(new PaddleNnet(resource.model_opts)); + BaseFloat ac_scale = resource.acoustic_scale; decodable_.reset(new Decodable(nnet, feature_pipeline_, ac_scale)); + decoder_.reset(new TLGDecoder(resource.tlg_opts)); + input_finished_ = false; } diff --git a/speechx/speechx/decoder/recognizer.h b/speechx/speechx/recognizer/recognizer.h old mode 100644 new mode 100755 similarity index 64% rename from speechx/speechx/decoder/recognizer.h rename to speechx/speechx/recognizer/recognizer.h index 35e1e1676..57d5bb363 --- a/speechx/speechx/decoder/recognizer.h +++ b/speechx/speechx/recognizer/recognizer.h @@ -20,21 +20,31 @@ #include "decoder/ctc_tlg_decoder.h" #include "frontend/audio/feature_pipeline.h" #include "nnet/decodable.h" -#include "nnet/paddle_nnet.h" +#include "nnet/ds2_nnet.h" + +DECLARE_double(acoustic_scale); namespace ppspeech { struct RecognizerResource { - FeaturePipelineOptions feature_pipeline_opts; - ModelOptions model_opts; - TLGDecoderOptions tlg_opts; + kaldi::BaseFloat acoustic_scale{1.0}; + FeaturePipelineOptions feature_pipeline_opts{}; + ModelOptions model_opts{}; + TLGDecoderOptions tlg_opts{}; // CTCBeamSearchOptions beam_search_opts; - kaldi::BaseFloat acoustic_scale; - RecognizerResource() - : acoustic_scale(1.0), - feature_pipeline_opts(), - model_opts(), - tlg_opts() {} + + static RecognizerResource InitFromFlags() { + RecognizerResource resource; + resource.acoustic_scale = FLAGS_acoustic_scale; + resource.feature_pipeline_opts = + FeaturePipelineOptions::InitFromFlags(); + resource.feature_pipeline_opts.assembler_opts.fill_zero = true; + LOG(INFO) << "ds2 need fill zero be true: " + << resource.feature_pipeline_opts.assembler_opts.fill_zero; + resource.model_opts = ModelOptions::InitFromFlags(); + resource.tlg_opts = TLGDecoderOptions::InitFromFlags(); + return resource; + } }; class Recognizer { diff --git a/speechx/speechx/decoder/recognizer_main.cc b/speechx/speechx/recognizer/recognizer_main.cc old mode 100644 new mode 100755 
similarity index 93% rename from speechx/speechx/decoder/recognizer_main.cc rename to speechx/speechx/recognizer/recognizer_main.cc index 232513539..cb0de2d6a --- a/speechx/speechx/decoder/recognizer_main.cc +++ b/speechx/speechx/recognizer/recognizer_main.cc @@ -12,21 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "decoder/recognizer.h" #include "decoder/param.h" #include "kaldi/feat/wave-reader.h" #include "kaldi/util/table-types.h" +#include "recognizer/recognizer.h" DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); DEFINE_string(result_wspecifier, "", "test result wspecifier"); DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); DEFINE_int32(sample_rate, 16000, "sample rate"); + int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); gflags::ParseCommandLineFlags(&argc, &argv, false); google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; - ppspeech::RecognizerResource resource = ppspeech::InitRecognizerResoure(); + ppspeech::RecognizerResource resource = + ppspeech::RecognizerResource::InitFromFlags(); ppspeech::Recognizer recognizer(resource); kaldi::SequentialTableReader wav_reader( diff --git a/speechx/speechx/recognizer/u2_recognizer.cc b/speechx/speechx/recognizer/u2_recognizer.cc new file mode 100755 index 000000000..d1d308ebd --- /dev/null +++ b/speechx/speechx/recognizer/u2_recognizer.cc @@ -0,0 +1,218 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
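Before the implementation: the recognizer's second pass combines, per hypothesis, the first-pass CTC score with a bidirectional attention-decoder score, exactly as the formulas in the `DecodeOptions` comments further below describe. A self-contained sketch of that arithmetic with made-up numbers (the real weights come from flags, the scores from the decoders):

#include <cstdio>
#include <vector>

int main() {
    // per-hypothesis first-pass CTC scores and second-pass attention
    // scores (left-to-right and right-to-left); illustrative values only
    std::vector<float> ctc = {-4.2f, -5.1f};
    std::vector<float> l2r = {-3.0f, -2.5f};
    std::vector<float> r2l = {-3.4f, -2.9f};

    float reverse_weight = 0.3f;    // blend of the two decoder directions
    float rescoring_weight = 1.0f;  // weight of the attention pass
    float ctc_weight = 0.5f;        // weight of the CTC pass

    for (size_t i = 0; i < ctc.size(); ++i) {
        // rescoring_score = l2r * (1 - reverse_weight) + r2l * reverse_weight
        float rescore = l2r[i] * (1 - reverse_weight) + r2l[i] * reverse_weight;
        // final_score = rescoring_weight * rescoring_score + ctc_weight * ctc
        float final_score = rescoring_weight * rescore + ctc_weight * ctc[i];
        std::printf("hyp %zu final %.3f\n", i, final_score);
    }
    return 0;
}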
+ +#include "recognizer/u2_recognizer.h" + +#include "nnet/u2_nnet.h" + +namespace ppspeech { + +using kaldi::BaseFloat; +using kaldi::SubVector; +using kaldi::Vector; +using kaldi::VectorBase; +using std::unique_ptr; +using std::vector; + +U2Recognizer::U2Recognizer(const U2RecognizerResource& resource) + : opts_(resource) { + const FeaturePipelineOptions& feature_opts = resource.feature_pipeline_opts; + feature_pipeline_.reset(new FeaturePipeline(feature_opts)); + + std::shared_ptr nnet(new U2Nnet(resource.model_opts)); + + BaseFloat am_scale = resource.acoustic_scale; + decodable_.reset(new Decodable(nnet, feature_pipeline_, am_scale)); + + CHECK_NE(resource.vocab_path, ""); + decoder_.reset(new CTCPrefixBeamSearch( + resource.vocab_path, resource.decoder_opts.ctc_prefix_search_opts)); + + unit_table_ = decoder_->VocabTable(); + symbol_table_ = unit_table_; + + input_finished_ = false; + + Reset(); +} + +void U2Recognizer::Reset() { + global_frame_offset_ = 0; + num_frames_ = 0; + result_.clear(); + + decodable_->Reset(); + decoder_->Reset(); +} + +void U2Recognizer::ResetContinuousDecoding() { + global_frame_offset_ = num_frames_; + num_frames_ = 0; + result_.clear(); + + decodable_->Reset(); + decoder_->Reset(); +} + + +void U2Recognizer::Accept(const VectorBase& waves) { + kaldi::Timer timer; + feature_pipeline_->Accept(waves); + VLOG(1) << "feed waves cost: " << timer.Elapsed() << " sec. " << waves.Dim() + << " samples."; +} + + +void U2Recognizer::Decode() { + decoder_->AdvanceDecode(decodable_); + UpdateResult(false); +} + +void U2Recognizer::Rescoring() { + // Do attention Rescoring + AttentionRescoring(); +} + +void U2Recognizer::UpdateResult(bool finish) { + const auto& hypotheses = decoder_->Outputs(); + const auto& inputs = decoder_->Inputs(); + const auto& likelihood = decoder_->Likelihood(); + const auto& times = decoder_->Times(); + result_.clear(); + + CHECK_EQ(hypotheses.size(), likelihood.size()); + for (size_t i = 0; i < hypotheses.size(); i++) { + const std::vector& hypothesis = hypotheses[i]; + + DecodeResult path; + path.score = likelihood[i]; + for (size_t j = 0; j < hypothesis.size(); j++) { + std::string word = symbol_table_->Find(hypothesis[j]); + // A detailed explanation of this if-else branch can be found in + // https://github.com/wenet-e2e/wenet/issues/583#issuecomment-907994058 + if (decoder_->Type() == kWfstBeamSearch) { + path.sentence += (" " + word); + } else { + path.sentence += (word); + } + } + + // TimeStamp is only supported in final result + // TimeStamp of the output of CtcWfstBeamSearch may be inaccurate due to + // various FST operations when building the decoding graph. So here we + // use time stamp of the input(e2e model unit), which is more accurate, + // and it requires the symbol table of the e2e model used in training. + if (unit_table_ != nullptr && finish) { + int offset = global_frame_offset_ * FrameShiftInMs(); + + const std::vector& input = inputs[i]; + const std::vector time_stamp = times[i]; + CHECK_EQ(input.size(), time_stamp.size()); + + for (size_t j = 0; j < input.size(); j++) { + std::string word = unit_table_->Find(input[j]); + + int start = + time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ > 0 + ? time_stamp[j] * FrameShiftInMs() - time_stamp_gap_ + : 0; + if (j > 0) { + start = + (time_stamp[j] - time_stamp[j - 1]) * FrameShiftInMs() < + time_stamp_gap_ + ? 
(time_stamp[j - 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : start; + } + + int end = time_stamp[j] * FrameShiftInMs(); + if (j < input.size() - 1) { + end = + (time_stamp[j + 1] - time_stamp[j]) * FrameShiftInMs() < + time_stamp_gap_ + ? (time_stamp[j + 1] + time_stamp[j]) / 2 * + FrameShiftInMs() + : end; + } + + WordPiece word_piece(word, offset + start, offset + end); + path.word_pieces.emplace_back(word_piece); + } + } + + // if (post_processor_ != nullptr) { + // path.sentence = post_processor_->Process(path.sentence, finish); + // } + + result_.emplace_back(path); + } + + if (DecodedSomething()) { + VLOG(1) << "Partial CTC result " << result_[0].sentence; + } +} + +void U2Recognizer::AttentionRescoring() { + decoder_->FinalizeSearch(); + UpdateResult(true); + + // No need to do rescoring + if (0.0 == opts_.decoder_opts.rescoring_weight) { + LOG_EVERY_N(WARNING, 3) << "Not do AttentionRescoring!"; + return; + } + LOG_EVERY_N(WARNING, 3) << "Do AttentionRescoring!"; + + // Inputs() returns N-best input ids, which is the basic unit for rescoring + // In CtcPrefixBeamSearch, inputs are the same to outputs + const auto& hypotheses = decoder_->Inputs(); + int num_hyps = hypotheses.size(); + if (num_hyps <= 0) { + return; + } + + std::vector rescoring_score; + decodable_->AttentionRescoring( + hypotheses, opts_.decoder_opts.reverse_weight, &rescoring_score); + + // combine ctc score and rescoring score + for (size_t i = 0; i < num_hyps; i++) { + VLOG(3) << "hyp " << i << " rescoring_score: " << rescoring_score[i] + << " ctc_score: " << result_[i].score + << " rescoring_weight: " << opts_.decoder_opts.rescoring_weight + << " ctc_weight: " << opts_.decoder_opts.ctc_weight; + result_[i].score = + opts_.decoder_opts.rescoring_weight * rescoring_score[i] + + opts_.decoder_opts.ctc_weight * result_[i].score; + + VLOG(3) << "hyp: " << result_[0].sentence + << " score: " << result_[0].score; + } + + std::sort(result_.begin(), result_.end(), DecodeResult::CompareFunc); + VLOG(3) << "result: " << result_[0].sentence + << " score: " << result_[0].score; +} + +std::string U2Recognizer::GetFinalResult() { return result_[0].sentence; } + +std::string U2Recognizer::GetPartialResult() { return result_[0].sentence; } + +void U2Recognizer::SetFinished() { + feature_pipeline_->SetFinished(); + input_finished_ = true; +} + + +} // namespace ppspeech \ No newline at end of file diff --git a/speechx/speechx/recognizer/u2_recognizer.h b/speechx/speechx/recognizer/u2_recognizer.h new file mode 100755 index 000000000..258508633 --- /dev/null +++ b/speechx/speechx/recognizer/u2_recognizer.h @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
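The timestamp logic in `UpdateResult` above snaps word boundaries to interval midpoints whenever two units are closer than `time_stamp_gap_` ms. A standalone sketch of that rule with illustrative inputs (the real timestamps come from the CTC decoder and `FrameShiftInMs()`):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    // decoder-frame timestamps of the emitted units; illustrative values
    std::vector<int> ts = {10, 14, 40};
    int shift_ms = 40;  // one decoder frame in ms
    int gap_ms = 100;   // time_stamp_gap_

    for (size_t j = 0; j < ts.size(); ++j) {
        int start = std::max(ts[j] * shift_ms - gap_ms, 0);
        if (j > 0 && (ts[j] - ts[j - 1]) * shift_ms < gap_ms) {
            // close to the previous unit: split the interval at the midpoint
            start = (ts[j - 1] + ts[j]) / 2 * shift_ms;
        }
        int end = ts[j] * shift_ms;
        if (j + 1 < ts.size() && (ts[j + 1] - ts[j]) * shift_ms < gap_ms) {
            // close to the next unit: end at the midpoint as well
            end = (ts[j] + ts[j + 1]) / 2 * shift_ms;
        }
        std::printf("unit %zu: [%d, %d] ms\n", j, start, end);
    }
    return 0;
}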
+
+#pragma once
+
+#include "decoder/common.h"
+#include "decoder/ctc_beam_search_opt.h"
+#include "decoder/ctc_prefix_beam_search_decoder.h"
+#include "decoder/decoder_itf.h"
+#include "frontend/audio/feature_pipeline.h"
+#include "fst/fstlib.h"
+#include "fst/symbol-table.h"
+#include "nnet/decodable.h"
+
+DECLARE_int32(nnet_decoder_chunk);
+DECLARE_int32(num_left_chunks);
+DECLARE_double(ctc_weight);
+DECLARE_double(rescoring_weight);
+DECLARE_double(reverse_weight);
+DECLARE_int32(nbest);
+DECLARE_int32(blank);
+
+DECLARE_double(acoustic_scale);
+DECLARE_string(vocab_path);
+
+namespace ppspeech {
+
+struct DecodeOptions {
+    // chunk_size is the frame number of one chunk after subsampling.
+    // e.g. if the subsample rate is 4 and chunk_size = 16, one chunk
+    // covers 67 = 16*4 + 3 frames and the stride is 64 = 16*4
+    int chunk_size{16};
+    int num_left_chunks{-1};
+
+    // final_score = rescoring_weight * rescoring_score +
+    //               ctc_weight * ctc_score;
+    // rescoring_score = left_to_right_score * (1 - reverse_weight) +
+    //                   right_to_left_score * reverse_weight
+    // Please note that the concept of ctc_score differs between the two
+    // search methods. For CtcPrefixBeamSearch it is a sum(prefix) score +
+    // context score; for CtcWfstBeamSearch it is a max(viterbi) path score
+    // + context score. So ctc_weight should be set carefully according to
+    // the search method.
+    float ctc_weight{0.0};
+    float rescoring_weight{1.0};
+    float reverse_weight{0.0};
+
+    // CtcEndpointConfig ctc_endpoint_opts;
+    CTCBeamSearchOptions ctc_prefix_search_opts{};
+
+    static DecodeOptions InitFromFlags() {
+        DecodeOptions decoder_opts;
+        decoder_opts.chunk_size = FLAGS_nnet_decoder_chunk;
+        decoder_opts.num_left_chunks = FLAGS_num_left_chunks;
+        decoder_opts.ctc_weight = FLAGS_ctc_weight;
+        decoder_opts.rescoring_weight = FLAGS_rescoring_weight;
+        decoder_opts.reverse_weight = FLAGS_reverse_weight;
+        decoder_opts.ctc_prefix_search_opts.blank = FLAGS_blank;
+        decoder_opts.ctc_prefix_search_opts.first_beam_size = FLAGS_nbest;
+        decoder_opts.ctc_prefix_search_opts.second_beam_size = FLAGS_nbest;
+        LOG(INFO) << "chunk_size: " << decoder_opts.chunk_size;
+        LOG(INFO) << "num_left_chunks: " << decoder_opts.num_left_chunks;
+        LOG(INFO) << "ctc_weight: " << decoder_opts.ctc_weight;
+        LOG(INFO) << "rescoring_weight: " << decoder_opts.rescoring_weight;
+        LOG(INFO) << "reverse_weight: " << decoder_opts.reverse_weight;
+        LOG(INFO) << "blank: " << FLAGS_blank;
+        LOG(INFO) << "first_beam_size: " << FLAGS_nbest;
+        LOG(INFO) << "second_beam_size: " << FLAGS_nbest;
+        return decoder_opts;
+    }
+};
+
+struct U2RecognizerResource {
+    kaldi::BaseFloat acoustic_scale{1.0};
+    std::string vocab_path{};
+
+    FeaturePipelineOptions feature_pipeline_opts{};
+    ModelOptions model_opts{};
+    DecodeOptions decoder_opts{};
+
+    static U2RecognizerResource InitFromFlags() {
+        U2RecognizerResource resource;
+        resource.vocab_path = FLAGS_vocab_path;
+        resource.acoustic_scale = FLAGS_acoustic_scale;
+        LOG(INFO) << "vocab path: " << resource.vocab_path;
+        LOG(INFO) << "acoustic_scale: " << resource.acoustic_scale;
+
+        resource.feature_pipeline_opts =
+            ppspeech::FeaturePipelineOptions::InitFromFlags();
+        resource.feature_pipeline_opts.assembler_opts.fill_zero = false;
+        LOG(INFO) << "u2 needs fill_zero == false: "
+                  << resource.feature_pipeline_opts.assembler_opts.fill_zero;
+        resource.model_opts = ppspeech::ModelOptions::InitFromFlags();
+        resource.decoder_opts = ppspeech::DecodeOptions::InitFromFlags();
+        return resource;
+    }
+};
+
+
+class U2Recognizer {
+  public:
+    explicit U2Recognizer(const U2RecognizerResource& resource);
+    void Reset();
+    void ResetContinuousDecoding();
+
+    void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
+    void Decode();
+    void Rescoring();
+
+
+    std::string GetFinalResult();
+    std::string GetPartialResult();
+
+    void SetFinished();
+    bool IsFinished() { return input_finished_; }
+
+    bool DecodedSomething() const {
+        return !result_.empty() && !result_[0].sentence.empty();
+    }
+
+
+    int FrameShiftInMs() const {
+        // one decoder frame length in ms
+        return decodable_->Nnet()->SubsamplingRate() *
+               feature_pipeline_->FrameShift();
+    }
+
+
+    const std::vector<DecodeResult>& Result() const { return result_; }
+
+  private:
+    void AttentionRescoring();
+    void UpdateResult(bool finish = false);
+
+  private:
+    U2RecognizerResource opts_;
+
+    // std::shared_ptr<U2RecognizerResource> resource_;
+    // U2RecognizerResource resource_;
+    std::shared_ptr<FeaturePipeline> feature_pipeline_;
+    std::shared_ptr<Decodable> decodable_;
+    std::unique_ptr<CTCPrefixBeamSearch> decoder_;
+
+    // e2e unit symbol table
+    std::shared_ptr<fst::SymbolTable> unit_table_ = nullptr;
+    std::shared_ptr<fst::SymbolTable> symbol_table_ = nullptr;
+
+    std::vector<DecodeResult> result_;
+
+    // global decoded frame offset
+    int global_frame_offset_;
+    // cur decoded frame num
+    int num_frames_;
+    // timestamp gap between words in a sentence
+    const int time_stamp_gap_ = 100;
+
+    bool input_finished_;
+};
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/recognizer/u2_recognizer_main.cc b/speechx/speechx/recognizer/u2_recognizer_main.cc
new file mode 100755
index 000000000..d7c584074
--- /dev/null
+++ b/speechx/speechx/recognizer/u2_recognizer_main.cc
@@ -0,0 +1,125 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
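The binary that follows drives `U2Recognizer` in the streaming pattern its API implies: push a chunk, decode, finish, rescore. Condensed to its essentials as a sketch (`ReadNextChunk` is a hypothetical audio source, declared here only to keep the sketch self-contained):

#include <string>

#include "kaldi/matrix/kaldi-vector.h"
#include "recognizer/u2_recognizer.h"

// hypothetical audio source: fills `chunk`, returns false when exhausted
bool ReadNextChunk(kaldi::Vector<kaldi::BaseFloat>* chunk);

std::string RunOneUtterance(ppspeech::U2Recognizer* recognizer) {
    kaldi::Vector<kaldi::BaseFloat> chunk;
    while (ReadNextChunk(&chunk)) {
        recognizer->Accept(chunk);  // push PCM samples into the pipeline
        recognizer->Decode();       // first pass: CTC prefix beam search
    }
    recognizer->SetFinished();      // flush buffered features
    recognizer->Decode();           // drain the last partial chunk
    recognizer->Rescoring();        // second pass: attention rescoring
    std::string result = recognizer->GetFinalResult();
    recognizer->Reset();            // ready for the next utterance
    return result;
}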
+ +#include "decoder/param.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/table-types.h" +#include "recognizer/u2_recognizer.h" + +DEFINE_string(wav_rspecifier, "", "test feature rspecifier"); +DEFINE_string(result_wspecifier, "", "test result wspecifier"); +DEFINE_double(streaming_chunk, 0.36, "streaming feature chunk size"); +DEFINE_int32(sample_rate, 16000, "sample rate"); + +int main(int argc, char* argv[]) { + gflags::SetUsageMessage("Usage:"); + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + FLAGS_logtostderr = 1; + + int32 num_done = 0, num_err = 0; + double tot_wav_duration = 0.0; + double tot_decode_time = 0.0; + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::TokenWriter result_writer(FLAGS_result_wspecifier); + + int sample_rate = FLAGS_sample_rate; + float streaming_chunk = FLAGS_streaming_chunk; + int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + + ppspeech::U2RecognizerResource resource = + ppspeech::U2RecognizerResource::InitFromFlags(); + ppspeech::U2Recognizer recognizer(resource); + + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "utt: " << utt; + LOG(INFO) << "wav dur: " << wave_data.Duration() << " sec."; + double dur = wave_data.Duration(); + tot_wav_duration += dur; + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + + int sample_offset = 0; + int cnt = 0; + kaldi::Timer timer; + kaldi::Timer local_timer; + + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + // wav_chunk = waveform.Range(sample_offset + i, cur_chunk_size); + + recognizer.Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + recognizer.SetFinished(); + } + recognizer.Decode(); + if (recognizer.DecodedSomething()) { + LOG(INFO) << "Pratial result: " << cnt << " " + << recognizer.GetPartialResult(); + } + + // no overlap + sample_offset += cur_chunk_size; + cnt++; + } + CHECK(sample_offset == tot_samples); + + // second pass decoding + recognizer.Rescoring(); + + tot_decode_time += timer.Elapsed(); + + std::string result = recognizer.GetFinalResult(); + + recognizer.Reset(); + + if (result.empty()) { + // the TokenWriter can not write empty string. 
+ ++num_err; + LOG(INFO) << " the result of " << utt << " is empty"; + continue; + } + + LOG(INFO) << utt << " " << result; + LOG(INFO) << " RTF: " << local_timer.Elapsed() / dur << " dur: " << dur + << " cost: " << local_timer.Elapsed(); + + result_writer.Write(utt, result); + + ++num_done; + } + + LOG(INFO) << "Done " << num_done << " out of " << (num_err + num_done); + LOG(INFO) << "total wav duration is: " << tot_wav_duration << " sec"; + LOG(INFO) << "total decode cost:" << tot_decode_time << " sec"; + LOG(INFO) << "RTF is: " << tot_decode_time / tot_wav_duration; +} diff --git a/speechx/speechx/third_party/CMakeLists.txt b/speechx/speechx/third_party/CMakeLists.txt old mode 100644 new mode 100755 diff --git a/speechx/speechx/third_party/README.md b/speechx/speechx/third_party/README.md old mode 100644 new mode 100755 diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt old mode 100644 new mode 100755 index 95e865744..c1e875be1 --- a/speechx/speechx/utils/CMakeLists.txt +++ b/speechx/speechx/utils/CMakeLists.txt @@ -1,4 +1,5 @@ add_library(utils file_utils.cc + math.cc ) \ No newline at end of file diff --git a/speechx/speechx/utils/file_utils.cc b/speechx/speechx/utils/file_utils.cc old mode 100644 new mode 100755 index e5943e318..c42a642c7 --- a/speechx/speechx/utils/file_utils.cc +++ b/speechx/speechx/utils/file_utils.cc @@ -40,4 +40,4 @@ std::string ReadFile2String(const std::string& path) { return std::string((std::istreambuf_iterator(input_file)), std::istreambuf_iterator()); } -} +} // namespace ppspeech diff --git a/speechx/speechx/utils/file_utils.h b/speechx/speechx/utils/file_utils.h old mode 100644 new mode 100755 index 8c56c02eb..a471e024e --- a/speechx/speechx/utils/file_utils.h +++ b/speechx/speechx/utils/file_utils.h @@ -20,4 +20,4 @@ bool ReadFileToVector(const std::string& filename, std::vector* data); std::string ReadFile2String(const std::string& path); -} +} // namespace ppspeech diff --git a/speechx/speechx/utils/math.cc b/speechx/speechx/utils/math.cc new file mode 100755 index 000000000..71656cb30 --- /dev/null +++ b/speechx/speechx/utils/math.cc @@ -0,0 +1,98 @@ + +// Copyright (c) 2021 Mobvoi Inc (Zhendong Peng) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
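Usage of the two helpers defined below, as a quick reference; the expected outputs in the comments follow from the definitions (descending order, ties keeping the smaller index):

#include <cstdint>
#include <cstdio>
#include <vector>

#include "utils/math.h"

int main() {
    // log-domain addition: log(exp(-1) + exp(-2)) ~= -0.687
    float s = ppspeech::LogSumExp(-1.0f, -2.0f);

    // the 3 largest scores, largest first, with their source indices
    std::vector<float> scores = {0.1f, 0.7f, 0.3f, 0.9f, 0.5f};
    std::vector<float> values;
    std::vector<int32_t> indices;
    ppspeech::TopK(scores, 3, &values, &indices);
    // values == {0.9, 0.7, 0.5}, indices == {3, 1, 4}

    std::printf("logsumexp=%.3f top1=%.1f@%d\n", s, values[0], indices[0]);
    return 0;
}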
+ +#include "utils/math.h" + +#include +#include +#include +#include + +#include "base/common.h" + + +namespace ppspeech { + +// Sum in log scale +float LogSumExp(float x, float y) { + if (x <= -kBaseFloatMax) return y; + if (y <= -kBaseFloatMax) return x; + float max = std::max(x, y); + return max + std::log(std::exp(x - max) + std::exp(y - max)); +} + +// greater compare for smallest priority_queue +template +struct ValGreaterComp { + bool operator()(const std::pair& lhs, + const std::pair& rhs) const { + return lhs.first > rhs.first || + (lhs.first == rhs.first && lhs.second < rhs.second); + } +}; + +template +void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices) { + int n = data.size(); + int min_k_n = std::min(k, n); + + // smallest heap, (val, idx) + std::vector> smallest_heap; + for (int i = 0; i < min_k_n; i++) { + smallest_heap.emplace_back(data[i], i); + } + + // smallest priority_queue + std::priority_queue, + std::vector>, + ValGreaterComp> + pq(ValGreaterComp(), std::move(smallest_heap)); + + // top k + for (int i = k; i < n; i++) { + if (pq.top().first < data[i]) { + pq.pop(); + pq.emplace(data[i], i); + } + } + + values->resize(min_k_n); + indices->resize(min_k_n); + + // from largest to samllest + int cur = values->size() - 1; + while (!pq.empty()) { + const auto& item = pq.top(); + + (*values)[cur] = item.first; + (*indices)[cur] = item.second; + + // item if reference, must pop here + pq.pop(); + + cur--; + } +} + +template void TopK(const std::vector& data, + int32_t k, + std::vector* values, + std::vector* indices); + +} // namespace ppspeech diff --git a/speechx/speechx/utils/math.h b/speechx/speechx/utils/math.h new file mode 100755 index 000000000..7c863b009 --- /dev/null +++ b/speechx/speechx/utils/math.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+namespace ppspeech {
+
+// Sum in log scale
+float LogSumExp(float x, float y);
+
+template <typename T>
+void TopK(const std::vector<T>& data,
+          int32_t k,
+          std::vector<T>* values,
+          std::vector<int32_t>* indices);
+
+}  // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/tools/clang-format.sh b/speechx/tools/clang-format.sh
new file mode 100755
index 000000000..30f636ff4
--- /dev/null
+++ b/speechx/tools/clang-format.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+find speechx -name '*.c' -o -name '*.h' -not -path "*kaldi*" | xargs -I{} clang-format -i {}
diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh
new file mode 100755
index 000000000..3952988c6
--- /dev/null
+++ b/speechx/tools/venv.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -ex
+
+PYTHON=python3.7
+test -d venv || virtualenv -p ${PYTHON} venv
diff --git a/tests/benchmark/audio/README.md b/tests/benchmark/audio/README.md
old mode 100644
new mode 100755
diff --git a/tests/benchmark/audio/log_melspectrogram.py b/tests/benchmark/audio/log_melspectrogram.py
old mode 100644
new mode 100755
diff --git a/tests/benchmark/audio/melspectrogram.py b/tests/benchmark/audio/melspectrogram.py
old mode 100644
new mode 100755
diff --git a/tests/benchmark/audio/mfcc.py b/tests/benchmark/audio/mfcc.py
old mode 100644
new mode 100755
diff --git a/tests/benchmark/conformer/README.md b/tests/benchmark/conformer/README.md
old mode 100644
new mode 100755
diff --git a/tests/benchmark/conformer/prepare.sh b/tests/benchmark/conformer/prepare.sh
old mode 100644
new mode 100755
diff --git a/tests/benchmark/conformer/run.sh b/tests/benchmark/conformer/run.sh
old mode 100644
new mode 100755
diff --git a/tests/benchmark/conformer/run_benchmark.sh b/tests/benchmark/conformer/run_benchmark.sh
old mode 100644
new mode 100755
diff --git a/tests/benchmark/pwgan/README.md b/tests/benchmark/pwgan/README.md
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/README.md b/tests/chains/ds2/README.md
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/ds2_params_lite_train_infer.txt b/tests/chains/ds2/ds2_params_lite_train_infer.txt
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/ds2_params_whole_train_infer.txt b/tests/chains/ds2/ds2_params_whole_train_infer.txt
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/lite_train_infer.sh b/tests/chains/ds2/lite_train_infer.sh
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/prepare.sh b/tests/chains/ds2/prepare.sh
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/speedyspeech_params_lite.txt b/tests/chains/ds2/speedyspeech_params_lite.txt
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/test.sh b/tests/chains/ds2/test.sh
old mode 100644
new mode 100755
diff --git a/tests/chains/ds2/whole_train_infer.sh b/tests/chains/ds2/whole_train_infer.sh
old mode 100644
new mode 100755
diff --git a/tests/chains/speedyspeech/README.md b/tests/chains/speedyspeech/README.md
old mode 100644
new mode 100755
diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt
old mode 100644
new mode 100755
diff --git a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt
old mode 100644
new mode 100755
diff --git a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt
old mode 100644
new mode 100755
diff --git
a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt old mode 100644 new mode 100755 diff --git a/tests/test_tipc/barrier.sh b/tests/test_tipc/barrier.sh old mode 100644 new mode 100755 diff --git a/tests/test_tipc/benchmark_train.sh b/tests/test_tipc/benchmark_train.sh old mode 100644 new mode 100755 diff --git a/tests/test_tipc/common_func.sh b/tests/test_tipc/common_func.sh old mode 100644 new mode 100755 diff --git a/tests/test_tipc/configs/conformer/train_infer_python.txt b/tests/test_tipc/configs/conformer/train_infer_python.txt old mode 100644 new mode 100755 diff --git a/tests/test_tipc/configs/mdtc/train_infer_python.txt b/tests/test_tipc/configs/mdtc/train_infer_python.txt old mode 100644 new mode 100755 index 7a5f658ee..6fb8c3484 --- a/tests/test_tipc/configs/mdtc/train_infer_python.txt +++ b/tests/test_tipc/configs/mdtc/train_infer_python.txt @@ -49,9 +49,3 @@ null:null null:null null:null null:null -===========================train_benchmark_params========================== -batch_size:16|30 -fp_items:fp32 -iteration:50 ---profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" -flags:null diff --git a/tests/test_tipc/configs/pwgan/train_infer_python.txt b/tests/test_tipc/configs/pwgan/train_infer_python.txt old mode 100644 new mode 100755 diff --git a/tests/test_tipc/conformer/scripts/aishell_tiny.py b/tests/test_tipc/conformer/scripts/aishell_tiny.py old mode 100644 new mode 100755 diff --git a/tests/test_tipc/docs/benchmark_train.md b/tests/test_tipc/docs/benchmark_train.md old mode 100644 new mode 100755 diff --git a/tests/test_tipc/test_train_inference_python.sh b/tests/test_tipc/test_train_inference_python.sh old mode 100644 new mode 100755 diff --git a/tests/unit/asr/deepspeech2_model_test.py b/tests/unit/asr/deepspeech2_model_test.py old mode 100644 new mode 100755 diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py old mode 100644 new mode 100755 diff --git a/tests/unit/asr/deepspeech2_online_model_test.sh b/tests/unit/asr/deepspeech2_online_model_test.sh old mode 100644 new mode 100755 diff --git a/tests/unit/asr/error_rate_test.py b/tests/unit/asr/error_rate_test.py old mode 100644 new mode 100755 diff --git a/tests/unit/asr/mask_test.py b/tests/unit/asr/mask_test.py old mode 100644 new mode 100755 diff --git a/tests/unit/asr/reverse_pad_list.py b/tests/unit/asr/reverse_pad_list.py old mode 100644 new mode 100755 diff --git a/tests/unit/asr/u2_model_test.py b/tests/unit/asr/u2_model_test.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/backends/__init__.py b/tests/unit/audio/backends/__init__.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/backends/base.py b/tests/unit/audio/backends/base.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/backends/soundfile/__init__.py b/tests/unit/audio/backends/soundfile/__init__.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/backends/soundfile/test_io.py b/tests/unit/audio/backends/soundfile/test_io.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/__init__.py b/tests/unit/audio/features/__init__.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/base.py b/tests/unit/audio/features/base.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/test_istft.py b/tests/unit/audio/features/test_istft.py old mode 100644 new 
mode 100755 diff --git a/tests/unit/audio/features/test_kaldi.py b/tests/unit/audio/features/test_kaldi.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/test_librosa.py b/tests/unit/audio/features/test_librosa.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/test_log_melspectrogram.py b/tests/unit/audio/features/test_log_melspectrogram.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/test_spectrogram.py b/tests/unit/audio/features/test_spectrogram.py old mode 100644 new mode 100755 diff --git a/tests/unit/audio/features/test_stft.py b/tests/unit/audio/features/test_stft.py old mode 100644 new mode 100755 diff --git a/tests/unit/cli/aishell_test_prepare.py b/tests/unit/cli/aishell_test_prepare.py old mode 100644 new mode 100755 diff --git a/tests/unit/cli/calc_RTF_CER_by_aishell.sh b/tests/unit/cli/calc_RTF_CER_by_aishell.sh old mode 100644 new mode 100755 diff --git a/tests/unit/cli/path.sh b/tests/unit/cli/path.sh old mode 100644 new mode 100755 diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index c6837c303..3a58626d2 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -9,6 +9,10 @@ paddlespeech cls --input ./cat.wav --topk 10 # Punctuation_restoration paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast +# Speech SSL +paddlespeech ssl --task asr --lang en --input ./en.wav +paddlespeech ssl --task vector --lang en --input ./en.wav + # Speech_recognition wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav paddlespeech asr --input ./zh.wav @@ -16,6 +20,7 @@ paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav paddlespeech asr --model conformer_online_wenetspeech --input ./zh.wav paddlespeech asr --model conformer_online_multicn --input ./zh.wav +paddlespeech asr --model conformer_u2pp_online_wenetspeech --lang zh --input zh.wav paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav @@ -53,6 +58,7 @@ paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." +paddlespeech tts --am fastspeech2_male --voc pwgan_male --input "你好,欢迎使用百度飞桨深度学习框架!" # mix tts # The `am` must be `fastspeech2_mix`! # The `lang` must be `mix`! 
@@ -92,5 +98,11 @@ paddlespeech stats --task text paddlespeech stats --task vector paddlespeech stats --task st +# Whisper: transcribe speech +paddlespeech whisper --task transcribe --input ./zh.wav + +# Whisper: recognize speech and translate it to English +paddlespeech whisper --task translate --input ./zh.wav + echo -e "\033[32mTest success !!!\033[0m" diff --git a/tests/unit/server/offline/change_yaml.py b/tests/unit/server/offline/change_yaml.py old mode 100644 new mode 100755 diff --git a/tests/unit/server/offline/conf/application.yaml b/tests/unit/server/offline/conf/application.yaml old mode 100644 new mode 100755 diff --git a/tests/unit/server/offline/test_server_client.sh b/tests/unit/server/offline/test_server_client.sh old mode 100644 new mode 100755 diff --git a/tests/unit/server/online/tts/check_server/change_yaml.py b/tests/unit/server/online/tts/check_server/change_yaml.py old mode 100644 new mode 100755 diff --git a/tests/unit/server/online/tts/check_server/conf/application.yaml b/tests/unit/server/online/tts/check_server/conf/application.yaml old mode 100644 new mode 100755 diff --git a/tests/unit/server/online/tts/check_server/test.sh b/tests/unit/server/online/tts/check_server/test.sh old mode 100644 new mode 100755 diff --git a/tests/unit/server/online/tts/check_server/test_all.sh b/tests/unit/server/online/tts/check_server/test_all.sh old mode 100644 new mode 100755 diff --git a/tests/unit/server/online/tts/check_server/tts_online_application.yaml b/tests/unit/server/online/tts/check_server/tts_online_application.yaml old mode 100644 new mode 100755 diff --git a/tests/unit/server/online/tts/test_server/test_http_client.py b/tests/unit/server/online/tts/test_server/test_http_client.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_data_table.py b/tests/unit/tts/test_data_table.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_expansion.py b/tests/unit/tts/test_expansion.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_optimizer.py b/tests/unit/tts/test_optimizer.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_pwg.py b/tests/unit/tts/test_pwg.py old mode 100644 new mode 100755 index 78cb34f25..10c82c9fd --- a/tests/unit/tts/test_pwg.py +++ b/tests/unit/tts/test_pwg.py @@ -13,6 +13,7 @@ # limitations under the License.
import paddle import torch +from paddle.device.cuda import synchronize from parallel_wavegan.layers import residual_block from parallel_wavegan.layers import upsample from parallel_wavegan.models import parallel_wavegan as pwgan @@ -24,7 +25,6 @@ from paddlespeech.t2s.models.parallel_wavegan import PWGGenerator from paddlespeech.t2s.models.parallel_wavegan import ResidualBlock from paddlespeech.t2s.models.parallel_wavegan import ResidualPWGDiscriminator from paddlespeech.t2s.utils.layer_tools import summary -from paddlespeech.t2s.utils.profile import synchronize paddle.set_device("gpu:0") device = torch.device("cuda:0") diff --git a/tests/unit/tts/test_raise.py b/tests/unit/tts/test_raise.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_reporter.py b/tests/unit/tts/test_reporter.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_snapshot.py b/tests/unit/tts/test_snapshot.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_stft.py b/tests/unit/tts/test_stft.py old mode 100644 new mode 100755 diff --git a/tests/unit/tts/test_to_static.py b/tests/unit/tts/test_to_static.py old mode 100644 new mode 100755 diff --git a/tests/unit/vector/conftest.py b/tests/unit/vector/conftest.py old mode 100644 new mode 100755 diff --git a/tests/unit/vector/test_augment.py b/tests/unit/vector/test_augment.py old mode 100644 new mode 100755 diff --git a/third_party/README.md b/third_party/README.md old mode 100644 new mode 100755 diff --git a/third_party/__init__.py b/third_party/__init__.py old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/.gitignore b/third_party/ctc_decoders/.gitignore old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/COPYING.APACHE2.0 b/third_party/ctc_decoders/COPYING.APACHE2.0 old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/COPYING.LESSER.3 b/third_party/ctc_decoders/COPYING.LESSER.3 old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/__init__.py b/third_party/ctc_decoders/__init__.py old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/ctc_beam_search_decoder.cpp b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/ctc_beam_search_decoder.h b/third_party/ctc_decoders/ctc_beam_search_decoder.h old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/ctc_greedy_decoder.cpp b/third_party/ctc_decoders/ctc_greedy_decoder.cpp old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/ctc_greedy_decoder.h b/third_party/ctc_decoders/ctc_greedy_decoder.h old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/decoder_utils.cpp b/third_party/ctc_decoders/decoder_utils.cpp old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/decoder_utils.h b/third_party/ctc_decoders/decoder_utils.h old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/decoders.i b/third_party/ctc_decoders/decoders.i old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/path_trie.cpp b/third_party/ctc_decoders/path_trie.cpp old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/path_trie.h b/third_party/ctc_decoders/path_trie.h old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/scorer.cpp b/third_party/ctc_decoders/scorer.cpp old mode 100644 new mode 100755
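One note on the test_pwg.py hunk above: it replaces the project-local paddlespeech.t2s.utils.profile.synchronize helper with Paddle's built-in paddle.device.cuda.synchronize. Synchronizing matters whenever a test or benchmark times GPU work, because CUDA kernels launch asynchronously. A minimal timing sketch (assumes a CUDA build of Paddle; the tensor shape is arbitrary and warm-up is omitted):

```python
import time

import paddle
from paddle.device.cuda import synchronize

paddle.set_device("gpu:0")
x = paddle.randn([2048, 2048])

start = time.time()
y = x @ x      # the matmul kernel is launched asynchronously
synchronize()  # block until the GPU finishes before reading the clock
print(f"matmul: {time.time() - start:.4f}s")
```

diff --git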
a/third_party/ctc_decoders/scorer.h b/third_party/ctc_decoders/scorer.h old mode 100644 new mode 100755 diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py old mode 100644 new mode 100755 diff --git a/third_party/install.sh b/third_party/install.sh old mode 100644 new mode 100755 diff --git a/third_party/install_win_ctc.bat b/third_party/install_win_ctc.bat old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/.gitignore b/third_party/python_kaldi_features/.gitignore old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/LICENSE b/third_party/python_kaldi_features/LICENSE old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/MANIFEST b/third_party/python_kaldi_features/MANIFEST old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/README.rst b/third_party/python_kaldi_features/README.rst old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/docs/Makefile b/third_party/python_kaldi_features/docs/Makefile old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/docs/make.bat b/third_party/python_kaldi_features/docs/make.bat old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/docs/source/conf.py b/third_party/python_kaldi_features/docs/source/conf.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/docs/source/index.rst b/third_party/python_kaldi_features/docs/source/index.rst old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/english.wav b/third_party/python_kaldi_features/english.wav old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/example.py b/third_party/python_kaldi_features/example.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/python_speech_features/__init__.py b/third_party/python_kaldi_features/python_speech_features/__init__.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/python_speech_features/base.py b/third_party/python_kaldi_features/python_speech_features/base.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/python_speech_features/base_orig.py b/third_party/python_kaldi_features/python_speech_features/base_orig.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/python_speech_features/sigproc.py b/third_party/python_kaldi_features/python_speech_features/sigproc.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py b/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/requirements.txt b/third_party/python_kaldi_features/requirements.txt old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/setup.py b/third_party/python_kaldi_features/setup.py old mode 100644 new mode 100755 diff --git a/third_party/python_kaldi_features/test/test_sigproc.py b/third_party/python_kaldi_features/test/test_sigproc.py old mode 100644 new mode 100755 diff --git a/tools/Makefile b/tools/Makefile old mode 100644 new mode 100755 diff --git a/tools/extras/README.md b/tools/extras/README.md old mode 100644 new mode 100755 diff --git a/tools/extras/srilm.patch b/tools/extras/srilm.patch old mode 100644 new mode 100755 diff --git a/tools/get_contributors.ipynb b/tools/get_contributors.ipynb old mode 
100644 new mode 100755 diff --git a/tools/watermark.py b/tools/watermark.py new file mode 100755 index 000000000..fc592d5bc --- /dev/null +++ b/tools/watermark.py @@ -0,0 +1,20 @@ +# add watermark for text +def watermark(content, pattern): + m = list(zip(pattern * (len(content) // len(pattern) + 1), content)) + return ''.join([x for t in m + for x in t] + [pattern[len(content) % len(pattern)]]) + + +# remove cyclic watermark in text +def iwatermark(content): + e = [x for i, x in enumerate(content) if i % 2 == 0] + o = [x for i, x in enumerate(content) if i % 2 != 0] + for i in range(1, len(e) // 2 + 1): + if e[i:] == e[:-i]: + return ''.join(o) + return ''.join(e) + + +if __name__ == "__main__": + print(watermark('跟世龙对齐 Triton 开发计划', 'hbzs')) + print(iwatermark('h跟b世z龙s对h齐b zTsrhibtzosnh b开z发s计h划b')) diff --git a/utils/README.md b/utils/README.md old mode 100644 new mode 100755 diff --git a/utils/format_rsl.py b/utils/format_rsl.py old mode 100644 new mode 100755 diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl old mode 100644 new mode 100755
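To make the behavior of the new tools/watermark.py above concrete: watermark() interleaves the pattern cyclically with the content (a pattern character at each even position, a content character at each odd position, plus one trailing pattern character), and iwatermark() recovers the content by detecting which of the two interleaved streams is cyclic. A small ASCII round-trip, run from the repo root (the sys.path line is an assumption about how you would import it, since tools/ is not a package):

```python
import sys

sys.path.insert(0, 'tools')  # tools/watermark.py is a plain script, not a package
from watermark import iwatermark, watermark

marked = watermark('hello world', 'abc')
print(marked)  # -> 'ahbeclalboc awborcalbdc'
assert iwatermark(marked) == 'hello world'
```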