diff --git a/.gitignore b/.gitignore index cc8fff87..ad8e7492 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc .vscode *log +*.wav *.pdmodel *.pdiparams* *.zip @@ -13,6 +14,7 @@ *.whl *.egg-info build +*output/ docs/build/ docs/topic/ctc/warp-ctc/ @@ -30,5 +32,6 @@ tools/OpenBLAS/ tools/Miniconda3-latest-Linux-x86_64.sh tools/activate_python.sh tools/miniconda.sh +tools/CRF++-0.58/ -*output/ +speechx/fc_patch/ \ No newline at end of file diff --git a/README.md b/README.md index 46730797..46f492e9 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,12 @@ For more synthesized audios, please refer to [PaddleSpeech Text-to-Speech sample - [PaddleSpeech Demo Video](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.** + +
+ +
+ ### 🔥 Hot Activities - 2021.12.21~12.24 @@ -196,16 +202,18 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl ```shell paddlespeech cls --input input.wav ``` + **Automatic Speech Recognition** ```shell paddlespeech asr --lang zh --input input_16k.wav ``` -**Speech Translation** (English to Chinese) +**Speech Translation** (English to Chinese) (not support for Mac and Windows now) ```shell paddlespeech st --input input_16k.wav ``` + **Text-to-Speech** ```shell paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav @@ -218,7 +226,16 @@ paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --ou paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` - +**Batch Process** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell Pipeline** +- ASR + Punctuation Restoration +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` For more command lines, please see: [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) @@ -561,6 +578,9 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. - Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model. - Many thanks to [kslz](https://github.com/745165806) for supplementary Chinese documents. +- Many thanks to [awmmmm](https://github.com/awmmmm) for contributing fastspeech2 aishell3 conformer pretrained model. +- Many thanks to [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) for developing a dubbing tool with GUI based on PaddleSpeech TTS model. +- Many thanks to [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) for developing a GUI tool based on PaddleSpeech TTS and code for making datasets from videos based on PaddleSpeech ASR. Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information. diff --git a/README_cn.md b/README_cn.md index 9782240a..e8494737 100644 --- a/README_cn.md +++ b/README_cn.md @@ -150,6 +150,12 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme - [PaddleSpeech 示例视频](https://paddlespeech.readthedocs.io/en/latest/demo_video.html) +- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): 使用 PaddleSpeech 的语音合成和语音识别从视频中克隆人声。** + +
+ +
+ ### 🔥 热门活动 - 2021.12.21~12.24 @@ -216,6 +222,17 @@ paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架! paddlespeech text --task punc --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 ``` +**批处理** +``` +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts +``` + +**Shell管道** +ASR + Punc: +``` +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +``` + 更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) > Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md), [语音合成](./docs/source/tts/quick_start.md)。 @@ -556,6 +573,10 @@ year={2021} - 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。 - 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 - 非常感谢 [kslz](https://github.com/kslz) 补充中文文档。 +- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。 +- 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。 +- 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。 + 此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。 diff --git a/demos/speech_recognition/.gitignore b/demos/speech_recognition/.gitignore new file mode 100644 index 00000000..d8dd7532 --- /dev/null +++ b/demos/speech_recognition/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c49afa35..5d964fce 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # English paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # Chinese ASR + Punctuation Restoration + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.) 
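The ASR + punctuation pipeline above can also be run in a single Python process instead of two CLI invocations. Below is a minimal sketch using the CLI executors; the keyword arguments mirror the command-line flags, and it assumes the unlisted arguments fall back to the same defaults as the CLI (`conformer_wenetspeech`, `zh`, 16 kHz) — verify against your installed version:

```python
import paddle
from paddlespeech.cli import ASRExecutor, TextExecutor

asr_executor = ASRExecutor()
text_executor = TextExecutor()

# Step 1: speech -> raw transcript (no punctuation).
transcript = asr_executor(
    audio_file='./zh.wav',
    device=paddle.get_device())

# Step 2: raw transcript -> punctuated text.
result = text_executor(
    text=transcript,
    task='punc',
    device=paddle.get_device())
print(result)
```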
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index c2e38c91..ba1f1d65 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -25,6 +25,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + # 中文 + 标点恢复 + paddlespeech asr --input ./zh.wav | paddlespeech text --task punc ``` (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index 5efc8b81..06466928 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -1,4 +1,10 @@ #!/bin/bash wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav + +# asr paddlespeech asr --input ./zh.wav + + +# asr + punc +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc \ No newline at end of file diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index 39007f6c..a2f6f221 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -10,10 +10,15 @@ This demo is an implementation of starting the voice service and accessing the s ### 1. Installation see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -You can choose one way from easy, meduim and hard to install paddlespeech. +It is recommended to use **paddlepaddle 2.2.1** or above. +You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File -The configuration file contains the service-related configuration files and the model configuration related to the voice tasks contained in the service. They are all under the `conf` folder. +The configuration file can be found in `conf/application.yaml` . +Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of _. +At present, the speech tasks integrated by the service include: asr (speech recognition) and tts (speech synthesis). +Currently the engine type supports two forms: python and inference (Paddle Inference) + The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. @@ -76,6 +81,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4. ASR Client Usage +**Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ``` paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav @@ -122,6 +128,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 5. TTS Client Usage +**Note:** The response time will be slightly longer when using the client for the first time - Command Line (Recommended) ```bash paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav @@ -147,8 +154,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav. [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s. [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s. 
- [2022-02-23 15:20:37,875] [ INFO] - RTF: 0.096346 - ``` @@ -174,51 +179,13 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Save synthesized audio successfully on ./output.wav. Audio duration: 3.612500 s. Response time: 0.388317 s. - RTF: 0.107493 ``` -## Pretrained Models +## Models supported by the service ### ASR model -Here is a list of [ASR pretrained models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README.md#4pretrained-models) released by PaddleSpeech, both command line and python interfaces are available: - -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16000 -| transformer_librispeech| en| 16000 +Get all models supported by the ASR service via `paddlespeech_server stats --task asr`, where static models can be used for paddle inference inference. ### TTS model -Here is a list of [TTS pretrained models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/text_to_speech/README.md#4-pretrained-models) released by PaddleSpeech, both command line and python interfaces are available: - -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - | fastspeech2_aishell3| zh - | fastspeech2_ljspeech| en - | fastspeech2_vctk| en - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | pwgan_aishell3| zh - | pwgan_ljspeech| en - | pwgan_vctk| en - | mb_melgan_csmsc| zh - -Here is a list of **TTS pretrained static models** released by PaddleSpeech, both command line and python interfaces are available: -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | mb_melgan_csmsc| zh - | hifigan_csmsc| zh +Get all models supported by the TTS service via `paddlespeech_server stats --task tts`, where static models can be used for paddle inference inference. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index f5666070..762248a1 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -10,10 +10,16 @@ ### 1. 安装 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). -你可以从 easy,medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 +推荐使用 **paddlepaddle 2.2.1** 或以上版本。 +你可以从 medium,hard 三中方式中选择一种方式安装 PaddleSpeech。 + ### 2. 准备配置文件 -配置文件包含服务相关的配置文件和服务中包含的语音任务相关的模型配置。 它们都在 `conf` 文件夹下。 +配置文件可参见 `conf/application.yaml` 。 +其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 +目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)。 +目前引擎类型支持两种形式:python 及 inference (Paddle Inference) + 这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 @@ -75,6 +81,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 4. ASR客户端使用方法 +**注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav @@ -123,9 +130,12 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` ### 5. 
TTS客户端使用方法 - ```bash - paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav - ``` +**注意:** 初次使用客户端时响应时间会略长 +- 命令行 (推荐使用) + + ```bash + paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav + ``` 使用帮助: ```bash @@ -148,7 +158,6 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav. [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s. [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s. - [2022-02-23 15:20:37,875] [ INFO] - RTF: 0.096346 ``` - Python API @@ -173,50 +182,12 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Save synthesized audio successfully on ./output.wav. Audio duration: 3.612500 s. Response time: 0.388317 s. - RTF: 0.107493 ``` -## Pretrained Models -### ASR model -下面是PaddleSpeech发布的[ASR预训练模型](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README.md#4pretrained-models)列表,命令行和python接口均可用: - -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16000 -| transformer_librispeech| en| 16000 - -### TTS model -下面是PaddleSpeech发布的 [TTS预训练模型](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/text_to_speech/README.md#4-pretrained-models) 列表,命令行和python接口均可用: - -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - | fastspeech2_aishell3| zh - | fastspeech2_ljspeech| en - | fastspeech2_vctk| en - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | pwgan_aishell3| zh - | pwgan_ljspeech| en - | pwgan_vctk| en - | mb_melgan_csmsc| zh - -下面是PaddleSpeech发布的 **TTS预训练静态模型** 列表,命令行和python接口均可用: -- Acoustic model - | Model | Language - | :--- | :---: | - | speedyspeech_csmsc| zh - | fastspeech2_csmsc| zh - -- Vocoder - | Model | Language - | :--- | :---: | - | pwgan_csmsc| zh - | mb_melgan_csmsc| zh - | hifigan_csmsc| zh +## 服务支持的模型 +### ASR支持的模型 +通过 `paddlespeech_server stats --task asr` 获取ASR服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 + +### TTS支持的模型 +通过 `paddlespeech_server stats --task tts` 获取TTS服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index fd4f5f37..6048450b 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -1,25 +1,107 @@ # This is the parameter configuration file for PaddleSpeech Serving. -################################################################## -# SERVER SETTING # -################################################################## -host: '0.0.0.0' +################################################################################# +# SERVER SETTING # +################################################################################# +host: 127.0.0.1 port: 8090 -################################################################## -# CONFIG FILE # -################################################################## -# The engine_type of speech task needs to keep the same type as the config file of speech task. 
-# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml' -# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml' -# -# add engine type (Options: python, inference) -engine_type: - asr: 'inference' - tts: 'inference' - -# add engine backend type (Options: asr, tts) and config file here. -# Adding a speech task to engine_backend means starting the service. -engine_backend: - asr: 'conf/asr/asr_pd.yaml' - tts: 'conf/tts/tts_pd.yaml' +# The task format in the engin_list is: _ +# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] + +engine_list: ['asr_python', 'tts_python'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# +################### speech task: asr; engine_type: python ####################### +asr_python: + model: 'conformer_wenetspeech' + lang: 'zh' + sample_rate: 16000 + cfg_path: # [optional] + ckpt_path: # [optional] + decode_method: 'attention_rescoring' + force_yes: True + device: # set 'gpu:id' or 'cpu' + + +################### speech task: asr; engine_type: inference ####################### +asr_inference: + # model_type choices=['deepspeech2offline_aishell'] + model_type: 'deepspeech2offline_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################### speech task: tts; engine_type: python ####################### +tts_python: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', + # 'fastspeech2_ljspeech', 'fastspeech2_aishell3', + # 'fastspeech2_vctk'] + am: 'fastspeech2_csmsc' + am_config: + am_ckpt: + am_stat: + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', + # 'pwgan_vctk', 'mb_melgan_csmsc'] + voc: 'pwgan_csmsc' + voc_config: + voc_ckpt: + voc_stat: + + # others + lang: 'zh' + device: # set 'gpu:id' or 'cpu' + + +################### speech task: tts; engine_type: inference ####################### +tts_inference: + # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] + am: 'fastspeech2_csmsc' + am_model: # the pdmodel file of your am static model (XX.pdmodel) + am_params: # the pdiparams file of your am static model (XX.pdipparams) + am_sample_rate: 24000 + phones_dict: + tones_dict: + speaker_dict: + spk_id: 0 + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] + voc: 'pwgan_csmsc' + voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) + voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) + voc_sample_rate: 24000 + + voc_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + # others + lang: 'zh' + diff --git a/demos/speech_server/conf/asr/asr.yaml b/demos/speech_server/conf/asr/asr.yaml deleted file 
mode 100644 index 1a805142..00000000 --- a/demos/speech_server/conf/asr/asr.yaml +++ /dev/null @@ -1,8 +0,0 @@ -model: 'conformer_wenetspeech' -lang: 'zh' -sample_rate: 16000 -cfg_path: # [optional] -ckpt_path: # [optional] -decode_method: 'attention_rescoring' -force_yes: True -device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/asr/asr_pd.yaml b/demos/speech_server/conf/asr/asr_pd.yaml deleted file mode 100644 index 6cddb450..00000000 --- a/demos/speech_server/conf/asr/asr_pd.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# This is the parameter configuration file for ASR server. -# These are the static models that support paddle inference. - -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['deepspeech2offline_aishell'] TODO -################################################################## -model_type: 'deepspeech2offline_aishell' -am_model: # the pdmodel file of am static model [optional] -am_params: # the pdiparams file of am static model [optional] -lang: 'zh' -sample_rate: 16000 -cfg_path: -decode_method: -force_yes: True - -am_predictor_conf: - device: 'cpu' # set 'gpu:id' or 'cpu' - enable_mkldnn: True - switch_ir_optim: True - - -################################################################## -# OTHERS # -################################################################## diff --git a/demos/speech_server/conf/tts/tts.yaml b/demos/speech_server/conf/tts/tts.yaml deleted file mode 100644 index 19e8874e..00000000 --- a/demos/speech_server/conf/tts/tts.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# This is the parameter configuration file for TTS server. - -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', -# 'fastspeech2_ljspeech', 'fastspeech2_aishell3', -# 'fastspeech2_vctk'] -################################################################## -am: 'fastspeech2_csmsc' -am_config: -am_ckpt: -am_stat: -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', -# 'pwgan_vctk', 'mb_melgan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_config: -voc_ckpt: -voc_stat: - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' -device: 'cpu' # set 'gpu:id' or 'cpu' diff --git a/demos/speech_server/conf/tts/tts_pd.yaml b/demos/speech_server/conf/tts/tts_pd.yaml deleted file mode 100644 index 97df5261..00000000 --- a/demos/speech_server/conf/tts/tts_pd.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# This is the parameter configuration file for TTS server. -# These are the static models that support paddle inference. 
- -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] -################################################################## -am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of your am static model (XX.pdmodel) -am_params: # the pdiparams file of your am static model (XX.pdipparams) -am_sample_rate: 24000 -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -am_predictor_conf: - device: 'cpu' # set 'gpu:id' or 'cpu' - enable_mkldnn: False - switch_ir_optim: False - - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) -voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) -voc_sample_rate: 24000 - -voc_predictor_conf: - device: 'cpu' # set 'gpu:id' or 'cpu' - enable_mkldnn: False - switch_ir_optim: False - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' diff --git a/demos/speech_server/server.sh b/demos/speech_server/server.sh index d9367ec0..e5961286 100644 --- a/demos/speech_server/server.sh +++ b/demos/speech_server/server.sh @@ -1,3 +1,3 @@ #!/bin/bash -paddlespeech_server start --config_file ./conf/application.yaml \ No newline at end of file +paddlespeech_server start --config_file ./conf/application.yaml diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 9d3c4ac5..2df72a82 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -17,11 +17,14 @@ The input of this demo should be a text of the specific language that can be pas ### 3. Usage - Command Line (Recommended) - Chinese - The default acoustic model is `Fastspeech2`, and the default vocoder is `Parallel WaveGAN`. ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - Batch Process + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - Chinese, use `SpeedySpeech` as the acoustic model ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index f075efda..7e02b962 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -24,6 +24,10 @@ ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" ``` + - 批处理 + ```bash + echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + ``` - 中文,使用 `SpeedySpeech` 作为声学模型 ```bash paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
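In the batch-process example above, each input line is `<utterance id> <sentence>` and the sentences are synthesized one after another. The same loop can be written with the Python API; the sketch below assumes `TTSExecutor` accepts an `output` path per call and that its remaining keyword defaults match the CLI (both are assumptions — check your installed version):

```python
import paddle
from paddlespeech.cli import TTSExecutor

tts_executor = TTSExecutor()

# Mirrors the "<utt_id> <sentence>" lines piped to `paddlespeech tts` above.
sentences = [('1', '欢迎光临。'), ('2', '谢谢惠顾。')]
for utt_id, sentence in sentences:
    tts_executor(
        text=sentence,
        output='{}.wav'.format(utt_id),  # one wav per input line (assumed naming)
        device=paddle.get_device())
```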
diff --git a/demos/text_to_speech/run.sh b/demos/text_to_speech/run.sh index c2487aee..b1340241 100755 --- a/demos/text_to_speech/run.sh +++ b/demos/text_to_speech/run.sh @@ -1,3 +1,7 @@ #!/bin/bash +# single process paddlespeech tts --input 今天的天气不错啊 + +# Batch process +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts \ No newline at end of file diff --git a/docs/topic/ctc/ctc_loss_speed_compare.ipynb b/docs/topic/ctc/ctc_loss_speed_compare.ipynb new file mode 100644 index 00000000..eb7a030c --- /dev/null +++ b/docs/topic/ctc/ctc_loss_speed_compare.ipynb @@ -0,0 +1,369 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1e738e0", + "metadata": {}, + "source": [ + "## 获取测试的 logit 数据" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "29d3368b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hlens.npy\n", + "logits.npy\n", + "ys_lens.npy\n", + "ys_pad.npy\n" + ] + } + ], + "source": [ + "!mkdir -p ./test_data\n", + "!test -f ./test_data/ctc_loss_compare_data.tgz || wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/ctc_loss_compare_data.tgz\n", + "!tar xzvf test_data/ctc_loss_compare_data.tgz -C ./test_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "240caf1d", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import time\n", + "\n", + "data_dir=\"./test_data\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "91bad949", + "metadata": {}, + "outputs": [], + "source": [ + "logits_np = np.load(os.path.join(data_dir, \"logits.npy\"))\n", + "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n", + "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n", + "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))" + ] + }, + { + "cell_type": "markdown", + "id": "4cef2f15", + "metadata": {}, + "source": [ + "## 使用 torch 的 ctc loss" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "90612004", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.10.1+cu102'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "torch.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "00799f97", + "metadata": {}, + "outputs": [], + "source": [ + "def torch_ctc_loss(use_cpu):\n", + " if use_cpu:\n", + " device = torch.device(\"cpu\")\n", + " else:\n", + " device = torch.device(\"cuda\")\n", + "\n", + " reduction_type = \"sum\" \n", + "\n", + " ctc_loss = torch.nn.CTCLoss(reduction=reduction_type)\n", + "\n", + " ys_hat = torch.tensor(logits_np, device = device)\n", + " ys_pad = torch.tensor(ys_pad_np, device = device)\n", + " hlens = torch.tensor(hlens_np, device = device)\n", + " ys_lens = torch.tensor(ys_lens_np, device = device)\n", + "\n", + " ys_hat = ys_hat.transpose(0, 1)\n", + " \n", + " # 开始计算时间\n", + " start_time = time.time()\n", + " ys_hat = ys_hat.log_softmax(2)\n", + " loss = ctc_loss(ys_hat, ys_pad, hlens, ys_lens)\n", + " end_time = time.time()\n", + " \n", + " loss = loss / ys_hat.size(1)\n", + " return end_time - start_time, loss.item()" + ] + }, + { + "cell_type": "markdown", + "id": "ba47b5a4", + "metadata": {}, + "source": [ + "## 使用 paddle 的 ctc loss" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6882a06e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2.2.2'" + ] + }, + "execution_count": 6, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import paddle\n", + "paddle.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3cfa3b7c", + "metadata": {}, + "outputs": [], + "source": [ + "def paddle_ctc_loss(use_cpu): \n", + " import paddle.nn as pn\n", + " if use_cpu:\n", + " device = \"cpu\"\n", + " else:\n", + " device = \"gpu\"\n", + "\n", + " paddle.set_device(device)\n", + "\n", + " logits = paddle.to_tensor(logits_np)\n", + " ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n", + " hlens = paddle.to_tensor(hlens_np, dtype='int64')\n", + " ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n", + "\n", + " logits = logits.transpose([1,0,2])\n", + "\n", + " ctc_loss = pn.CTCLoss(reduction='sum')\n", + " # 开始计算时间\n", + " start_time = time.time()\n", + " pn_loss = ctc_loss(logits, ys_pad, hlens, ys_lens)\n", + " end_time = time.time()\n", + " \n", + " pn_loss = pn_loss / logits.shape[1]\n", + " return end_time - start_time, pn_loss.item()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "40413ef9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU, iteration 10\n", + "torch_ctc_loss 159.17137145996094\n", + "paddle_ctc_loss 159.16574096679688\n", + "paddle average time 1.718252992630005\n", + "torch average time 0.17536230087280275\n", + "paddle time / torch time (cpu) 9.798303193320452\n", + "\n", + "GPU, iteration 10\n", + "torch_ctc_loss 159.172119140625\n", + "paddle_ctc_loss 159.17205810546875\n", + "paddle average time 0.018606925010681154\n", + "torch average time 0.0026710033416748047\n", + "paddle time / torch time (gpu) 6.966267963938231\n" + ] + } + ], + "source": [ + "# 使用 CPU\n", + "\n", + "iteration = 10\n", + "use_cpu = True\n", + "torch_total_time = 0\n", + "paddle_total_time = 0\n", + "for _ in range(iteration):\n", + " cost_time, torch_loss = torch_ctc_loss(use_cpu)\n", + " torch_total_time += cost_time\n", + "for _ in range(iteration):\n", + " cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n", + " paddle_total_time += cost_time\n", + "print (\"CPU, iteration\", iteration)\n", + "print (\"torch_ctc_loss\", torch_loss)\n", + "print (\"paddle_ctc_loss\", paddle_loss)\n", + "print (\"paddle average time\", paddle_total_time / iteration)\n", + "print (\"torch average time\", torch_total_time / iteration)\n", + "print (\"paddle time / torch time (cpu)\" , paddle_total_time/ torch_total_time)\n", + "\n", + "print (\"\")\n", + "\n", + "# 使用 GPU\n", + "\n", + "use_cpu = False\n", + "torch_total_time = 0\n", + "paddle_total_time = 0\n", + "for _ in range(iteration):\n", + " cost_time, torch_loss = torch_ctc_loss(use_cpu)\n", + " torch_total_time += cost_time\n", + "for _ in range(iteration):\n", + " cost_time, paddle_loss = paddle_ctc_loss(use_cpu)\n", + " paddle_total_time += cost_time\n", + "print (\"GPU, iteration\", iteration)\n", + "print (\"torch_ctc_loss\", torch_loss)\n", + "print (\"paddle_ctc_loss\", paddle_loss)\n", + "print (\"paddle average time\", paddle_total_time / iteration)\n", + "print (\"torch average time\", torch_total_time / iteration)\n", + "print (\"paddle time / torch time (gpu)\" , paddle_total_time/ torch_total_time)" + ] + }, + { + "cell_type": "markdown", + "id": "7cdf8697", + "metadata": {}, + "source": [ + "## 其他: 使用 PaddleSpeech 中的 ctcloss 查一下loss值" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "73fad81d", + "metadata": {}, + "outputs": [], + "source": [ + "logits_np = 
np.load(os.path.join(data_dir, \"logits.npy\"))\n", + "ys_pad_np = np.load(os.path.join(data_dir, \"ys_pad.npy\"))\n", + "hlens_np = np.load(os.path.join(data_dir, \"hlens.npy\"))\n", + "ys_lens_np = np.load(os.path.join(data_dir, \"ys_lens.npy\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b41e45d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:41 - CTCLoss Loss reduction: sum, div-bs: True\n", + "2022-02-25 11:34:34.143 | INFO | paddlespeech.s2t.modules.loss:__init__:42 - CTCLoss Grad Norm Type: instance\n", + "2022-02-25 11:34:34.144 | INFO | paddlespeech.s2t.modules.loss:__init__:73 - CTCLoss() kwargs:{'norm_by_times': True}, not support: {'norm_by_batchsize': False, 'norm_by_total_logits_len': False}\n", + "loss 159.17205810546875\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniconda3/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:253: UserWarning: The dtype of left and right variables are not the same, left dtype is paddle.float32, but right dtype is paddle.int32, the right dtype will convert to paddle.float32\n", + " format(lhs_dtype, rhs_dtype, lhs_dtype))\n" + ] + } + ], + "source": [ + "use_cpu = False\n", + "\n", + "from paddlespeech.s2t.modules.loss import CTCLoss\n", + "\n", + "if use_cpu:\n", + " device = \"cpu\"\n", + "else:\n", + " device = \"gpu\"\n", + "\n", + "paddle.set_device(device)\n", + "\n", + "blank_id=0\n", + "reduction_type='sum'\n", + "batch_average= True\n", + "grad_norm_type='instance'\n", + "\n", + "criterion = CTCLoss(\n", + " blank=blank_id,\n", + " reduction=reduction_type,\n", + " batch_average=batch_average,\n", + " grad_norm_type=grad_norm_type)\n", + "\n", + "logits = paddle.to_tensor(logits_np)\n", + "ys_pad = paddle.to_tensor(ys_pad_np,dtype='int32')\n", + "hlens = paddle.to_tensor(hlens_np, dtype='int64')\n", + "ys_lens = paddle.to_tensor(ys_lens_np, dtype='int64')\n", + "\n", + "pn_ctc_loss = criterion(logits, ys_pad, hlens, ys_lens)\n", + "print(\"loss\", pn_ctc_loss.item())\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "de525d38", + "metadata": {}, + "source": [ + "## 结论\n", + "在 CPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 9.8 倍 \n", + "在 GPU 环境下: torch 的 CTC loss 的计算速度是 paddle 的 6.87 倍\n", + "\n", + "## 其他结论\n", + "torch 的 ctc loss 在 CPU 和 GPU 下 都没有完全对齐。其中CPU的前向对齐精度大约为 1e-2。 GPU 的前向对齐精度大约为 1e-4 。" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 281ad836..d02ad1b6 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -225,7 +225,9 @@ optional arguments: 9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. ## Pretrained Model -Pretrained FastSpeech2 model with no silence in the edge of audios. 
[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +Pretrained FastSpeech2 model with no silence in the edge of audios: +- [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip) +- [fastspeech2_conformer_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_aishell3_ckpt_0.2.0.zip) (Thanks for [@awmmmm](https://github.com/awmmmm)'s contribution) FastSpeech2 checkpoint contains files listed below. diff --git a/examples/aishell3/tts3/conf/conformer.yaml b/examples/aishell3/tts3/conf/conformer.yaml new file mode 100644 index 00000000..ea73593d --- /dev/null +++ b/examples/aishell3/tts3/conf/conformer.yaml @@ -0,0 +1,110 @@ +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Maximum f0 for pitch extraction. +f0max: 400 # Minimum f0 for pitch extraction. + + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 32 +num_workers: 4 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + encoder_type: conformer # encoder type + decoder_type: conformer # decoder type + conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type + conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type + conformer_activation_type: swish # conformer activation type + use_macaron_style_in_conformer: true # whether to use macaron style in conformer + use_cnn_in_conformer: true # whether to use CNN in conformer + conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder + conformer_dec_kernel_size: 31 # kernel size in CNN module of 
conformer-based decoder + init_type: xavier_uniform # initialization type + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer + transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding + transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder + spk_embed_dim: 256 # speaker embedding dimension + spk_embed_integration_type: concat # speaker embedding integration type + + +########################################################### +# UPDATER SETTING # +########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/csmsc/tts0/local/synthesize.sh b/examples/csmsc/tts0/local/synthesize.sh index 4be06dd8..5b8ed15e 100755 --- a/examples/csmsc/tts0/local/synthesize.sh +++ b/examples/csmsc/tts0/local/synthesize.sh @@ -3,18 +3,98 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 +stage=0 +stop_stage=0 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=tacotron2_csmsc \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - 
--output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=tacotron2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + 
--output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh index 79bb9f83..f7675873 100755 --- a/examples/csmsc/tts0/local/synthesize_e2e.sh +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -8,6 +8,7 @@ stage=0 stop_stage=0 # TODO: tacotron2 动转静的结果没有静态图的响亮, 可能还是 decode 的时候某个函数动静不对齐 +# pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -39,14 +40,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ --am_stat=dump/train/speech_stats.npy \ --voc=mb_melgan_csmsc \ - --voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \ - --voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz\ - --voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # the pretrained models haven't release now @@ -88,8 +89,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # wavernn @@ -111,4 +112,4 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --output_dir=${train_output_path}/test_e2e \ --phones_dict=dump/phone_id_map.txt \ --inference_dir=${train_output_path}/inference -fi \ No newline at end of file +fi diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index cedc9717..37b29818 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -1,20 +1,105 @@ #!/bin/bash + config_path=$1 train_output_path=$2 ckpt_name=$3 +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + 
--voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --tones_dict=dump/tone_id_map.txt +fi -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=speedyspeech_csmsc \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/feats_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt \ No newline at end of file +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=speedyspeech_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --tones_dict=dump/tone_id_map.txt \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh index 35fcf251..553b4554 100755 --- a/examples/csmsc/tts2/local/synthesize_e2e.sh +++ b/examples/csmsc/tts2/local/synthesize_e2e.sh @@ -7,6 +7,7 @@ ckpt_name=$3 
stage=0 stop_stage=0 +# pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -22,9 +23,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # for more GAN Vocoders @@ -44,9 +45,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # the pretrained models haven't release now @@ -88,12 +89,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ --phones_dict=dump/phone_id_map.txt \ - --tones_dict=dump/tone_id_map.txt + --tones_dict=dump/tone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi - # wavernn if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then echo "in wavernn syn_e2e" diff --git a/examples/csmsc/tts3/local/synthesize.sh b/examples/csmsc/tts3/local/synthesize.sh index 19767426..043bb52f 100755 --- a/examples/csmsc/tts3/local/synthesize.sh +++ b/examples/csmsc/tts3/local/synthesize.sh @@ -3,18 +3,98 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 +stage=0 +stop_stage=0 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_csmsc \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_csmsc \ - --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ - --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + 
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# style melgan +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# wavernn +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + echo "in wavernn syn" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=wavernn_csmsc \ + --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ + --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ + --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh index 44356e4b..512e062b 100755 --- a/examples/csmsc/tts3/local/synthesize_e2e.sh +++ b/examples/csmsc/tts3/local/synthesize_e2e.sh @@ -7,6 +7,7 @@ ckpt_name=$3 stage=0 stop_stage=0 +# pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ FLAGS_fraction_of_gpu_memory_to_use=0.01 \ @@ -22,8 +23,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + --inference_dir=${train_output_path}/inference fi # for more GAN Vocoders @@ -43,8 +44,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --lang=zh \ --text=${BIN_DIR}/../sentences.txt \ --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt + --phones_dict=dump/phone_id_map.txt \ + 
--inference_dir=${train_output_path}/inference
fi

# the pretrained models haven't release now
@@ -86,8 +87,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --lang=zh \
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e \
-        --inference_dir=${train_output_path}/inference \
-        --phones_dict=dump/phone_id_map.txt
+        --phones_dict=dump/phone_id_map.txt \
+        --inference_dir=${train_output_path}/inference
 fi

diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md
index c0f55bd4..141f7f74 100644
--- a/examples/other/g2p/README.md
+++ b/examples/other/g2p/README.md
@@ -10,7 +10,7 @@ Run the command below to get the results of the test.
 ```bash
 ./run.sh
 ```
-The `avg WER` of g2p is: 0.027124048652822204
+The `avg WER` of g2p is: 0.026014352515701198
 ```text
 ,--------------------------------------------------------------------.
 |                    |  # Snt   # Wrd  |  Corr    Sub    Del    Ins    Err  S.Err |
diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md
index 825c32f0..91b0fef0 100644
--- a/paddleaudio/CHANGELOG.md
+++ b/paddleaudio/CHANGELOG.md
@@ -1 +1,5 @@
 # Changelog
+
+Date: 2022-02-25, Author: Hui Zhang.
+ - Refactored the package architecture.
+ - Added DTW distance and MCD-style DTW.
diff --git a/paddleaudio/features/augment.py b/paddleaudio/features/augment.py
deleted file mode 100644
index 6f903bdb..00000000
--- a/paddleaudio/features/augment.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import List
-
-import numpy as np
-from numpy import ndarray as array
-
-from ..backends import depth_convert
-from ..utils import ParameterError
-
-__all__ = [
-    'depth_augment',
-    'spect_augment',
-    'random_crop1d',
-    'random_crop2d',
-    'adaptive_spect_augment',
-]
-
-
-def randint(high: int) -> int:
-    """Generate one random integer in range [0 high)
-
-    This is a helper function for random data augmentaiton
-    """
-    return int(np.random.randint(0, high=high))
-
-
-def rand() -> float:
-    """Generate one floating-point number in range [0 1)
-
-    This is a helper function for random data augmentaiton
-    """
-    return float(np.random.rand(1))
-
-
-def depth_augment(y: array,
-                  choices: List=['int8', 'int16'],
-                  probs: List[float]=[0.5, 0.5]) -> array:
-    """ Audio depth augmentation
-
-    Do audio depth augmentation to simulate the distortion brought by quantization.
- """ - assert len(probs) == len( - choices - ), 'number of choices {} must be equal to size of probs {}'.format( - len(choices), len(probs)) - depth = np.random.choice(choices, p=probs) - src_depth = y.dtype - y1 = depth_convert(y, depth) - y2 = depth_convert(y1, src_depth) - - return y2 - - -def adaptive_spect_augment(spect: array, tempo_axis: int=0, - level: float=0.1) -> array: - """Do adpative spectrogram augmentation - - The level of the augmentation is gowern by the paramter level, - ranging from 0 to 1, with 0 represents no augmentation。 - - """ - assert spect.ndim == 2., 'only supports 2d tensor or numpy array' - if tempo_axis == 0: - nt, nf = spect.shape - else: - nf, nt = spect.shape - - time_mask_width = int(nt * level * 0.5) - freq_mask_width = int(nf * level * 0.5) - - num_time_mask = int(10 * level) - num_freq_mask = int(10 * level) - - if tempo_axis == 0: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[start:start + time_mask_width, :] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[:, start:start + freq_mask_width] = 0 - else: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[:, start:start + time_mask_width] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[start:start + freq_mask_width, :] = 0 - - return spect - - -def spect_augment(spect: array, - tempo_axis: int=0, - max_time_mask: int=3, - max_freq_mask: int=3, - max_time_mask_width: int=30, - max_freq_mask_width: int=20) -> array: - """Do spectrogram augmentation in both time and freq axis - - Reference: - - """ - assert spect.ndim == 2., 'only supports 2d tensor or numpy array' - if tempo_axis == 0: - nt, nf = spect.shape - else: - nf, nt = spect.shape - - num_time_mask = randint(max_time_mask) - num_freq_mask = randint(max_freq_mask) - - time_mask_width = randint(max_time_mask_width) - freq_mask_width = randint(max_freq_mask_width) - - if tempo_axis == 0: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[start:start + time_mask_width, :] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[:, start:start + freq_mask_width] = 0 - else: - for _ in range(num_time_mask): - start = randint(nt - time_mask_width) - spect[:, start:start + time_mask_width] = 0 - for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) - spect[start:start + freq_mask_width, :] = 0 - - return spect - - -def random_crop1d(y: array, crop_len: int) -> array: - """ Do random cropping on 1d input signal - - The input is a 1d signal, typically a sound waveform - """ - if y.ndim != 1: - 'only accept 1d tensor or numpy array' - n = len(y) - idx = randint(n - crop_len) - return y[idx:idx + crop_len] - - -def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: - """ Do random cropping for 2D array, typically a spectrogram. - - The cropping is done in temporal direction on the time-freq input signal. - """ - if tempo_axis >= s.ndim: - raise ParameterError('axis out of range') - - n = s.shape[tempo_axis] - idx = randint(high=n - crop_len) - sli = [slice(None) for i in range(s.ndim)] - sli[tempo_axis] = slice(idx, idx + crop_len) - out = s[tuple(sli)] - return out diff --git a/paddleaudio/features/spectrum.py b/paddleaudio/features/spectrum.py deleted file mode 100644 index 154b6484..00000000 --- a/paddleaudio/features/spectrum.py +++ /dev/null @@ -1,461 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -from functools import partial -from typing import Optional -from typing import Union - -import paddle -import paddle.nn as nn - -from .window import get_window - -__all__ = [ - 'Spectrogram', - 'MelSpectrogram', - 'LogMelSpectrogram', -] - - -def hz_to_mel(freq: Union[paddle.Tensor, float], - htk: bool=False) -> Union[paddle.Tensor, float]: - """Convert Hz to Mels. - Parameters: - freq: the input tensor of arbitrary shape, or a single floating point number. - htk: use HTK formula to do the conversion. - The default value is False. - Returns: - The frequencies represented in Mel-scale. - """ - - if htk: - if isinstance(freq, paddle.Tensor): - return 2595.0 * paddle.log10(1.0 + freq / 700.0) - else: - return 2595.0 * math.log10(1.0 + freq / 700.0) - - # Fill in the linear part - f_min = 0.0 - f_sp = 200.0 / 3 - - mels = (freq - f_min) / f_sp - - # Fill in the log-scale part - - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - - if isinstance(freq, paddle.Tensor): - target = min_log_mel + paddle.log( - freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 - mask = (freq > min_log_hz).astype(freq.dtype) - mels = target * mask + mels * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if freq >= min_log_hz: - mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep - - return mels - - -def mel_to_hz(mel: Union[float, paddle.Tensor], - htk: bool=False) -> Union[float, paddle.Tensor]: - """Convert mel bin numbers to frequencies. - Parameters: - mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. - htk: use HTK formula to do the conversion. - Returns: - The frequencies represented in hz. - """ - if htk: - return 700.0 * (10.0**(mel / 2595.0) - 1.0) - - f_min = 0.0 - f_sp = 200.0 / 3 - freqs = f_min + f_sp * mel - # And now the nonlinear scale - min_log_hz = 1000.0 # beginning of log region (Hz) - min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) - logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, paddle.Tensor): - target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) - mask = (mel > min_log_mel).astype(mel.dtype) - freqs = target * mask + freqs * ( - 1 - mask) # will replace by masked_fill OP in future - else: - if mel >= min_log_mel: - freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) - - return freqs - - -def mel_frequencies(n_mels: int=64, - f_min: float=0.0, - f_max: float=11025.0, - htk: bool=False, - dtype: str=paddle.float32): - """Compute mel frequencies. - Parameters: - n_mels(int): number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk(bool): whether to use htk formula. - dtype(str): the datatype of the return frequencies. 
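For reference, `hz_to_mel` and `mel_to_hz` above implement the Slaney mel scale, which is linear below 1 kHz and logarithmic above it, with the HTK formula available via `htk=True`:

$$
m(f) = \begin{cases} \dfrac{3f}{200}, & f < 1000 \\ 15 + \dfrac{27 \ln(f/1000)}{\ln 6.4}, & f \ge 1000 \end{cases}
\qquad
m_{\mathrm{HTK}}(f) = 2595 \log_{10}\!\left(1 + \frac{f}{700}\right)
$$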
- Returns: - The frequencies represented in Mel-scale - """ - # 'Center freqs' of mel bands - uniformly spaced between limits - min_mel = hz_to_mel(f_min, htk=htk) - max_mel = hz_to_mel(f_max, htk=htk) - mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) - freqs = mel_to_hz(mels, htk=htk) - return freqs - - -def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): - """Compute fourier frequencies. - Parameters: - sr(int): the audio sample rate. - n_fft(float): the number of fft bins. - dtype(str): the datatype of the return frequencies. - Returns: - The frequencies represented in hz. - """ - return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) - - -def compute_fbank_matrix(sr: int, - n_fft: int, - n_mels: int=64, - f_min: float=0.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - dtype: str=paddle.float32): - """Compute fbank matrix. - Parameters: - sr(int): the audio sample rate. - n_fft(int): the number of fft bins. - n_mels(int): the number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk: whether to use htk formula. - return_complex(bool): whether to return complex matrix. If True, the matrix will - be complex type. Otherwise, the real and image part will be stored in the last - axis of returned tensor. - dtype(str): the datatype of the returned fbank matrix. - Returns: - The fbank matrix of shape (n_mels, int(1+n_fft//2)). - Shape: - output: (n_mels, int(1+n_fft//2)) - """ - - if f_max is None: - f_max = float(sr) / 2 - - # Initialize the weights - weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) - - # Center freqs of each FFT bin - fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) - - # 'Center freqs' of mel bands - uniformly spaced between limits - mel_f = mel_frequencies( - n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype) - - fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) - ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) - #ramps = np.subtract.outer(mel_f, fftfreqs) - - for i in range(n_mels): - # lower and upper slopes for all bins - lower = -ramps[i] / fdiff[i] - upper = ramps[i + 2] / fdiff[i + 1] - - # .. then intersect them with each other and zero - weights[i] = paddle.maximum( - paddle.zeros_like(lower), paddle.minimum(lower, upper)) - - # Slaney-style mel is scaled to be approx constant energy per channel - if norm == 'slaney': - enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) - weights *= enorm.unsqueeze(1) - elif isinstance(norm, int) or isinstance(norm, float): - weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) - - return weights - - -def power_to_db(magnitude: paddle.Tensor, - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None) -> paddle.Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. - The function computes the scaling ``10 * log10(x / ref)`` in a numerically - stable way. - Parameters: - magnitude(Tensor): the input magnitude tensor of any shape. - ref_value(float): the reference value. If smaller than 1.0, the db level - of the signal will be pulled up accordingly. Otherwise, the db level - is pushed down. - amin(float): the minimum value of input magnitude, below which the input - magnitude is clipped(to amin). - top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). 
- Returns: - The spectrogram in log-scale. - shape: - input: any shape - output: same as input - """ - if amin <= 0: - raise Exception("amin must be strictly positive") - - if ref_value <= 0: - raise Exception("ref_value must be strictly positive") - - ones = paddle.ones_like(magnitude) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) - log_spec -= 10.0 * math.log10(max(ref_value, amin)) - - if top_db is not None: - if top_db < 0: - raise Exception("top_db must be non-negative") - log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) - - return log_spec - - -class Spectrogram(nn.Layer): - def __init__(self, - n_fft: int=512, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str='hann', - center: bool=True, - pad_mode: str='reflect', - dtype: str=paddle.float32): - """Compute spectrogram of a given signal, typically an audio waveform. - The spectorgram is defined as the complex norm of the short-time - Fourier transformation. - Parameters: - n_fft(int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. The default value is 'reflect'. - dtype(str): the data type of input and window. - Notes: - The Spectrogram transform relies on STFT transform to compute the spectrogram. - By default, the weights are not learnable. To fine-tune the Fourier coefficients, - set stop_gradient=False before training. - For more information, see STFT(). - """ - super(Spectrogram, self).__init__() - - if win_length is None: - win_length = n_fft - - fft_window = get_window(window, win_length, fftbins=True, dtype=dtype) - self._stft = partial( - paddle.signal.stft, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=fft_window, - center=center, - pad_mode=pad_mode) - - def forward(self, x): - stft = self._stft(x) - spectrogram = paddle.square(paddle.abs(stft)) - return spectrogram - - -class MelSpectrogram(nn.Layer): - def __init__(self, - sr: int=22050, - n_fft: int=512, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str='hann', - center: bool=True, - pad_mode: str='reflect', - n_mels: int=64, - f_min: float=50.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - dtype: str=paddle.float32): - """Compute the melspectrogram of a given signal, typically an audio waveform. - The melspectrogram is also known as filterbank or fbank feature in audio community. - It is computed by multiplying spectrogram with Mel filter bank matrix. - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_fft(int): the number of frequency components of the discrete Fourier transform. 
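`power_to_db` survives the refactor: the new `features/layers.py` later in this diff imports it from `paddleaudio.functional`. A hedged usage sketch against that assumed location:

```python
# Hedged sketch; paddleaudio.functional.power_to_db is the assumed new home
# (features/layers.py below imports it from ..functional).
import paddle
from paddleaudio.functional import power_to_db

power_spec = paddle.rand([64, 100]) * 1e3            # fake power spectrogram
log_spec = power_to_db(power_spec, ref_value=1.0,    # 10 * log10(x / ref)
                       amin=1e-10, top_db=80.0)      # floor at max - 80 dB
```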
- The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels(int): the mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zeros. - htk(bool): whether to use HTK formula in computing fbank matrix. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ - super(MelSpectrogram, self).__init__() - - self._spectrogram = Spectrogram( - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - center=center, - pad_mode=pad_mode, - dtype=dtype) - self.n_mels = n_mels - self.f_min = f_min - self.f_max = f_max - self.htk = htk - self.norm = norm - if f_max is None: - f_max = sr // 2 - self.fbank_matrix = compute_fbank_matrix( - sr=sr, - n_fft=n_fft, - n_mels=n_mels, - f_min=f_min, - f_max=f_max, - htk=htk, - norm=norm, - dtype=dtype) # float64 for better numerical results - self.register_buffer('fbank_matrix', self.fbank_matrix) - - def forward(self, x): - spect_feature = self._spectrogram(x) - mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) - return mel_feature - - -class LogMelSpectrogram(nn.Layer): - def __init__(self, - sr: int=22050, - n_fft: int=512, - hop_length: Optional[int]=None, - win_length: Optional[int]=None, - window: str='hann', - center: bool=True, - pad_mode: str='reflect', - n_mels: int=64, - f_min: float=50.0, - f_max: Optional[float]=None, - htk: bool=False, - norm: Union[str, float]='slaney', - ref_value: float=1.0, - amin: float=1e-10, - top_db: Optional[float]=None, - dtype: str=paddle.float32): - """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, - typically an audio waveform. - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_fft(int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform. 
- The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels(int): the mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zeros. - ref_value(float): the reference value. If smaller than 1.0, the db level - htk(bool): whether to use HTK formula in computing fbank matrix. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - amin(float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. - Otherwise, the db level is pushed down. - magnitude is clipped(to amin). For numerical stability, set amin to a larger value, - e.g., 1e-3. - top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - """ - super(LogMelSpectrogram, self).__init__() - - self._melspectrogram = MelSpectrogram( - sr=sr, - n_fft=n_fft, - hop_length=hop_length, - win_length=win_length, - window=window, - center=center, - pad_mode=pad_mode, - n_mels=n_mels, - f_min=f_min, - f_max=f_max, - htk=htk, - norm=norm, - dtype=dtype) - - self.ref_value = ref_value - self.amin = amin - self.top_db = top_db - - def forward(self, x): - # import ipdb; ipdb.set_trace() - mel_feature = self._melspectrogram(x) - log_mel_feature = power_to_db( - mel_feature, - ref_value=self.ref_value, - amin=self.amin, - top_db=self.top_db) - return log_mel_feature diff --git a/paddleaudio/paddleaudio/__init__.py b/paddleaudio/paddleaudio/__init__.py new file mode 100644 index 00000000..6184c1dd --- /dev/null +++ b/paddleaudio/paddleaudio/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import compliance +from . import datasets +from . import features +from . import functional +from . import io +from . import metric +from . import sox_effects +from .backends import load +from .backends import save diff --git a/paddleaudio/paddleaudio/backends/__init__.py b/paddleaudio/paddleaudio/backends/__init__.py new file mode 100644 index 00000000..8eae07e8 --- /dev/null +++ b/paddleaudio/paddleaudio/backends/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .soundfile_backend import depth_convert +from .soundfile_backend import load +from .soundfile_backend import normalize +from .soundfile_backend import resample +from .soundfile_backend import save +from .soundfile_backend import to_mono diff --git a/paddleaudio/backends/audio.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py similarity index 93% rename from paddleaudio/backends/audio.py rename to paddleaudio/paddleaudio/backends/soundfile_backend.py index 4127570e..2b920284 100644 --- a/paddleaudio/backends/audio.py +++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ __all__ = [ 'to_mono', 'depth_convert', 'normalize', - 'save_wav', + 'save', 'load', ] NORMALMIZE_TYPES = ['linear', 'gaussian'] @@ -41,12 +41,9 @@ EPS = 1e-8 def resample(y: array, src_sr: int, target_sr: int, mode: str='kaiser_fast') -> array: """ Audio resampling - This function is the same as using resampy.resample(). - Notes: The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' - """ if mode == 'kaiser_best': @@ -106,7 +103,6 @@ def to_mono(y: array, merge_type: str='average') -> array: def _safe_cast(y: array, dtype: Union[type, str]) -> array: """ data type casting in a safe way, i.e., prevent overflow or underflow - This function is used internally. """ return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) @@ -115,10 +111,8 @@ def _safe_cast(y: array, dtype: Union[type, str]) -> array: def depth_convert(y: array, dtype: Union[type, str], dithering: bool=True) -> array: """Convert audio array to target dtype safely - This function convert audio waveform to a target dtype, with addition steps of preventing overflow/underflow and preserving audio range. - """ SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] @@ -168,12 +162,9 @@ def sound_file_load(file: str, dtype: str='int16', duration: Optional[int]=None) -> Tuple[array, int]: """Load audio using soundfile library - This function load audio file using libsndfile. - Reference: http://www.mega-nerd.com/libsndfile/#Features - """ with sf.SoundFile(file) as sf_desc: sr_native = sf_desc.samplerate @@ -188,33 +179,9 @@ def sound_file_load(file: str, return y, sf_desc.samplerate -def audio_file_load(): - """Load audio using audiofile library - - This function load audio file using audiofile. - - Reference: - https://audiofile.68k.org/ - - """ - raise NotImplementedError() - - -def sox_file_load(): - """Load audio using sox library - - This function load audio file using sox. 
- - Reference: - http://sox.sourceforge.net/ - """ - raise NotImplementedError() - - def normalize(y: array, norm_type: str='linear', mul_factor: float=1.0) -> array: """ normalize an input audio with additional multiplier. - """ if norm_type == 'linear': @@ -232,14 +199,12 @@ def normalize(y: array, norm_type: str='linear', return y -def save_wav(y: array, sr: int, file: str) -> None: +def save(y: array, sr: int, file: str) -> None: """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16 unless it already is int16 - Notes: It only support raw wav format. - """ if not file.endswith('.wav'): raise ParameterError( @@ -274,11 +239,8 @@ def load( resample_mode: str='kaiser_fast') -> Tuple[array, int]: """Load audio file from disk. This function loads audio from disk using using audio beackend. - Parameters: - Notes: - """ y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) diff --git a/paddleaudio/paddleaudio/backends/sox_backend.py b/paddleaudio/paddleaudio/backends/sox_backend.py new file mode 100644 index 00000000..97043fd7 --- /dev/null +++ b/paddleaudio/paddleaudio/backends/sox_backend.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/utils/__init__.py b/paddleaudio/paddleaudio/compliance/__init__.py similarity index 67% rename from paddleaudio/utils/__init__.py rename to paddleaudio/paddleaudio/compliance/__init__.py index 1c1b4a90..97043fd7 100644 --- a/paddleaudio/utils/__init__.py +++ b/paddleaudio/paddleaudio/compliance/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the "License" +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # @@ -11,8 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .download import * -from .env import * -from .error import * -from .log import * -from .time import * diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py new file mode 100644 index 00000000..8cb9b666 --- /dev/null +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -0,0 +1,638 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
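The renamed soundfile backend keeps a small load/save pair (`save_wav` becomes `save`). A round-trip sketch; the file names are placeholders:

```python
# Round-trip sketch of the renamed backend API; file names are placeholders.
from paddleaudio import load, save   # re-exported in paddleaudio/__init__.py above

y, sr = load('input.wav')            # numpy waveform plus sample rate
save(y, sr, 'round_trip.wav')        # written as int16 PCM via scipy.io.wavfile
```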
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from torchaudio(https://github.com/pytorch/audio) +import math +from typing import Tuple + +import paddle +from paddle import Tensor + +from ..functional import create_dct +from ..functional.window import get_window + +__all__ = [ + 'spectrogram', + 'fbank', + 'mfcc', +] + +# window types +HANNING = 'hann' +HAMMING = 'hamming' +POVEY = 'povey' +RECTANGULAR = 'rect' +BLACKMAN = 'blackman' + + +def _get_epsilon(dtype): + return paddle.to_tensor(1e-07, dtype=dtype) + + +def _next_power_of_2(x: int) -> int: + return 1 if x == 0 else 2**(x - 1).bit_length() + + +def _get_strided(waveform: Tensor, + window_size: int, + window_shift: int, + snip_edges: bool) -> Tensor: + assert waveform.dim() == 1 + num_samples = waveform.shape[0] + + if snip_edges: + if num_samples < window_size: + return paddle.empty((0, 0), dtype=waveform.dtype) + else: + m = 1 + (num_samples - window_size) // window_shift + else: + reversed_waveform = paddle.flip(waveform, [0]) + m = (num_samples + (window_shift // 2)) // window_shift + pad = window_size // 2 - window_shift // 2 + pad_right = reversed_waveform + if pad > 0: + pad_left = reversed_waveform[-pad:] + waveform = paddle.concat((pad_left, waveform, pad_right), axis=0) + else: + waveform = paddle.concat((waveform[-pad:], pad_right), axis=0) + + return paddle.signal.frame(waveform, window_size, window_shift)[:, :m].T + + +def _feature_window_function( + window_type: str, + window_size: int, + blackman_coeff: float, + dtype: int, ) -> Tensor: + if window_type == HANNING: + return get_window('hann', window_size, fftbins=False, dtype=dtype) + elif window_type == HAMMING: + return get_window('hamming', window_size, fftbins=False, dtype=dtype) + elif window_type == POVEY: + return get_window( + 'hann', window_size, fftbins=False, dtype=dtype).pow(0.85) + elif window_type == RECTANGULAR: + return paddle.ones([window_size], dtype=dtype) + elif window_type == BLACKMAN: + a = 2 * math.pi / (window_size - 1) + window_function = paddle.arange(window_size, dtype=dtype) + return (blackman_coeff - 0.5 * paddle.cos(a * window_function) + + (0.5 - blackman_coeff) * paddle.cos(2 * a * window_function) + ).astype(dtype) + else: + raise Exception('Invalid window type ' + window_type) + + +def _get_log_energy(strided_input: Tensor, epsilon: Tensor, + energy_floor: float) -> Tensor: + log_energy = paddle.maximum(strided_input.pow(2).sum(1), epsilon).log() + if energy_floor == 0.0: + return log_energy + return paddle.maximum( + log_energy, + paddle.to_tensor(math.log(energy_floor), dtype=strided_input.dtype)) + + +def _get_waveform_and_window_properties( + waveform: Tensor, + channel: int, + sr: int, + frame_shift: float, + frame_length: float, + round_to_power_of_two: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, int, int, int]: + channel = max(channel, 0) + assert channel < waveform.shape[0], ( + 'Invalid channel {} for size {}'.format(channel, waveform.shape[0])) + waveform = waveform[channel, :] # size (n) + window_shift = int( + sr * frame_shift * + 0.001) # pass frame_shift and frame_length in milliseconds + window_size = int(sr * frame_length * 0.001) 
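A quick numeric check of the framing helpers above, using the defaults that recur throughout this file (sr=16000, 25 ms frames, 10 ms shift):

```python
# Numeric check of the framing math above; the values follow from the defaults
# sr=16000, frame_length=25.0 ms, frame_shift=10.0 ms.
def _next_power_of_2(x: int) -> int:
    return 1 if x == 0 else 2**(x - 1).bit_length()

window_size = int(16000 * 25.0 * 0.001)      # 400 samples per frame
window_shift = int(16000 * 10.0 * 0.001)     # 160 samples between frames
assert _next_power_of_2(window_size) == 512  # padded FFT size
```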
+ padded_window_size = _next_power_of_2( + window_size) if round_to_power_of_two else window_size + + assert 2 <= window_size <= len(waveform), ( + 'choose a window size {} that is [2, {}]'.format(window_size, + len(waveform))) + assert 0 < window_shift, '`window_shift` must be greater than 0' + assert padded_window_size % 2 == 0, 'the padded `window_size` must be divisible by two.' \ + ' use `round_to_power_of_two` or change `frame_length`' + assert 0. <= preemphasis_coefficient <= 1.0, '`preemphasis_coefficient` must be between [0,1]' + assert sr > 0, '`sr` must be greater than zero' + return waveform, window_shift, window_size, padded_window_size + + +def _get_window(waveform: Tensor, + padded_window_size: int, + window_size: int, + window_shift: int, + window_type: str, + blackman_coeff: float, + snip_edges: bool, + raw_energy: bool, + energy_floor: float, + dither: float, + remove_dc_offset: bool, + preemphasis_coefficient: float) -> Tuple[Tensor, Tensor]: + dtype = waveform.dtype + epsilon = _get_epsilon(dtype) + + # (m, window_size) + strided_input = _get_strided(waveform, window_size, window_shift, + snip_edges) + + if dither != 0.0: + x = paddle.maximum(epsilon, + paddle.rand(strided_input.shape, dtype=dtype)) + rand_gauss = paddle.sqrt(-2 * x.log()) * paddle.cos(2 * math.pi * x) + strided_input = strided_input + rand_gauss * dither + + if remove_dc_offset: + row_means = paddle.mean(strided_input, axis=1).unsqueeze(1) # (m, 1) + strided_input = strided_input - row_means + + if raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, + energy_floor) # (m) + + if preemphasis_coefficient != 0.0: + offset_strided_input = paddle.nn.functional.pad( + strided_input.unsqueeze(0), (1, 0), + data_format='NCL', + mode='replicate').squeeze(0) # (m, window_size + 1) + strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, : + -1] + + window_function = _feature_window_function( + window_type, window_size, blackman_coeff, + dtype).unsqueeze(0) # (1, window_size) + strided_input = strided_input * window_function # (m, window_size) + + # (m, padded_window_size) + if padded_window_size != window_size: + padding_right = padded_window_size - window_size + strided_input = paddle.nn.functional.pad( + strided_input.unsqueeze(0), (0, padding_right), + data_format='NCL', + mode='constant', + value=0).squeeze(0) + + if not raw_energy: + signal_log_energy = _get_log_energy(strided_input, epsilon, + energy_floor) # size (m) + + return strided_input, signal_log_energy + + +def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor: + if subtract_mean: + col_means = paddle.mean(tensor, axis=0).unsqueeze(0) + tensor = tensor - col_means + return tensor + + +def spectrogram(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + window_type: str=POVEY) -> Tensor: + """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. + + Args: + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. 
+ energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + + Returns: + Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames + depends on frame_length and frame_shift. + """ + dtype = waveform.dtype + epsilon = _get_epsilon(dtype) + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) + + strided_input, signal_log_energy = _get_window( + waveform, padded_window_size, window_size, window_shift, window_type, + blackman_coeff, snip_edges, raw_energy, energy_floor, dither, + remove_dc_offset, preemphasis_coefficient) + + # (m, padded_window_size // 2 + 1, 2) + fft = paddle.fft.rfft(strided_input) + + power_spectrum = paddle.maximum( + fft.abs().pow(2.), epsilon).log() # (m, padded_window_size // 2 + 1) + power_spectrum[:, 0] = signal_log_energy + + power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean) + return power_spectrum + + +def _inverse_mel_scale_scalar(mel_freq: float) -> float: + return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0) + + +def _inverse_mel_scale(mel_freq: Tensor) -> Tensor: + return 700.0 * ((mel_freq / 1127.0).exp() - 1.0) + + +def _mel_scale_scalar(freq: float) -> float: + return 1127.0 * math.log(1.0 + freq / 700.0) + + +def _mel_scale(freq: Tensor) -> Tensor: + return 1127.0 * (1.0 + freq / 700.0).log() + + +def _vtln_warp_freq(vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq: float, + high_freq: float, + vtln_warp_factor: float, + freq: Tensor) -> Tensor: + assert vtln_low_cutoff > low_freq, 'be sure to set the vtln_low option higher than low_freq' + assert vtln_high_cutoff < high_freq, 'be sure to set the vtln_high option lower than high_freq [or negative]' + l = vtln_low_cutoff * max(1.0, vtln_warp_factor) + h = vtln_high_cutoff * min(1.0, vtln_warp_factor) + scale = 1.0 / vtln_warp_factor + Fl = scale * l + Fh = scale * h + assert l > low_freq and h < high_freq + scale_left = (Fl - low_freq) / (l - low_freq) + scale_right = (high_freq - Fh) / (high_freq - h) + res = paddle.empty_like(freq) + + outside_low_high_freq = paddle.less_than(freq, paddle.to_tensor(low_freq)) \ + | paddle.greater_than(freq, paddle.to_tensor(high_freq)) + before_l = paddle.less_than(freq, paddle.to_tensor(l)) + before_h = paddle.less_than(freq, paddle.to_tensor(h)) + 
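A minimal usage sketch of the Kaldi-compatible `spectrogram` defined above; the import path assumes the new `paddleaudio/paddleaudio` tree installs as the `paddleaudio` package:

```python
# Minimal sketch; the module path is assumed from the new package layout.
import paddle
import paddleaudio.compliance.kaldi as kaldi

wav = paddle.randn([1, 16000])            # (channels, samples): 1 s at 16 kHz
spec = kaldi.spectrogram(wav, sr=16000)   # defaults: 25 ms frames, 10 ms shift
print(spec.shape)                         # expected (98, 512 // 2 + 1)
```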
after_h = paddle.greater_equal(freq, paddle.to_tensor(h)) + + res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq) + res[before_h] = scale * freq[before_h] + res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq) + res[outside_low_high_freq] = freq[outside_low_high_freq] + + return res + + +def _vtln_warp_mel_freq(vtln_low_cutoff: float, + vtln_high_cutoff: float, + low_freq, + high_freq: float, + vtln_warp_factor: float, + mel_freq: Tensor) -> Tensor: + return _mel_scale( + _vtln_warp_freq(vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, + vtln_warp_factor, _inverse_mel_scale(mel_freq))) + + +def _get_mel_banks(num_bins: int, + window_length_padded: int, + sample_freq: float, + low_freq: float, + high_freq: float, + vtln_low: float, + vtln_high: float, + vtln_warp_factor: float) -> Tuple[Tensor, Tensor]: + assert num_bins > 3, 'Must have at least 3 mel bins' + assert window_length_padded % 2 == 0 + num_fft_bins = window_length_padded / 2 + nyquist = 0.5 * sample_freq + + if high_freq <= 0.0: + high_freq += nyquist + + assert (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq), \ + ('Bad values in options: low-freq {} and high-freq {} vs. nyquist {}'.format(low_freq, high_freq, nyquist)) + + fft_bin_width = sample_freq / window_length_padded + mel_low_freq = _mel_scale_scalar(low_freq) + mel_high_freq = _mel_scale_scalar(high_freq) + + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + if vtln_high < 0.0: + vtln_high += nyquist + + assert vtln_warp_factor == 1.0 or ((low_freq < vtln_low < high_freq) and + (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)), \ + ('Bad values in options: vtln-low {} and vtln-high {}, versus ' + 'low-freq {} and high-freq {}'.format(vtln_low, vtln_high, low_freq, high_freq)) + + bin = paddle.arange(num_bins).unsqueeze(1) + left_mel = mel_low_freq + bin * mel_freq_delta # (num_bins, 1) + center_mel = mel_low_freq + (bin + 1.0) * mel_freq_delta # (num_bins, 1) + right_mel = mel_low_freq + (bin + 2.0) * mel_freq_delta # (num_bins, 1) + + if vtln_warp_factor != 1.0: + left_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel) + center_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, + center_mel) + right_mel = _vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, + high_freq, vtln_warp_factor, right_mel) + + center_freqs = _inverse_mel_scale(center_mel) # (num_bins) + # (1, num_fft_bins) + mel = _mel_scale(fft_bin_width * paddle.arange(num_fft_bins)).unsqueeze(0) + + # (num_bins, num_fft_bins) + up_slope = (mel - left_mel) / (center_mel - left_mel) + down_slope = (right_mel - mel) / (right_mel - center_mel) + + if vtln_warp_factor == 1.0: + bins = paddle.maximum( + paddle.zeros([1]), paddle.minimum(up_slope, down_slope)) + else: + bins = paddle.zeros_like(up_slope) + up_idx = paddle.greater_than(mel, left_mel) & paddle.less_than( + mel, center_mel) + down_idx = paddle.greater_than(mel, center_mel) & paddle.less_than( + mel, right_mel) + bins[up_idx] = up_slope[up_idx] + bins[down_idx] = down_slope[down_idx] + + return bins, center_freqs + + +def fbank(waveform: Tensor, + blackman_coeff: float=0.42, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + n_mels: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: 
bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + use_log_fbank: bool=True, + use_power: bool=True, + vtln_high: float=-500.0, + vtln_low: float=100.0, + vtln_warp: float=1.0, + window_type: str=POVEY) -> Tensor: + """Compute and return filter banks from a waveform. The output is identical to Kaldi's. + + Args: + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. + round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + use_log_fbank (bool, optional): Return log fbank when it is set True. Defaults to True. + use_power (bool, optional): Whether to use power instead of magnitude. Defaults to True. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + + Returns: + Tensor: A filter banks tensor with shape (m, n_mels). + """ + dtype = waveform.dtype + + waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties( + waveform, channel, sr, frame_shift, frame_length, round_to_power_of_two, + preemphasis_coefficient) + + strided_input, signal_log_energy = _get_window( + waveform, padded_window_size, window_size, window_shift, window_type, + blackman_coeff, snip_edges, raw_energy, energy_floor, dither, + remove_dc_offset, preemphasis_coefficient) + + # (m, padded_window_size // 2 + 1) + spectrum = paddle.fft.rfft(strided_input).abs() + if use_power: + spectrum = spectrum.pow(2.) 
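For reference, each row of the bank returned by `_get_mel_banks` (with `vtln_warp_factor == 1.0`) is a triangular filter on the Kaldi mel scale $m(f) = 1127 \ln(1 + f/700)$:

$$
w_i(f) = \max\!\left(0,\ \min\!\left(\frac{m(f) - m_i^{\mathrm{left}}}{m_i^{\mathrm{center}} - m_i^{\mathrm{left}}},\ \frac{m_i^{\mathrm{right}} - m(f)}{m_i^{\mathrm{right}} - m_i^{\mathrm{center}}}\right)\right)
$$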
+ + # (n_mels, padded_window_size // 2) + mel_energies, _ = _get_mel_banks(n_mels, padded_window_size, sr, low_freq, + high_freq, vtln_low, vtln_high, vtln_warp) + mel_energies = mel_energies.astype(dtype) + + # (n_mels, padded_window_size // 2 + 1) + mel_energies = paddle.nn.functional.pad( + mel_energies.unsqueeze(0), (0, 1), + data_format='NCL', + mode='constant', + value=0).squeeze(0) + + # (m, n_mels) + mel_energies = paddle.mm(spectrum, mel_energies.T) + if use_log_fbank: + mel_energies = paddle.maximum(mel_energies, _get_epsilon(dtype)).log() + + if use_energy: + signal_log_energy = signal_log_energy.unsqueeze(1) + if htk_compat: + mel_energies = paddle.concat( + (mel_energies, signal_log_energy), axis=1) + else: + mel_energies = paddle.concat( + (signal_log_energy, mel_energies), axis=1) + + # (m, n_mels + 1) + mel_energies = _subtract_column_mean(mel_energies, subtract_mean) + return mel_energies + + +def _get_dct_matrix(n_mfcc: int, n_mels: int) -> Tensor: + dct_matrix = create_dct(n_mels, n_mels, 'ortho') + dct_matrix[:, 0] = math.sqrt(1 / float(n_mels)) + dct_matrix = dct_matrix[:, :n_mfcc] # (n_mels, n_mfcc) + return dct_matrix + + +def _get_lifter_coeffs(n_mfcc: int, cepstral_lifter: float) -> Tensor: + i = paddle.arange(n_mfcc) + return 1.0 + 0.5 * cepstral_lifter * paddle.sin(math.pi * i / + cepstral_lifter) + + +def mfcc(waveform: Tensor, + blackman_coeff: float=0.42, + cepstral_lifter: float=22.0, + channel: int=-1, + dither: float=0.0, + energy_floor: float=1.0, + frame_length: float=25.0, + frame_shift: float=10.0, + high_freq: float=0.0, + htk_compat: bool=False, + low_freq: float=20.0, + n_mfcc: int=13, + n_mels: int=23, + preemphasis_coefficient: float=0.97, + raw_energy: bool=True, + remove_dc_offset: bool=True, + round_to_power_of_two: bool=True, + sr: int=16000, + snip_edges: bool=True, + subtract_mean: bool=False, + use_energy: bool=False, + vtln_high: float=-500.0, + vtln_low: float=100.0, + vtln_warp: float=1.0, + window_type: str=POVEY) -> Tensor: + """Compute and return mel frequency cepstral coefficients from a waveform. The output is + identical to Kaldi's. + + Args: + waveform (Tensor): A waveform tensor with shape [C, T]. + blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. + cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. + channel (int, optional): Select the channel of waveform. Defaults to -1. + dither (float, optional): Dithering constant . Defaults to 0.0. + energy_floor (float, optional): Floor on energy of the output Spectrogram. Defaults to 1.0. + frame_length (float, optional): Frame length in milliseconds. Defaults to 25.0. + frame_shift (float, optional): Shift between adjacent frames in milliseconds. Defaults to 10.0. + high_freq (float, optional): The upper cut-off frequency. Defaults to 0.0. + htk_compat (bool, optional): Put energy to the last when it is set True. Defaults to False. + low_freq (float, optional): The lower cut-off frequency. Defaults to 20.0. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 13. + n_mels (int, optional): Number of output mel bins. Defaults to 23. + preemphasis_coefficient (float, optional): Preemphasis coefficient for input waveform. Defaults to 0.97. + raw_energy (bool, optional): Whether to compute before preemphasis and windowing. Defaults to True. + remove_dc_offset (bool, optional): Whether to subtract mean from waveform on frames. Defaults to True. 
+ round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input + to FFT. Defaults to True. + sr (int, optional): Sample rate of input waveform. Defaults to 16000. + snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a singal frame when it + is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True. + subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False. + use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False. + vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function. Defaults to -500.0. + vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function. Defaults to 100.0. + vtln_warp (float, optional): Vtln warp factor. Defaults to 1.0. + window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. + + Returns: + Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). + """ + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) + + dtype = waveform.dtype + + # (m, n_mels + use_energy) + feature = fbank( + waveform=waveform, + blackman_coeff=blackman_coeff, + channel=channel, + dither=dither, + energy_floor=energy_floor, + frame_length=frame_length, + frame_shift=frame_shift, + high_freq=high_freq, + htk_compat=htk_compat, + low_freq=low_freq, + n_mels=n_mels, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + round_to_power_of_two=round_to_power_of_two, + sr=sr, + snip_edges=snip_edges, + subtract_mean=False, + use_energy=use_energy, + use_log_fbank=True, + use_power=True, + vtln_high=vtln_high, + vtln_low=vtln_low, + vtln_warp=vtln_warp, + window_type=window_type) + + if use_energy: + # (m) + signal_log_energy = feature[:, n_mels if htk_compat else 0] + mel_offset = int(not htk_compat) + feature = feature[:, mel_offset:(n_mels + mel_offset)] + + # (n_mels, n_mfcc) + dct_matrix = _get_dct_matrix(n_mfcc, n_mels).astype(dtype=dtype) + + # (m, n_mfcc) + feature = feature.matmul(dct_matrix) + + if cepstral_lifter != 0.0: + # (1, n_mfcc) + lifter_coeffs = _get_lifter_coeffs(n_mfcc, cepstral_lifter).unsqueeze(0) + feature *= lifter_coeffs.astype(dtype=dtype) + + if use_energy: + feature[:, 0] = signal_log_energy + + if htk_compat: + energy = feature[:, 0].unsqueeze(1) # (m, 1) + feature = feature[:, 1:] # (m, n_mfcc - 1) + if not use_energy: + energy *= math.sqrt(2) + + feature = paddle.concat((feature, energy), axis=1) + + feature = _subtract_column_mean(feature, subtract_mean) + return feature diff --git a/paddleaudio/features/core.py b/paddleaudio/paddleaudio/compliance/librosa.py similarity index 79% rename from paddleaudio/features/core.py rename to paddleaudio/paddleaudio/compliance/librosa.py index 01925ec6..167795c3 100644 --- a/paddleaudio/features/core.py +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -21,11 +21,13 @@ import numpy as np import scipy from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided -from scipy.signal import get_window +from scipy import signal +from ..backends import depth_convert from ..utils import ParameterError __all__ = [ + # dsp 'stft', 'mfcc', 'hz_to_mel', @@ -38,6 +40,12 @@ __all__ = [ 'spectrogram', 'mu_encode', 'mu_decode', + # augmentation + 'depth_augment', + 'spect_augment', + 'random_crop1d', + 
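A minimal sketch of the two feature entry points above, using the Kaldi-style defaults (import path assumed as in the earlier `spectrogram` example):

```python
# Minimal sketch; the import path is assumed from the new package layout.
import paddle
import paddleaudio.compliance.kaldi as kaldi

wav = paddle.randn([1, 16000])                           # (channels, samples)
fb = kaldi.fbank(wav, n_mels=23, sr=16000)               # log-mel energies, (m, 23)
feat = kaldi.mfcc(wav, n_mfcc=13, n_mels=23, sr=16000)   # cepstra, (m, 13)
```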
'random_crop2d', + 'adaptive_spect_augment', ] @@ -303,7 +311,7 @@ def stft(x: array, if hop_length is None: hop_length = int(win_length // 4) - fft_window = get_window(window, win_length, fftbins=True) + fft_window = signal.get_window(window, win_length, fftbins=True) # Pad the window out to n_fft size fft_window = pad_center(fft_window, n_fft) @@ -576,3 +584,145 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: y = y * 2 / mu - 1 x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1) return x + + +def randint(high: int) -> int: + """Generate one random integer in range [0 high) + + This is a helper function for random data augmentaiton + """ + return int(np.random.randint(0, high=high)) + + +def rand() -> float: + """Generate one floating-point number in range [0 1) + + This is a helper function for random data augmentaiton + """ + return float(np.random.rand(1)) + + +def depth_augment(y: array, + choices: List=['int8', 'int16'], + probs: List[float]=[0.5, 0.5]) -> array: + """ Audio depth augmentation + + Do audio depth augmentation to simulate the distortion brought by quantization. + """ + assert len(probs) == len( + choices + ), 'number of choices {} must be equal to size of probs {}'.format( + len(choices), len(probs)) + depth = np.random.choice(choices, p=probs) + src_depth = y.dtype + y1 = depth_convert(y, depth) + y2 = depth_convert(y1, src_depth) + + return y2 + + +def adaptive_spect_augment(spect: array, tempo_axis: int=0, + level: float=0.1) -> array: + """Do adpative spectrogram augmentation + + The level of the augmentation is gowern by the paramter level, + ranging from 0 to 1, with 0 represents no augmentation。 + + """ + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + time_mask_width = int(nt * level * 0.5) + freq_mask_width = int(nf * level * 0.5) + + num_time_mask = int(10 * level) + num_freq_mask = int(10 * level) + + if tempo_axis == 0: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def spect_augment(spect: array, + tempo_axis: int=0, + max_time_mask: int=3, + max_freq_mask: int=3, + max_time_mask_width: int=30, + max_freq_mask_width: int=20) -> array: + """Do spectrogram augmentation in both time and freq axis + + Reference: + + """ + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + num_time_mask = randint(max_time_mask) + num_freq_mask = randint(max_freq_mask) + + time_mask_width = randint(max_time_mask_width) + freq_mask_width = randint(max_freq_mask_width) + + if tempo_axis == 0: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for _ in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for _ in range(num_freq_mask): + start = randint(nf - freq_mask_width) 
+
+
+def random_crop1d(y: array, crop_len: int) -> array:
+    """Do random cropping on 1d input signal
+
+    The input is a 1d signal, typically a sound waveform
+    """
+    if y.ndim != 1:
+        raise ParameterError('only accept 1d tensor or numpy array')
+    n = len(y)
+    idx = randint(n - crop_len)
+    return y[idx:idx + crop_len]
+
+
+def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
+    """Do random cropping for 2D array, typically a spectrogram.
+
+    The cropping is done in temporal direction on the time-freq input signal.
+    """
+    if tempo_axis >= s.ndim:
+        raise ParameterError('axis out of range')
+
+    n = s.shape[tempo_axis]
+    idx = randint(high=n - crop_len)
+    sli = [slice(None) for i in range(s.ndim)]
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    out = s[tuple(sli)]
+    return out
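The two cropping helpers complete the eager-mode augmentation set; a quick sketch, under the same import-path assumption as above:

```python
import numpy as np

# Assumed import path after this refactor, as above.
from paddleaudio.compliance.librosa import random_crop1d, random_crop2d

wav = np.random.uniform(-1.0, 1.0, 16000).astype('float32')
excerpt = random_crop1d(wav, crop_len=8000)        # random 0.5 s excerpt at 16 kHz

spect = np.random.rand(101, 64).astype('float32')  # (time, freq)
patch = random_crop2d(spect, crop_len=50, tempo_axis=0)
print(excerpt.shape, patch.shape)                  # (8000,) (50, 64)
```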
diff --git a/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py
similarity index 90%
rename from paddleaudio/datasets/__init__.py
rename to paddleaudio/paddleaudio/datasets/__init__.py
index 8d2fdab4..5c5f0369 100644
--- a/paddleaudio/datasets/__init__.py
+++ b/paddleaudio/paddleaudio/datasets/__init__.py
@@ -15,10 +15,3 @@ from .esc50 import ESC50
 from .gtzan import GTZAN
 from .tess import TESS
 from .urban_sound import UrbanSound8K
-
-__all__ = [
-    'ESC50',
-    'UrbanSound8K',
-    'GTZAN',
-    'TESS',
-]
diff --git a/paddleaudio/datasets/dataset.py b/paddleaudio/paddleaudio/datasets/dataset.py
similarity index 96%
rename from paddleaudio/datasets/dataset.py
rename to paddleaudio/paddleaudio/datasets/dataset.py
index 7a57fd6c..06e2df6d 100644
--- a/paddleaudio/datasets/dataset.py
+++ b/paddleaudio/paddleaudio/datasets/dataset.py
@@ -17,8 +17,8 @@ import numpy as np
 import paddle
 
 from ..backends import load as load_audio
-from ..features import melspectrogram
-from ..features import mfcc
+from ..compliance.librosa import melspectrogram
+from ..compliance.librosa import mfcc
 
 feat_funcs = {
     'raw': None,
diff --git a/paddleaudio/datasets/esc50.py b/paddleaudio/paddleaudio/datasets/esc50.py
similarity index 100%
rename from paddleaudio/datasets/esc50.py
rename to paddleaudio/paddleaudio/datasets/esc50.py
diff --git a/paddleaudio/datasets/gtzan.py b/paddleaudio/paddleaudio/datasets/gtzan.py
similarity index 100%
rename from paddleaudio/datasets/gtzan.py
rename to paddleaudio/paddleaudio/datasets/gtzan.py
diff --git a/paddleaudio/datasets/tess.py b/paddleaudio/paddleaudio/datasets/tess.py
similarity index 100%
rename from paddleaudio/datasets/tess.py
rename to paddleaudio/paddleaudio/datasets/tess.py
diff --git a/paddleaudio/datasets/urban_sound.py b/paddleaudio/paddleaudio/datasets/urban_sound.py
similarity index 100%
rename from paddleaudio/datasets/urban_sound.py
rename to paddleaudio/paddleaudio/datasets/urban_sound.py
diff --git a/paddleaudio/features/__init__.py b/paddleaudio/paddleaudio/features/__init__.py
similarity index 82%
rename from paddleaudio/features/__init__.py
rename to paddleaudio/paddleaudio/features/__init__.py
index d8ac7c4b..00781397 100644
--- a/paddleaudio/features/__init__.py
+++ b/paddleaudio/paddleaudio/features/__init__.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .augment import *
-from .core import *
-from .spectrum import *
+from .layers import LogMelSpectrogram
+from .layers import MelSpectrogram
+from .layers import MFCC
+from .layers import Spectrogram
diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py
new file mode 100644
index 00000000..4a2c1673
--- /dev/null
+++ b/paddleaudio/paddleaudio/features/layers.py
@@ -0,0 +1,344 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+from typing import Optional
+from typing import Union
+
+import paddle
+import paddle.nn as nn
+
+from ..functional import compute_fbank_matrix
+from ..functional import create_dct
+from ..functional import power_to_db
+from ..functional.window import get_window
+
+__all__ = [
+    'Spectrogram',
+    'MelSpectrogram',
+    'LogMelSpectrogram',
+    'MFCC',
+]
+
+
+class Spectrogram(nn.Layer):
+    def __init__(self,
+                 n_fft: int=512,
+                 hop_length: Optional[int]=None,
+                 win_length: Optional[int]=None,
+                 window: str='hann',
+                 center: bool=True,
+                 pad_mode: str='reflect',
+                 dtype: str=paddle.float32):
+        """Compute the spectrogram of a given signal, typically an audio waveform.
+        The spectrogram is defined as the complex norm of the short-time
+        Fourier transform.
+        Parameters:
+            n_fft (int): the number of frequency components of the discrete Fourier transform.
+                The default value is 512.
+            hop_length (int|None): the hop length of the short-time FFT. If None, it is set to win_length//4.
+                The default value is None.
+            win_length (int|None): the window length of the short-time FFT. If None, it is set to the same as n_fft.
+                The default value is None.
+            window (str): the name of the window function applied to the signal before the Fourier transform.
+                The following window names are supported: 'hamming', 'hann', 'kaiser', 'gaussian',
+                'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'.
+                The default value is 'hann'.
+            center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
+                If False, frame t begins at x[t * hop_length].
+                The default value is True.
+            pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
+                and 'constant'. The default value is 'reflect'.
+            dtype (str): the data type of input and window.
+        Notes:
+            The Spectrogram transform relies on the STFT transform to compute the spectrogram.
+            By default, the weights are not learnable. To fine-tune the Fourier coefficients,
+            set stop_gradient=False before training.
+            For more information, see STFT().
+        """
+        super(Spectrogram, self).__init__()
+
+        if win_length is None:
+            win_length = n_fft
+
+        fft_window = get_window(window, win_length, fftbins=True, dtype=dtype)
+        self.register_buffer('fft_window', fft_window)
+        self._stft = partial(
+            paddle.signal.stft,
+            n_fft=n_fft,
+            hop_length=hop_length,
+            win_length=win_length,
+            window=self.fft_window,
+            center=center,
+            pad_mode=pad_mode)
+
+    def forward(self, x):
+        stft = self._stft(x)
+        spectrogram = paddle.square(paddle.abs(stft))
+        return spectrogram
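For orientation, a minimal sketch of the new layer, assuming the package installs as `paddleaudio` under this layout: `Spectrogram` wraps `paddle.signal.stft` and returns the squared magnitude, so a 1-second batch at 16 kHz with `hop_length=160` yields `1 + n_fft//2` frequency rows and 101 centered frames.

```python
import paddle

from paddleaudio.features import Spectrogram  # import path assumed

paddle.seed(0)
waveform = paddle.randn([2, 16000])   # (batch, samples): 1 s at 16 kHz
feature_fn = Spectrogram(n_fft=512, hop_length=160, window='hann')
spec = feature_fn(waveform)           # power spectrogram
print(spec.shape)                     # [2, 257, 101]: (batch, 1 + n_fft//2, frames)
```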
+ """ + super(Spectrogram, self).__init__() + + if win_length is None: + win_length = n_fft + + self.fft_window = get_window( + window, win_length, fftbins=True, dtype=dtype) + self._stft = partial( + paddle.signal.stft, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=self.fft_window, + center=center, + pad_mode=pad_mode) + self.register_buffer('fft_window', self.fft_window) + + def forward(self, x): + stft = self._stft(x) + spectrogram = paddle.square(paddle.abs(stft)) + return spectrogram + + +class MelSpectrogram(nn.Layer): + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute the melspectrogram of a given signal, typically an audio waveform. + The melspectrogram is also known as filterbank or fbank feature in audio community. + It is computed by multiplying spectrogram with Mel filter bank matrix. + Parameters: + sr(int): the audio sample rate. + The default value is 22050. + n_fft(int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window(str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels(int): the mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zeros. + htk(bool): whether to use HTK formula in computing fbank matrix. + norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
+ """ + super(MelSpectrogram, self).__init__() + + self._spectrogram = Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + dtype=dtype) + self.n_mels = n_mels + self.f_min = f_min + self.f_max = f_max + self.htk = htk + self.norm = norm + if f_max is None: + f_max = sr // 2 + self.fbank_matrix = compute_fbank_matrix( + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) # float64 for better numerical results + self.register_buffer('fbank_matrix', self.fbank_matrix) + + def forward(self, x): + spect_feature = self._spectrogram(x) + mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) + return mel_feature + + +class LogMelSpectrogram(nn.Layer): + def __init__(self, + sr: int=22050, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None, + dtype: str=paddle.float32): + """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, + typically an audio waveform. + Parameters: + sr (int): the audio sample rate. + The default value is 22050. + n_fft (int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window (str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels (int): the mel bins. + f_min (float): the lower cut-off frequency, below which the filter response is zero. + f_max (float): the upper cut-off frequency, above which the filter response is zeros. + htk (bool): whether to use HTK formula in computing fbank matrix. + norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + ref_value (float): the reference value. If smaller than 1.0, the db level + amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. + Otherwise, the db level is pushed down. + magnitude is clipped(to amin). For numerical stability, set amin to a larger value, + e.g., 1e-3. + top_db (float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
+ """ + super(LogMelSpectrogram, self).__init__() + + self._melspectrogram = MelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) + + self.ref_value = ref_value + self.amin = amin + self.top_db = top_db + + def forward(self, x): + # import ipdb; ipdb.set_trace() + mel_feature = self._melspectrogram(x) + log_mel_feature = power_to_db( + mel_feature, + ref_value=self.ref_value, + amin=self.amin, + top_db=self.top_db) + return log_mel_feature + + +class MFCC(nn.Layer): + def __init__(self, + sr: int=22050, + n_mfcc: int=40, + n_fft: int=512, + hop_length: Optional[int]=None, + win_length: Optional[int]=None, + window: str='hann', + center: bool=True, + pad_mode: str='reflect', + n_mels: int=64, + f_min: float=50.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + ref_value: float=1.0, + amin: float=1e-10, + top_db: Optional[float]=None, + dtype: str=paddle.float32): + """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. + + Parameters: + sr(int): the audio sample rate. + The default value is 22050. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. + n_fft (int): the number of frequency components of the discrete Fourier transform. + The default value is 2048, + hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. + The default value is None. + win_length: the window length of the short time FFt. If None, it is set to same as n_fft. + The default value is None. + window (str): the name of the window function applied to the single before the Fourier transform. + The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', + 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. + The default value is 'hann' + center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. + If False, frame t begins at x[t * hop_length] + The default value is True + pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' + and 'constant'. + The default value is 'reflect'. + n_mels (int): the mel bins. + f_min (float): the lower cut-off frequency, below which the filter response is zero. + f_max (float): the upper cut-off frequency, above which the filter response is zeros. + htk (bool): whether to use HTK formula in computing fbank matrix. + norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. + You can specify norm=1.0/2.0 to use customized p-norm normalization. + ref_value (float): the reference value. If smaller than 1.0, the db level + amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. + Otherwise, the db level is pushed down. + magnitude is clipped(to amin). For numerical stability, set amin to a larger value, + e.g., 1e-3. + top_db (float): the maximum db value of resulting spectrum, above which the + spectrum is clipped(to top_db). + dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical + accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
+ """ + super(MFCC, self).__init__() + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) + self._log_melspectrogram = LogMelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + ref_value=ref_value, + amin=amin, + top_db=top_db, + dtype=dtype) + self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) + self.register_buffer('dct_matrix', self.dct_matrix) + + def forward(self, x): + log_mel_feature = self._log_melspectrogram(x) + mfcc = paddle.matmul( + log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( + (0, 2, 1)) # (B, n_mels, L) + return mfcc diff --git a/paddleaudio/paddleaudio/functional/__init__.py b/paddleaudio/paddleaudio/functional/__init__.py new file mode 100644 index 00000000..c85232df --- /dev/null +++ b/paddleaudio/paddleaudio/functional/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .functional import compute_fbank_matrix +from .functional import create_dct +from .functional import fft_frequencies +from .functional import hz_to_mel +from .functional import mel_frequencies +from .functional import mel_to_hz +from .functional import power_to_db diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py new file mode 100644 index 00000000..c5ab3045 --- /dev/null +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -0,0 +1,265 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from librosa(https://github.com/librosa/librosa) +import math +from typing import Optional +from typing import Union + +import paddle + +__all__ = [ + 'hz_to_mel', + 'mel_to_hz', + 'mel_frequencies', + 'fft_frequencies', + 'compute_fbank_matrix', + 'power_to_db', + 'create_dct', +] + + +def hz_to_mel(freq: Union[paddle.Tensor, float], + htk: bool=False) -> Union[paddle.Tensor, float]: + """Convert Hz to Mels. + Parameters: + freq: the input tensor of arbitrary shape, or a single floating point number. + htk: use HTK formula to do the conversion. + The default value is False. + Returns: + The frequencies represented in Mel-scale. 
+ """ + + if htk: + if isinstance(freq, paddle.Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, paddle.Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, paddle.Tensor], + htk: bool=False) -> Union[float, paddle.Tensor]: + """Convert mel bin numbers to frequencies. + Parameters: + mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. + htk: use HTK formula to do the conversion. + Returns: + The frequencies represented in hz. + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, paddle.Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int=64, + f_min: float=0.0, + f_max: float=11025.0, + htk: bool=False, + dtype: str=paddle.float32): + """Compute mel frequencies. + Parameters: + n_mels(int): number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. + f_max(float): the upper cut-off frequency, above which the filter response is zero. + htk(bool): whether to use htk formula. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in Mel-scale + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): + """Compute fourier frequencies. + Parameters: + sr(int): the audio sample rate. + n_fft(float): the number of fft bins. + dtype(str): the datatype of the return frequencies. + Returns: + The frequencies represented in hz. + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int=64, + f_min: float=0.0, + f_max: Optional[float]=None, + htk: bool=False, + norm: Union[str, float]='slaney', + dtype: str=paddle.float32): + """Compute fbank matrix. + Parameters: + sr(int): the audio sample rate. + n_fft(int): the number of fft bins. + n_mels(int): the number of Mel bins. + f_min(float): the lower cut-off frequency, below which the filter response is zero. 
+        f_max(float): the upper cut-off frequency, above which the filter response is zero.
+        htk(bool): whether to use the HTK formula.
+        norm(str|float): the normalization type in computing the fbank matrix.
+        dtype(str): the datatype of the returned fbank matrix.
+    Returns:
+        The fbank matrix of shape (n_mels, int(1+n_fft//2)).
+    Shape:
+        output: (n_mels, int(1+n_fft//2))
+    """
+
+    if f_max is None:
+        f_max = float(sr) / 2
+
+    # Initialize the weights
+    weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+    # Center freqs of each FFT bin
+    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype)
+
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    mel_f = mel_frequencies(
+        n_mels + 2, f_min=f_min, f_max=f_max, htk=htk, dtype=dtype)
+
+    fdiff = mel_f[1:] - mel_f[:-1]  # np.diff(mel_f)
+    ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0)
+    # ramps = np.subtract.outer(mel_f, fftfreqs)
+
+    for i in range(n_mels):
+        # lower and upper slopes for all bins
+        lower = -ramps[i] / fdiff[i]
+        upper = ramps[i + 2] / fdiff[i + 1]
+
+        # .. then intersect them with each other and zero
+        weights[i] = paddle.maximum(
+            paddle.zeros_like(lower), paddle.minimum(lower, upper))
+
+    # Slaney-style mel is scaled to be approx constant energy per channel
+    if norm == 'slaney':
+        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
+        weights *= enorm.unsqueeze(1)
+    elif isinstance(norm, int) or isinstance(norm, float):
+        weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1)
+
+    return weights
+
+
+def power_to_db(magnitude: paddle.Tensor,
+                ref_value: float=1.0,
+                amin: float=1e-10,
+                top_db: Optional[float]=None) -> paddle.Tensor:
+    """Convert a power spectrogram (amplitude squared) to decibel (dB) units.
+    The function computes the scaling ``10 * log10(x / ref)`` in a numerically
+    stable way.
+    Parameters:
+        magnitude(Tensor): the input magnitude tensor of any shape.
+        ref_value(float): the reference value. If smaller than 1.0, the db level
+            of the signal will be pulled up accordingly. Otherwise, the db level
+            is pushed down.
+        amin(float): the minimum value of input magnitude, below which the input
+            magnitude is clipped (to amin).
+        top_db(float): the maximum db value of the resulting spectrum, above which the
+            spectrum is clipped (to top_db).
+    Returns:
+        The spectrogram in log-scale.
+    Shape:
+        input: any shape
+        output: same as input
+    """
+    if amin <= 0:
+        raise Exception("amin must be strictly positive")
+
+    if ref_value <= 0:
+        raise Exception("ref_value must be strictly positive")
+
+    ones = paddle.ones_like(magnitude)
+    log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude))
+    log_spec -= 10.0 * math.log10(max(ref_value, amin))
+
+    if top_db is not None:
+        if top_db < 0:
+            raise Exception("top_db must be non-negative")
+        log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db))
+
+    return log_spec
+
+
+def create_dct(n_mfcc: int,
+               n_mels: int,
+               norm: Optional[str]='ortho',
+               dtype: Optional[str]=paddle.float32) -> paddle.Tensor:
+    """Create a discrete cosine transform (DCT) matrix.
+
+    Parameters:
+        n_mfcc (int): the number of mel-frequency cepstral coefficients.
+        n_mels (int): the number of mel filterbanks.
+        norm (str, optional): the normalization type. Defaults to 'ortho'.
+        dtype (str, optional): the datatype of the returned matrix.
+    Returns:
+        Tensor: The DCT matrix with shape (n_mels, n_mfcc).
+    """
+    n = paddle.arange(n_mels, dtype=dtype)
+    k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1)
+    dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) *
+                     k)  # size (n_mfcc, n_mels)
+    if norm is None:
+        dct *= 2.0
+    else:
+        assert norm == "ortho"
+        dct[0] *= 1.0 / math.sqrt(2.0)
+        dct *= math.sqrt(2.0 / float(n_mels))
+    return dct.T
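These functional primitives are exactly what the layers above compose; a sketch of using them directly, assuming the `paddleaudio.functional` import path:

```python
import paddle

from paddleaudio.functional import compute_fbank_matrix, power_to_db  # path assumed

fbank = compute_fbank_matrix(sr=16000, n_fft=512, n_mels=64)
print(fbank.shape)                       # [64, 257]: (n_mels, 1 + n_fft//2)

power_spec = paddle.rand([257, 101])     # stand-in power spectrogram (freq, frames)
mel = paddle.matmul(fbank, power_spec)   # [64, 101] mel-band energies
log_mel = power_to_db(mel, top_db=80.0)  # same shape, dB scale
```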
+ """ + n = paddle.arange(n_mels, dtype=dtype) + k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) + dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * + k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 + else: + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.T diff --git a/paddleaudio/features/window.py b/paddleaudio/paddleaudio/functional/window.py similarity index 98% rename from paddleaudio/features/window.py rename to paddleaudio/paddleaudio/functional/window.py index 629989fc..f321b38e 100644 --- a/paddleaudio/features/window.py +++ b/paddleaudio/paddleaudio/functional/window.py @@ -20,6 +20,19 @@ from paddle import Tensor __all__ = [ 'get_window', + + # windows + 'taylor', + 'hamming', + 'hann', + 'tukey', + 'kaiser', + 'gaussian', + 'exponential', + 'triang', + 'bohman', + 'blackman', + 'cosine', ] @@ -73,6 +86,21 @@ def general_gaussian(M: int, p, sig, sym: bool=True, return _truncate(w, needs_trunc) +def general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M, ), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M, ), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + def general_hamming(M: int, alpha: float, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a generalized Hamming window. @@ -143,21 +171,6 @@ def taylor(M: int, return _truncate(w, needs_trunc) -def general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute a generic weighted sum of cosine terms window. - This function is consistent with scipy.signal.windows.general_cosine(). - """ - if _len_guards(M): - return paddle.ones((M, ), dtype=dtype) - M, needs_trunc = _extend(M, sym) - fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) - w = paddle.zeros((M, ), dtype=dtype) - for k in range(len(a)): - w += a[k] * paddle.cos(k * fac) - return _truncate(w, needs_trunc) - - def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hamming window. The Hamming window is a taper formed by using a raised cosine with @@ -375,6 +388,7 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) +## factory function def get_window(window: Union[str, Tuple[str, float]], win_length: int, fftbins: bool=True, diff --git a/paddleaudio/backends/__init__.py b/paddleaudio/paddleaudio/io/__init__.py similarity index 96% rename from paddleaudio/backends/__init__.py rename to paddleaudio/paddleaudio/io/__init__.py index f2f77ffe..185a92b8 100644 --- a/paddleaudio/backends/__init__.py +++ b/paddleaudio/paddleaudio/io/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .audio import * diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py new file mode 100644 index 00000000..a96530ff --- /dev/null +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .dtw import dtw_distance +from .mcd import mcd_distance diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py new file mode 100644 index 00000000..d27f56e2 --- /dev/null +++ b/paddleaudio/paddleaudio/metric/dtw.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from dtaidistance import dtw_ndim + +__all__ = [ + 'dtw_distance', +] + + +def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float: + """dtw distance + + Dynamic Time Warping. + This function keeps a compact matrix, not the full warping paths matrix. + Uses dynamic programming to compute: + + wps[i, j] = (s1[i]-s2[j])**2 + min( + wps[i-1, j ] + penalty, // vertical / insertion / expansion + wps[i , j-1] + penalty, // horizontal / deletion / compression + wps[i-1, j-1]) // diagonal / match + dtw = sqrt(wps[-1, -1]) + + Args: + xs (np.ndarray): ref sequence, [T,D] + ys (np.ndarray): hyp sequence, [T,D] + + Returns: + float: dtw distance + """ + return dtw_ndim.distance(xs, ys) diff --git a/paddleaudio/paddleaudio/metric/mcd.py b/paddleaudio/paddleaudio/metric/mcd.py new file mode 100644 index 00000000..465cd5a4 --- /dev/null +++ b/paddleaudio/paddleaudio/metric/mcd.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import mcd.metrics_fast as mt +import numpy as np +from mcd import dtw + +__all__ = [ + 'mcd_distance', +] + + +def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist): + """Mel cepstral distortion (MCD), dtw distance. + + Dynamic Time Warping. 
+ Uses dynamic programming to compute: + wps[i, j] = cost_fn(xs[i], ys[j]) + min( + wps[i-1, j ], // vertical / insertion / expansion + wps[i , j-1], // horizontal / deletion / compression + wps[i-1, j-1]) // diagonal / match + dtw = sqrt(wps[-1, -1]) + + Cost Function: + logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0) + def logSpecDbDist(x, y): + diff = x - y + return logSpecDbConst * math.sqrt(np.inner(diff, diff)) + + Args: + xs (np.ndarray): ref sequence, [T,D] + ys (np.ndarray): hyp sequence, [T,D] + + Returns: + float: dtw distance + """ + min_cost, path = dtw.dtw(xs, ys, cost_fn) + return min_cost diff --git a/paddleaudio/paddleaudio/sox_effects/__init__.py b/paddleaudio/paddleaudio/sox_effects/__init__.py new file mode 100644 index 00000000..97043fd7 --- /dev/null +++ b/paddleaudio/paddleaudio/sox_effects/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/paddleaudio/utils/__init__.py b/paddleaudio/paddleaudio/utils/__init__.py new file mode 100644 index 00000000..afb9cedd --- /dev/null +++ b/paddleaudio/paddleaudio/utils/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
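Both new metrics above take `(T, D)` float sequences of possibly different lengths and align them with dynamic time warping; they require the two third-party packages this diff adds to setup.py (dtaidistance, mcd). A usage sketch, with the import path assumed per the new package layout:

```python
import numpy as np

from paddleaudio.metric import dtw_distance, mcd_distance  # path assumed

ref = np.random.rand(100, 24)   # e.g. reference mel-cepstral features
hyp = np.random.rand(120, 24)   # hypothesis of a different length
print(dtw_distance(ref, hyp))   # DTW alignment cost
print(mcd_distance(ref, hyp))   # mel cepstral distortion under a DTW alignment
```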
+from .download import decompress +from .download import download_and_decompress +from .download import load_state_dict_from_url +from .env import DATA_HOME +from .env import MODEL_HOME +from .env import PPAUDIO_HOME +from .env import USER_HOME +from .error import ParameterError +from .log import Logger +from .log import logger +from .time import seconds_to_hms +from .time import Timer diff --git a/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py similarity index 94% rename from paddleaudio/utils/download.py rename to paddleaudio/paddleaudio/utils/download.py index 45a8e57b..4658352f 100644 --- a/paddleaudio/utils/download.py +++ b/paddleaudio/paddleaudio/utils/download.py @@ -22,6 +22,12 @@ from .log import logger download.logger = logger +__all__ = [ + 'decompress', + 'download_and_decompress', + 'load_state_dict_from_url', +] + def decompress(file: str): """ diff --git a/paddleaudio/utils/env.py b/paddleaudio/paddleaudio/utils/env.py similarity index 95% rename from paddleaudio/utils/env.py rename to paddleaudio/paddleaudio/utils/env.py index 59c6b621..a2d14b89 100644 --- a/paddleaudio/utils/env.py +++ b/paddleaudio/paddleaudio/utils/env.py @@ -20,6 +20,13 @@ PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. D ''' import os +__all__ = [ + 'USER_HOME', + 'PPAUDIO_HOME', + 'MODEL_HOME', + 'DATA_HOME', +] + def _get_user_home(): return os.path.expanduser('~') diff --git a/paddleaudio/utils/error.py b/paddleaudio/paddleaudio/utils/error.py similarity index 100% rename from paddleaudio/utils/error.py rename to paddleaudio/paddleaudio/utils/error.py diff --git a/paddleaudio/utils/log.py b/paddleaudio/paddleaudio/utils/log.py similarity index 98% rename from paddleaudio/utils/log.py rename to paddleaudio/paddleaudio/utils/log.py index 5e7db68a..5656b286 100644 --- a/paddleaudio/utils/log.py +++ b/paddleaudio/paddleaudio/utils/log.py @@ -19,7 +19,10 @@ import time import colorlog -loggers = {} +__all__ = [ + 'Logger', + 'logger', +] log_config = { 'DEBUG': { diff --git a/paddleaudio/utils/time.py b/paddleaudio/paddleaudio/utils/time.py similarity index 97% rename from paddleaudio/utils/time.py rename to paddleaudio/paddleaudio/utils/time.py index 6f0c7585..105208f9 100644 --- a/paddleaudio/utils/time.py +++ b/paddleaudio/paddleaudio/utils/time.py @@ -14,6 +14,11 @@ import math import time +__all__ = [ + 'Timer', + 'seconds_to_hms', +] + class Timer(object): '''Calculate runing speed and estimated time of arrival(ETA)''' diff --git a/setup_audio.py b/paddleaudio/setup.py similarity index 96% rename from setup_audio.py rename to paddleaudio/setup.py index 21204998..7623443a 100644 --- a/setup_audio.py +++ b/paddleaudio/setup.py @@ -14,7 +14,7 @@ import setuptools # set the version here -VERSION = '0.1.0' +VERSION = '0.2.0' def write_version_py(filename='paddleaudio/__init__.py'): @@ -59,6 +59,8 @@ setuptools.setup( 'resampy >= 0.2.2', 'soundfile >= 0.9.0', 'colorlog', + 'dtaidistance >= 2.3.6', + 'mcd >= 0.4', ], ) remove_version_py() diff --git a/paddleaudio/tests/.gitkeep b/paddleaudio/tests/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 185a92b8..b781c4a8 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
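Note that the renamed setup.py above bumps paddleaudio to 0.2.0 and declares the two metric dependencies. A hypothetical sanity check before calling into `paddleaudio.metric`, which imports them at module load time:

```python
import importlib.util

# Hypothetical check: verify the new optional deps from setup.py are present.
for pkg in ('dtaidistance', 'mcd'):
    found = importlib.util.find_spec(pkg) is not None
    print(pkg, 'ok' if found else 'missing: pip install ' + pkg)
```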
+import _locale + +_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index cecf76fe..b526a384 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -18,6 +18,7 @@ from .base_commands import BaseCommand from .base_commands import HelpCommand from .cls import CLSExecutor from .st import STExecutor +from .stats import StatsExecutor from .text import TextExecutor from .tts import TTSExecutor diff --git a/paddleaudio/__init__.py b/paddlespeech/cli/stats/__init__.py similarity index 92% rename from paddleaudio/__init__.py rename to paddlespeech/cli/stats/__init__.py index 2685cf57..9fe6c4ab 100644 --- a/paddleaudio/__init__.py +++ b/paddlespeech/cli/stats/__init__.py @@ -11,5 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .backends import * -from .features import * +from .infer import StatsExecutor diff --git a/paddlespeech/cli/stats/infer.py b/paddlespeech/cli/stats/infer.py new file mode 100644 index 00000000..4ef50449 --- /dev/null +++ b/paddlespeech/cli/stats/infer.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import List + +from prettytable import PrettyTable + +from ..log import logger +from ..utils import cli_register +from ..utils import stats_wrapper + +__all__ = ['StatsExecutor'] + +model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'cls': 'Model-Sample Rate', + 'st': 'Model-Source language-Target language', + 'text': 'Model-Task-Language', + 'tts': 'Model-Language' +} + + +@cli_register( + name='paddlespeech.stats', + description='Get speech tasks support models list.') +class StatsExecutor(): + def __init__(self): + super(StatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default='asr', + choices=['asr', 'cls', 'st', 'text', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'cls', 'st', 'text', 'tts'] + + def show_support_models(self, pretrained_models: dict): + fields = model_name_format[self.task].split("-") + table = PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. 
+ """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + return False + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + logger.info( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ASR pretrained models.") + return False + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + logger.info( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of CLS pretrained models.") + return False + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + logger.info( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of ST pretrained models.") + return False + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + logger.info( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error( + "Failed to get the list of TEXT pretrained models.") + return False + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + logger.info( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + return True + except BaseException: + logger.error("Failed to get the list of TTS pretrained models.") + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor. 
+ """ + self.task = task + if self.task not in self.task_choices: + print( + "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" + ) + + elif self.task == 'asr': + try: + from ..asr.infer import pretrained_models + print( + "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ASR pretrained models.") + + elif self.task == 'cls': + try: + from ..cls.infer import pretrained_models + print( + "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of CLS pretrained models.") + + elif self.task == 'st': + try: + from ..st.infer import pretrained_models + print( + "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of ST pretrained models.") + + elif self.task == 'text': + try: + from ..text.infer import pretrained_models + print( + "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TEXT pretrained models.") + + elif self.task == 'tts': + try: + from ..tts.infer import pretrained_models + print( + "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print("Failed to get the list of TTS pretrained models.") diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index ba15d652..8423dfa8 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -13,6 +13,7 @@ # limitations under the License. 
import argparse import os +import time from collections import OrderedDict from typing import Any from typing import List @@ -621,6 +622,7 @@ class TTSExecutor(BaseExecutor): am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False merge_sentences = False + frontend_st = time.time() if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': @@ -637,9 +639,13 @@ class TTSExecutor(BaseExecutor): phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") + self.frontend_time = time.time() - frontend_st + self.am_time = 0 + self.voc_time = 0 flags = 0 for i in range(len(phone_ids)): + am_st = time.time() part_phone_ids = phone_ids[i] # am if am_name == 'speedyspeech': @@ -653,13 +659,16 @@ class TTSExecutor(BaseExecutor): part_phone_ids, spk_id=paddle.to_tensor(spk_id)) else: mel = self.am_inference(part_phone_ids) + self.am_time += (time.time() - am_st) # voc + voc_st = time.time() wav = self.voc_inference(mel) if flags == 0: wav_all = wav flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + self.voc_time += (time.time() - voc_st) self._outputs['wav'] = wav_all def postprocess(self, output: str='output.wav') -> Union[str, os.PathLike]: diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 89752bb9..ac55af12 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index f7b05714..999723e5 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -33,8 +33,6 @@ from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder from paddlespeech.s2t.modules.loss import LabelSmoothingLoss -from paddlespeech.s2t.modules.mask import mask_finished_preds -from paddlespeech.s2t.modules.mask import mask_finished_scores from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools @@ -291,7 +289,7 @@ class U2STBaseModel(nn.Layer): device = speech.place # Let's assume B = batch_size and N = beam_size - # 1. Encoder and init hypothesis + # 1. 
Encoder and init hypothesis encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, diff --git a/paddlespeech/server/bin/__init__.py b/paddlespeech/server/bin/__init__.py index bd75747f..025aab09 100644 --- a/paddlespeech/server/bin/__init__.py +++ b/paddlespeech/server/bin/__init__.py @@ -14,3 +14,4 @@ from .paddlespeech_client import ASRClientExecutor from .paddlespeech_client import TTSClientExecutor from .paddlespeech_server import ServerExecutor +from .paddlespeech_server import ServerStatsExecutor diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index 360d295e..de528299 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -34,7 +34,7 @@ def init(config): bool: """ # init api - api_list = list(config.engine_backend) + api_list = list(engine.split("_")[0] for engine in config.engine_list) api_router = setup_router(api_list) app.include_router(api_router) diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 853d272f..ee6ab7ad 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -121,7 +121,6 @@ class TTSClientExecutor(BaseExecutor): (args.output)) logger.info("Audio duration: %f s." % (duration)) logger.info("Response time: %f s." % (time_consume)) - logger.info("RTF: %f " % (time_consume / duration)) return True except BaseException: diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index aff77d54..3d71f091 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,15 +16,17 @@ from typing import List import uvicorn from fastapi import FastAPI +from prettytable import PrettyTable from ..executor import BaseExecutor from ..util import cli_server_register from ..util import stats_wrapper +from paddlespeech.cli.log import logger from paddlespeech.server.engine.engine_pool import init_engine_pool from paddlespeech.server.restful.api import setup_router from paddlespeech.server.utils.config import get_config -__all__ = ['ServerExecutor'] +__all__ = ['ServerExecutor', 'ServerStatsExecutor'] app = FastAPI( title="PaddleSpeech Serving API", description="Api", version="0.0.1") @@ -60,7 +62,7 @@ class ServerExecutor(BaseExecutor): bool: """ # init api - api_list = list(config.engine_backend) + api_list = list(engine.split("_")[0] for engine in config.engine_list) api_router = setup_router(api_list) app.include_router(api_router) @@ -86,3 +88,139 @@ class ServerExecutor(BaseExecutor): config = get_config(config_file) if self.init(config): uvicorn.run(app, host=config.host, port=config.port, debug=True) + + +@cli_server_register( + name='paddlespeech_server.stats', + description='Get the models supported by each speech task in the service.') +class ServerStatsExecutor(): + def __init__(self): + super(ServerStatsExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog='paddlespeech_server.stats', add_help=True) + self.parser.add_argument( + '--task', + type=str, + default=None, + choices=['asr', 'tts'], + help='Choose speech task.', + required=True) + self.task_choices = ['asr', 'tts'] + self.model_name_format = { + 'asr': 'Model-Language-Sample Rate', + 'tts': 'Model-Language' + } + + def show_support_models(self, pretrained_models: dict): + fields = self.model_name_format[self.task].split("-") + table = 
PrettyTable(fields) + for key in pretrained_models: + table.add_row(key.split("-")) + print(table) + + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + parser_args = self.parser.parse_args(argv) + self.task = parser_args.task + if self.task not in self.task_choices: + logger.error( + "Please input correct speech task, choices = ['asr', 'tts']") + return False + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + logger.info( + "Here is the table of ASR pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + logger.info( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of ASR pretrained models supported in the service." + ) + return False + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + logger.info( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + logger.info( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + return True + except BaseException: + logger.error( + "Failed to get the table of TTS pretrained models supported in the service." + ) + return False + + @stats_wrapper + def __call__( + self, + task: str=None, ): + """ + Python API to call an executor. + """ + self.task = task + if self.task not in self.task_choices: + print("Please input correct speech task, choices = ['asr', 'tts']") + + elif self.task == 'asr': + try: + from paddlespeech.cli.asr.infer import pretrained_models + print( + "Here is the table of ASR pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show ASR static pretrained model + from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models + print( + "Here is the table of ASR static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + except BaseException: + print( + "Failed to get the table of ASR pretrained models supported in the service." + ) + + elif self.task == 'tts': + try: + from paddlespeech.cli.tts.infer import pretrained_models + print( + "Here is the table of TTS pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + # show TTS static pretrained model + from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models + print( + "Here is the table of TTS static pretrained models supported in the service." + ) + self.show_support_models(pretrained_models) + + except BaseException: + print( + "Failed to get the table of TTS pretrained models supported in the service." + ) diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index cc08665e..6048450b 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -1,25 +1,107 @@ # This is the parameter configuration file for PaddleSpeech Serving. 
-##################################################################
-#                      SERVER SETTING                            #
-##################################################################
-host: '0.0.0.0'
+#################################################################################
+#                                SERVER SETTING                                 #
+#################################################################################
+host: 127.0.0.1
 port: 8090
 
-##################################################################
-#                      CONFIG FILE                               #
-##################################################################
-# The engine_type of speech task needs to keep the same type as the config file of speech task.
-# E.g: The engine_type of asr is 'python', the engine_backend of asr is 'XX/asr.yaml'
-# E.g: The engine_type of asr is 'inference', the engine_backend of asr is 'XX/asr_pd.yaml'
-#
-# add engine type (Options: python, inference)
-engine_type:
-    asr: 'python'
-    tts: 'python'
-
-# add engine backend type (Options: asr, tts) and config file here.
-# Adding a speech task to engine_backend means starting the service.
-engine_backend:
-    asr: 'conf/asr/asr.yaml'
-    tts: 'conf/tts/tts.yaml'
+# The task format in the engine_list is: <speech task>_<engine type>
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
+
+engine_list: ['asr_python', 'tts_python']
+
+
+#################################################################################
+#                                 ENGINE CONFIG                                 #
+#################################################################################
+################### speech task: asr; engine_type: python #######################
+asr_python:
+    model: 'conformer_wenetspeech'
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path:  # [optional]
+    ckpt_path:  # [optional]
+    decode_method: 'attention_rescoring'
+    force_yes: True
+    device:  # set 'gpu:id' or 'cpu'
+
+
+################### speech task: asr; engine_type: inference #######################
+asr_inference:
+    # model_type choices=['deepspeech2offline_aishell']
+    model_type: 'deepspeech2offline_aishell'
+    am_model:   # the pdmodel file of am static model [optional]
+    am_params:  # the pdiparams file of am static model [optional]
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path:
+    decode_method:
+    force_yes: True
+
+    am_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+
+################### speech task: tts; engine_type: python #######################
+tts_python:
+    # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
+    #                              'fastspeech2_ljspeech', 'fastspeech2_aishell3',
+    #                              'fastspeech2_vctk']
+    am: 'fastspeech2_csmsc'
+    am_config:
+    am_ckpt:
+    am_stat:
+    phones_dict:
+    tones_dict:
+    speaker_dict:
+    spk_id: 0
+
+    # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
+    #                        'pwgan_vctk', 'mb_melgan_csmsc']
+    voc: 'pwgan_csmsc'
+    voc_config:
+    voc_ckpt:
+    voc_stat:
+
+    # others
+    lang: 'zh'
+    device:  # set 'gpu:id' or 'cpu'
+
+
+################### speech task: tts; engine_type: inference #######################
+tts_inference:
+    # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
+    am: 'fastspeech2_csmsc'
+    am_model:   # the pdmodel file of your am static model (XX.pdmodel)
+    am_params:  # the pdiparams file of your am static model (XX.pdiparams)
+    am_sample_rate: 24000
+    phones_dict:
+    tones_dict:
+    speaker_dict:
+    spk_id: 0
+
+    am_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
+    voc: 'pwgan_csmsc'
+    voc_model:  # the pdmodel file of your vocoder static model (XX.pdmodel)
+    voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
+    voc_sample_rate: 24000
+
+    voc_predictor_conf:
+        device:  # set 'gpu:id' or 'cpu'
+        switch_ir_optim: True
+        glog_info: False  # True -> print glog
+        summary: True  # False -> do not show predictor config
+
+    # others
+    lang: 'zh'
+
diff --git a/paddlespeech/server/conf/asr/asr.yaml b/paddlespeech/server/conf/asr/asr.yaml
deleted file mode 100644
index 1a805142..00000000
--- a/paddlespeech/server/conf/asr/asr.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-model: 'conformer_wenetspeech'
-lang: 'zh'
-sample_rate: 16000
-cfg_path:  # [optional]
-ckpt_path:  # [optional]
-decode_method: 'attention_rescoring'
-force_yes: True
-device: 'cpu'  # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/conf/asr/asr_pd.yaml b/paddlespeech/server/conf/asr/asr_pd.yaml
deleted file mode 100644
index 6cddb450..00000000
--- a/paddlespeech/server/conf/asr/asr_pd.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# This is the parameter configuration file for ASR server.
-# These are the static models that support paddle inference.
-
-##################################################################
-#                     ACOUSTIC MODEL SETTING                     #
-#         am choices=['deepspeech2offline_aishell'] TODO         #
-##################################################################
-model_type: 'deepspeech2offline_aishell'
-am_model:   # the pdmodel file of am static model [optional]
-am_params:  # the pdiparams file of am static model [optional]
-lang: 'zh'
-sample_rate: 16000
-cfg_path:
-decode_method:
-force_yes: True
-
-am_predictor_conf:
-    device: 'cpu'  # set 'gpu:id' or 'cpu'
-    enable_mkldnn: True
-    switch_ir_optim: True
-
-
-##################################################################
-#                             OTHERS                             #
-##################################################################
diff --git a/paddlespeech/server/conf/tts/tts.yaml b/paddlespeech/server/conf/tts/tts.yaml
deleted file mode 100644
index 19e8874e..00000000
--- a/paddlespeech/server/conf/tts/tts.yaml
+++ /dev/null
@@ -1,32 +0,0 @@
-# This is the parameter configuration file for TTS server.
-
-##################################################################
-#                     ACOUSTIC MODEL SETTING                     #
-# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',         #
-#             'fastspeech2_ljspeech', 'fastspeech2_aishell3',    #
-#             'fastspeech2_vctk']                                #
-##################################################################
-am: 'fastspeech2_csmsc'
-am_config:
-am_ckpt:
-am_stat:
-phones_dict:
-tones_dict:
-speaker_dict:
-spk_id: 0
-
-##################################################################
-#                         VOCODER SETTING                        #
-# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',#
-#              'pwgan_vctk', 'mb_melgan_csmsc']                  #
-##################################################################
-voc: 'pwgan_csmsc'
-voc_config:
-voc_ckpt:
-voc_stat:
-
-##################################################################
-#                             OTHERS                             #
-##################################################################
-lang: 'zh'
-device: 'cpu'  # set 'gpu:id' or 'cpu'
diff --git a/paddlespeech/server/conf/tts/tts_pd.yaml b/paddlespeech/server/conf/tts/tts_pd.yaml
deleted file mode 100644
index 019c7ed6..00000000
--- a/paddlespeech/server/conf/tts/tts_pd.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-# This is the parameter configuration file for TTS server.
-# These are the static models that support paddle inference. - -################################################################## -# ACOUSTIC MODEL SETTING # -# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'] -################################################################## -am: 'fastspeech2_csmsc' -am_model: # the pdmodel file of your am static model (XX.pdmodel) -am_params: # the pdiparams file of your am static model (XX.pdipparams) -am_sample_rate: 24000 # must match the model -phones_dict: -tones_dict: -speaker_dict: -spk_id: 0 - -am_predictor_conf: - device: 'cpu' # set 'gpu:id' or 'cpu' - enable_mkldnn: False - switch_ir_optim: False - - -################################################################## -# VOCODER SETTING # -# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc'] -################################################################## -voc: 'pwgan_csmsc' -voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel) -voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams) -voc_sample_rate: 24000 #must match the model - -voc_predictor_conf: - device: 'cpu' # set 'gpu:id' or 'cpu' - enable_mkldnn: False - switch_ir_optim: False - -################################################################## -# OTHERS # -################################################################## -lang: 'zh' diff --git a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py index 5d4c4fa6..1925bf1d 100644 --- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py @@ -13,6 +13,7 @@ # limitations under the License. import io import os +import time from typing import Optional import paddle @@ -25,7 +26,6 @@ from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine -from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.paddle_predictor import init_predictor from paddlespeech.server.utils.paddle_predictor import run_model @@ -183,7 +183,7 @@ class ASREngine(BaseEngine): def __init__(self): super(ASREngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: """init engine resource Args: @@ -195,9 +195,8 @@ class ASREngine(BaseEngine): self.input = None self.output = None self.executor = ASRServerExecutor() - self.config = get_config(config_file) + self.config = config - paddle.set_device(paddle.get_device()) self.executor._init_from_path( model_type=self.config.model_type, am_model=self.config.am_model, @@ -223,13 +222,18 @@ class ASREngine(BaseEngine): logger.info("start running asr engine") self.executor.preprocess(self.config.model_type, io.BytesIO(audio_data)) + st = time.time() self.executor.infer(self.config.model_type) + infer_time = time.time() - st self.output = self.executor.postprocess() # Retrieve result of asr. 
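            # note: infer_time is bound only on this successful-check branch;
            # the summary log further below assumes the file check passed.
            #
            # With init() now taking an in-memory config section instead of a
            # yaml path, the engine can be driven directly. A minimal sketch
            # (the loader and file names here are illustrative assumptions):
            #
            #   import yaml
            #   from yacs.config import CfgNode
            #   with open("conf/application.yaml") as f:
            #       config = CfgNode(yaml.safe_load(f))
            #   engine = ASREngine()
            #   engine.init(config=config["asr_inference"])
            #   with open("zh.wav", "rb") as f:
            #       engine.run(f.read())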
logger.info("end inferring asr engine") else: logger.info("file check failed!") self.output = None + logger.info("inference time: {}".format(infer_time)) + logger.info("asr engine type: paddle inference") + def postprocess(self): """postprocess """ diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index 9fac487d..e76c49a7 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import io +import time import paddle from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.server.engine.base_engine import BaseEngine -from paddlespeech.server.utils.config import get_config __all__ = ['ASREngine'] @@ -39,7 +39,7 @@ class ASREngine(BaseEngine): def __init__(self): super(ASREngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: """init engine resource Args: @@ -51,18 +51,25 @@ class ASREngine(BaseEngine): self.input = None self.output = None self.executor = ASRServerExecutor() + self.config = config + try: + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) - self.config = get_config(config_file) - if self.config.device is None: - paddle.set_device(paddle.get_device()) - else: - paddle.set_device(self.config.device) self.executor._init_from_path( self.config.model, self.config.lang, self.config.sample_rate, self.config.cfg_path, self.config.decode_method, self.config.ckpt_path) - logger.info("Initialize ASR server engine successfully.") + logger.info("Initialize ASR server engine successfully on device: %s." % + (self.device)) return True def run(self, audio_data): @@ -76,12 +83,17 @@ class ASREngine(BaseEngine): self.config.force_yes): logger.info("start run asr engine") self.executor.preprocess(self.config.model, io.BytesIO(audio_data)) + st = time.time() self.executor.infer(self.config.model) + infer_time = time.time() - st self.output = self.executor.postprocess() # Retrieve result of asr. 
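            # infer_time above covers only executor.infer(), i.e. the model
            # forward pass plus decoding; preprocess() and postprocess() are
            # excluded. A rough real-time factor under an assumed 16 kHz input:
            #
            #   duration = num_samples / 16000.0  # seconds of audio
            #   rtf = infer_time / duration       # < 1.0 => faster than real time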
else: logger.info("file check failed!") self.output = None + logger.info("inference time: {}".format(infer_time)) + logger.info("asr engine type: python") + def postprocess(self): """postprocess """ diff --git a/paddlespeech/server/engine/engine_pool.py b/paddlespeech/server/engine/engine_pool.py index f6a4d2aa..9de73567 100644 --- a/paddlespeech/server/engine/engine_pool.py +++ b/paddlespeech/server/engine/engine_pool.py @@ -28,11 +28,13 @@ def init_engine_pool(config) -> bool: """ Init engine pool """ global ENGINE_POOL - for engine in config.engine_backend: + + for engine_and_type in config.engine_list: + engine = engine_and_type.split("_")[0] + engine_type = engine_and_type.split("_")[1] ENGINE_POOL[engine] = EngineFactory.get_engine( - engine_name=engine, engine_type=config.engine_type[engine]) - if not ENGINE_POOL[engine].init( - config_file=config.engine_backend[engine]): + engine_name=engine, engine_type=engine_type) + if not ENGINE_POOL[engine].init(config=config[engine_and_type]): return False return True diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index a9dc5f4e..1bbbe0ea 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -14,6 +14,7 @@ import base64 import io import os +import time from typing import Optional import librosa @@ -28,7 +29,6 @@ from paddlespeech.cli.utils import download_and_decompress from paddlespeech.cli.utils import MODEL_HOME from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed -from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.errors import ErrorCode from paddlespeech.server.utils.exception import ServerBaseException from paddlespeech.server.utils.paddle_predictor import init_predictor @@ -179,7 +179,7 @@ class TTSServerExecutor(TTSExecutor): self.phones_dict = os.path.abspath(phones_dict) self.am_sample_rate = am_sample_rate self.am_res_path = os.path.dirname(os.path.abspath(self.am_model)) - print("self.phones_dict:", self.phones_dict) + logger.info("self.phones_dict: {}".format(self.phones_dict)) # for speedyspeech self.tones_dict = None @@ -224,21 +224,21 @@ class TTSServerExecutor(TTSExecutor): with open(self.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) - print("vocab_size:", vocab_size) + logger.info("vocab_size: {}".format(vocab_size)) tone_size = None if self.tones_dict: with open(self.tones_dict, "r") as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) - print("tone_size:", tone_size) + logger.info("tone_size: {}".format(tone_size)) spk_num = None if self.speaker_dict: with open(self.speaker_dict, 'rt') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) - print("spk_num:", spk_num) + logger.info("spk_num: {}".format(spk_num)) # frontend if lang == 'zh': @@ -248,21 +248,29 @@ class TTSServerExecutor(TTSExecutor): elif lang == 'en': self.frontend = English(phone_vocab_path=self.phones_dict) - print("frontend done!") - - # am predictor - self.am_predictor_conf = am_predictor_conf - self.am_predictor = init_predictor( - model_file=self.am_model, - params_file=self.am_params, - predictor_conf=self.am_predictor_conf) - - # voc predictor - self.voc_predictor_conf = voc_predictor_conf - self.voc_predictor = init_predictor( - 
model_file=self.voc_model, - params_file=self.voc_params, - predictor_conf=self.voc_predictor_conf) + logger.info("frontend done!") + + try: + # am predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + logger.info("Create AM predictor successfully.") + except BaseException: + logger.error("Failed to create AM predictor.") + + try: + # voc predictor + self.voc_predictor_conf = voc_predictor_conf + self.voc_predictor = init_predictor( + model_file=self.voc_model, + params_file=self.voc_params, + predictor_conf=self.voc_predictor_conf) + logger.info("Create Vocoder predictor successfully.") + except BaseException: + logger.error("Failed to create Vocoder predictor.") @paddle.no_grad() def infer(self, @@ -277,6 +285,7 @@ class TTSServerExecutor(TTSExecutor): am_dataset = am[am.rindex('_') + 1:] get_tone_ids = False merge_sentences = False + frontend_st = time.time() if am_name == 'speedyspeech': get_tone_ids = True if lang == 'zh': @@ -292,10 +301,14 @@ class TTSServerExecutor(TTSExecutor): text, merge_sentences=merge_sentences) phone_ids = input_ids["phone_ids"] else: - print("lang should in {'zh', 'en'}!") + logger.error("lang should in {'zh', 'en'}!") + self.frontend_time = time.time() - frontend_st + self.am_time = 0 + self.voc_time = 0 flags = 0 for i in range(len(phone_ids)): + am_st = time.time() part_phone_ids = phone_ids[i] # am if am_name == 'speedyspeech': @@ -314,7 +327,10 @@ class TTSServerExecutor(TTSExecutor): am_result = run_model(self.am_predictor, [part_phone_ids.numpy()]) mel = am_result[0] + self.am_time += (time.time() - am_st) + # voc + voc_st = time.time() voc_result = run_model(self.voc_predictor, [mel]) wav = voc_result[0] wav = paddle.to_tensor(wav) @@ -324,6 +340,7 @@ class TTSServerExecutor(TTSExecutor): flags = 1 else: wav_all = paddle.concat([wav_all, wav]) + self.voc_time += (time.time() - voc_st) self._outputs['wav'] = wav_all @@ -339,11 +356,11 @@ class TTSEngine(BaseEngine): """ super(TTSEngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() try: - self.config = get_config(config_file) + self.config = config self.executor._init_from_path( am=self.config.am, am_model=self.config.am_model, @@ -370,7 +387,7 @@ class TTSEngine(BaseEngine): def postprocess(self, wav, original_fs: int, - target_fs: int=16000, + target_fs: int=0, volume: float=1.0, speed: float=1.0, audio_path: str=None): @@ -395,38 +412,50 @@ class TTSEngine(BaseEngine): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav + logger.info( + "The sample rate of synthesized audio is the same as model, which is {}Hz". + format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - + logger.info( + "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". + format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume + logger.info("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) + logger.info("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, - "Transform speed failed. Can not install soxbindings on your system. 
\ + "Failed to transform speed. Can not install soxbindings on your system. \ You need to set speed value 1.0.") except BaseException: - logger.error("Transform speed failed.") + logger.error("Failed to transform speed.") # wav to base64 buf = io.BytesIO() wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') + logger.info("Audio to string successfully.") # save audio - if audio_path is not None and audio_path.endswith(".wav"): - sf.write(audio_path, wav_speed, target_fs) - elif audio_path is not None and audio_path.endswith(".pcm"): - wav_norm = wav_speed * (32767 / max(0.001, - np.max(np.abs(wav_speed)))) - with open(audio_path, "wb") as f: - f.write(wav_norm.astype(np.int16)) + if audio_path is not None: + if audio_path.endswith(".wav"): + sf.write(audio_path, wav_speed, target_fs) + elif audio_path.endswith(".pcm"): + wav_norm = wav_speed * (32767 / max(0.001, + np.max(np.abs(wav_speed)))) + with open(audio_path, "wb") as f: + f.write(wav_norm.astype(np.int16)) + logger.info("Save audio to {} successfully.".format(audio_path)) + else: + logger.info("There is no need to save audio.") return target_fs, wav_base64 @@ -462,8 +491,12 @@ class TTSEngine(BaseEngine): lang = self.config.lang try: + infer_st = time.time() self.executor.infer( text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) + infer_et = time.time() + infer_time = infer_et - infer_st + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") @@ -471,6 +504,7 @@ class TTSEngine(BaseEngine): logger.error("tts infer failed.") try: + postprocess_st = time.time() target_sample_rate, wav_base64 = self.postprocess( wav=self.executor._outputs['wav'].numpy(), original_fs=self.executor.am_sample_rate, @@ -478,10 +512,34 @@ class TTSEngine(BaseEngine): volume=volume, speed=speed, audio_path=save_path) + postprocess_et = time.time() + postprocess_time = postprocess_et - postprocess_st + duration = len(self.executor._outputs['wav'] + .numpy()) / self.executor.am_sample_rate + rtf = infer_time / duration + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts postprocess failed.") except BaseException: logger.error("tts postprocess failed.") + logger.info("AM model: {}".format(self.config.am)) + logger.info("Vocoder model: {}".format(self.config.voc)) + logger.info("Language: {}".format(lang)) + logger.info("tts engine type: paddle inference") + + logger.info("audio duration: {}".format(duration)) + logger.info( + "frontend inference time: {}".format(self.executor.frontend_time)) + logger.info("AM inference time: {}".format(self.executor.am_time)) + logger.info("Vocoder inference time: {}".format(self.executor.voc_time)) + logger.info("total inference time: {}".format(infer_time)) + logger.info( + "postprocess (change speed, volume, target sample rate) time: {}". + format(postprocess_time)) + logger.info("total generate audio time: {}".format(infer_time + + postprocess_time)) + logger.info("RTF: {}".format(rtf)) + return lang, target_sample_rate, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index 20b4e0fe..8d6c7fd1 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -13,6 +13,7 @@ # limitations under the License. 
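# The TTSEngine below reports a per-request real-time factor computed as
# rtf = infer_time / duration, where duration = len(wav) / am_config.fs.
# A worked example with assumed numbers: a 24 kHz acoustic model emitting
# 120000 samples gives duration = 120000 / 24000 = 5.0 s; if inference took
# 1.2 s of wall time, rtf = 1.2 / 5.0 = 0.24, i.e. about 4x faster than
# real time.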
import base64 import io +import time import librosa import numpy as np @@ -24,7 +25,6 @@ from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import change_speed -from paddlespeech.server.utils.config import get_config from paddlespeech.server.utils.errors import ErrorCode from paddlespeech.server.utils.exception import ServerBaseException @@ -49,16 +49,25 @@ class TTSEngine(BaseEngine): """ super(TTSEngine, self).__init__() - def init(self, config_file: str) -> bool: + def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() try: - self.config = get_config(config_file) - if self.config.device is None: - paddle.set_device(paddle.get_device()) + self.config = config + if self.config.device: + self.device = self.config.device else: - paddle.set_device(self.config.device) + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) + return False + try: self.executor._init_from_path( am=self.config.am, am_config=self.config.am_config, @@ -73,16 +82,19 @@ class TTSEngine(BaseEngine): voc_stat=self.config.voc_stat, lang=self.config.lang) except BaseException: - logger.error("Initialize TTS server engine Failed.") + logger.error("Failed to get model related files.") + logger.error("Initialize TTS server engine Failed on device: %s." % + (self.device)) return False - logger.info("Initialize TTS server engine successfully.") + logger.info("Initialize TTS server engine successfully on device: %s." % + (self.device)) return True def postprocess(self, wav, original_fs: int, - target_fs: int=16000, + target_fs: int=0, volume: float=1.0, speed: float=1.0, audio_path: str=None): @@ -107,38 +119,50 @@ class TTSEngine(BaseEngine): if target_fs == 0 or target_fs > original_fs: target_fs = original_fs wav_tar_fs = wav + logger.info( + "The sample rate of synthesized audio is the same as model, which is {}Hz". + format(original_fs)) else: wav_tar_fs = librosa.resample( np.squeeze(wav), original_fs, target_fs) - + logger.info( + "The sample rate of model is {}Hz and the target sample rate is {}Hz. Converting the sample rate of the synthesized audio successfully.". + format(original_fs, target_fs)) # transform volume wav_vol = wav_tar_fs * volume + logger.info("Transform the volume of the audio successfully.") # transform speed try: # windows not support soxbindings wav_speed = change_speed(wav_vol, speed, target_fs) + logger.info("Transform the speed of the audio successfully.") except ServerBaseException: raise ServerBaseException( ErrorCode.SERVER_INTERNAL_ERR, - "Transform speed failed. Can not install soxbindings on your system. \ + "Failed to transform speed. Can not install soxbindings on your system. 
\ You need to set speed value 1.0.") except BaseException: - logger.error("Transform speed failed.") + logger.error("Failed to transform speed.") # wav to base64 buf = io.BytesIO() wavfile.write(buf, target_fs, wav_speed) base64_bytes = base64.b64encode(buf.read()) wav_base64 = base64_bytes.decode('utf-8') + logger.info("Audio to string successfully.") # save audio - if audio_path is not None and audio_path.endswith(".wav"): - sf.write(audio_path, wav_speed, target_fs) - elif audio_path is not None and audio_path.endswith(".pcm"): - wav_norm = wav_speed * (32767 / max(0.001, - np.max(np.abs(wav_speed)))) - with open(audio_path, "wb") as f: - f.write(wav_norm.astype(np.int16)) + if audio_path is not None: + if audio_path.endswith(".wav"): + sf.write(audio_path, wav_speed, target_fs) + elif audio_path.endswith(".pcm"): + wav_norm = wav_speed * (32767 / max(0.001, + np.max(np.abs(wav_speed)))) + with open(audio_path, "wb") as f: + f.write(wav_norm.astype(np.int16)) + logger.info("Save audio to {} successfully.".format(audio_path)) + else: + logger.info("There is no need to save audio.") return target_fs, wav_base64 @@ -174,8 +198,15 @@ class TTSEngine(BaseEngine): lang = self.config.lang try: + infer_st = time.time() self.executor.infer( text=sentence, lang=lang, am=self.config.am, spk_id=spk_id) + infer_et = time.time() + infer_time = infer_et - infer_st + duration = len(self.executor._outputs['wav'] + .numpy()) / self.executor.am_config.fs + rtf = infer_time / duration + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts infer failed.") @@ -183,6 +214,7 @@ class TTSEngine(BaseEngine): logger.error("tts infer failed.") try: + postprocess_st = time.time() target_sample_rate, wav_base64 = self.postprocess( wav=self.executor._outputs['wav'].numpy(), original_fs=self.executor.am_config.fs, @@ -190,10 +222,32 @@ class TTSEngine(BaseEngine): volume=volume, speed=speed, audio_path=save_path) + postprocess_et = time.time() + postprocess_time = postprocess_et - postprocess_st + except ServerBaseException: raise ServerBaseException(ErrorCode.SERVER_INTERNAL_ERR, "tts postprocess failed.") except BaseException: logger.error("tts postprocess failed.") + logger.info("AM model: {}".format(self.config.am)) + logger.info("Vocoder model: {}".format(self.config.voc)) + logger.info("Language: {}".format(lang)) + logger.info("tts engine type: python") + + logger.info("audio duration: {}".format(duration)) + logger.info( + "frontend inference time: {}".format(self.executor.frontend_time)) + logger.info("AM inference time: {}".format(self.executor.am_time)) + logger.info("Vocoder inference time: {}".format(self.executor.voc_time)) + logger.info("total inference time: {}".format(infer_time)) + logger.info( + "postprocess (change speed, volume, target sample rate) time: {}". 
+ format(postprocess_time)) + logger.info("total generate audio time: {}".format(infer_time + + postprocess_time)) + logger.info("RTF: {}".format(rtf)) + logger.info("device: {}".format(self.device)) + return lang, target_sample_rate, wav_base64 diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py index c7e91300..0af0f6d0 100644 --- a/paddlespeech/server/restful/tts_api.py +++ b/paddlespeech/server/restful/tts_api.py @@ -16,6 +16,7 @@ from typing import Union from fastapi import APIRouter +from paddlespeech.cli.log import logger from paddlespeech.server.engine.engine_pool import get_engine_pool from paddlespeech.server.restful.request import TTSRequest from paddlespeech.server.restful.response import ErrorResponse @@ -60,6 +61,9 @@ def tts(request_body: TTSRequest): Returns: json: [description] """ + + logger.info("request: {}".format(request_body)) + # get params text = request_body.text spk_id = request_body.spk_id @@ -92,6 +96,7 @@ def tts(request_body: TTSRequest): # get single engine from engine pool engine_pool = get_engine_pool() tts_engine = engine_pool['tts'] + logger.info("Get tts engine successfully.") lang, target_sample_rate, wav_base64 = tts_engine.run( text, spk_id, speed, volume, sample_rate, save_path) diff --git a/paddlespeech/server/utils/paddle_predictor.py b/paddlespeech/server/utils/paddle_predictor.py index f4216d74..4035d48d 100644 --- a/paddlespeech/server/utils/paddle_predictor.py +++ b/paddlespeech/server/utils/paddle_predictor.py @@ -15,6 +15,7 @@ import os from typing import List from typing import Optional +import paddle from paddle.inference import Config from paddle.inference import create_predictor @@ -40,15 +41,30 @@ def init_predictor(model_dir: Optional[os.PathLike]=None, else: config = Config(model_file, params_file) - config.enable_memory_optim() - if "gpu" in predictor_conf["device"]: - gpu_id = predictor_conf["device"].split(":")[-1] + # set device + if predictor_conf["device"]: + device = predictor_conf["device"] + else: + device = paddle.get_device() + if "gpu" in device: + gpu_id = device.split(":")[-1] config.enable_use_gpu(1000, int(gpu_id)) - if predictor_conf["enable_mkldnn"]: - config.enable_mkldnn() + + # IR optim if predictor_conf["switch_ir_optim"]: config.switch_ir_optim() + # glog + if not predictor_conf["glog_info"]: + config.disable_glog_info() + + # config summary + if predictor_conf["summary"]: + print(config.summary()) + + # memory optim + config.enable_memory_optim() + predictor = create_predictor(config) return predictor diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 1c42a87c..81da14f2 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -20,6 +20,7 @@ import numpy as np import paddle import soundfile as sf import yaml +from timer import timer from yacs.config import CfgNode from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -50,6 +51,18 @@ model_alias = { "paddlespeech.t2s.models.melgan:MelGANGenerator", "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + 
"paddlespeech.t2s.models.wavernn:WaveRNNInference", } @@ -146,10 +159,15 @@ def evaluate(args): voc_name = args.voc[:args.voc.rindex('_')] voc_class = dynamic_import(voc_name, model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() + if voc_name != 'wavernn': + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**voc_config["model"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) + voc.eval() voc_mu, voc_std = np.load(args.voc_stat) voc_mu = paddle.to_tensor(voc_mu) voc_std = paddle.to_tensor(voc_std) @@ -162,38 +180,51 @@ def evaluate(args): output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) + N = 0 + T = 0 + for datum in test_dataset: utt_id = datum["utt_id"] - with paddle.no_grad(): - # acoustic model - if am_name == 'fastspeech2': - phone_ids = paddle.to_tensor(datum["text"]) - spk_emb = None - spk_id = None - # multi speaker - if args.voice_cloning and "spk_emb" in datum: - spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) - elif "spk_id" in datum: - spk_id = paddle.to_tensor(datum["spk_id"]) - mel = am_inference(phone_ids, spk_id=spk_id, spk_emb=spk_emb) - elif am_name == 'speedyspeech': - phone_ids = paddle.to_tensor(datum["phones"]) - tone_ids = paddle.to_tensor(datum["tones"]) - mel = am_inference(phone_ids, tone_ids) - elif am_name == 'tacotron2': - phone_ids = paddle.to_tensor(datum["text"]) - spk_emb = None - # multi speaker - if args.voice_cloning and "spk_emb" in datum: - spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) - mel = am_inference(phone_ids, spk_emb=spk_emb) + with timer() as t: + with paddle.no_grad(): + # acoustic model + if am_name == 'fastspeech2': + phone_ids = paddle.to_tensor(datum["text"]) + spk_emb = None + spk_id = None + # multi speaker + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + elif "spk_id" in datum: + spk_id = paddle.to_tensor(datum["spk_id"]) + mel = am_inference( + phone_ids, spk_id=spk_id, spk_emb=spk_emb) + elif am_name == 'speedyspeech': + phone_ids = paddle.to_tensor(datum["phones"]) + tone_ids = paddle.to_tensor(datum["tones"]) + mel = am_inference(phone_ids, tone_ids) + elif am_name == 'tacotron2': + phone_ids = paddle.to_tensor(datum["text"]) + spk_emb = None + # multi speaker + if args.voice_cloning and "spk_emb" in datum: + spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) + mel = am_inference(phone_ids, spk_emb=spk_emb) # vocoder wav = voc_inference(mel) + + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = am_config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+ ) sf.write( - str(output_dir / (utt_id + ".wav")), - wav.numpy(), - samplerate=am_config.fs) + str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") def main(): @@ -246,7 +277,8 @@ def main(): default='pwgan_csmsc', choices=[ 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc' + 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc', + 'style_melgan_csmsc' ], help='Choose vocoder type of tts task.') diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 75c631b8..94180f85 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -21,6 +21,7 @@ import soundfile as sf import yaml from paddle import jit from paddle.static import InputSpec +from timer import timer from yacs.config import CfgNode from paddlespeech.s2t.utils.dynamic_import import dynamic_import @@ -194,10 +195,10 @@ def evaluate(args): am_inference = jit.to_static( am_inference, input_spec=[ - InputSpec([-1], dtype=paddle.int64), # text - InputSpec([-1], dtype=paddle.int64), # tone - None, # duration - InputSpec([-1], dtype=paddle.int64) # spk_id + InputSpec([-1], dtype=paddle.int64), # text + InputSpec([-1], dtype=paddle.int64), # tone + InputSpec([1], dtype=paddle.int64), # spk_id + None # duration ]) else: am_inference = jit.to_static( @@ -233,59 +234,68 @@ def evaluate(args): # but still not stopping in the end (NOTE by yuantian01 Feb 9 2022) if am_name == 'tacotron2': merge_sentences = True - + N = 0 + T = 0 for utt_id, sentence in sentences: - get_tone_ids = False - if am_name == 'speedyspeech': - get_tone_ids = True - if args.lang == 'zh': - input_ids = frontend.get_input_ids( - sentence, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - with paddle.no_grad(): - flags = 0 - for i in range(len(phone_ids)): - part_phone_ids = phone_ids[i] - # acoustic model - if am_name == 'fastspeech2': - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, spk_id) - else: + with timer() as t: + get_tone_ids = False + if am_name == 'speedyspeech': + get_tone_ids = True + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + with paddle.no_grad(): + flags = 0 + for i in range(len(phone_ids)): + part_phone_ids = phone_ids[i] + # acoustic model + if am_name == 'fastspeech2': + # multi speaker + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, spk_id) + else: + mel = am_inference(part_phone_ids) + elif am_name == 'speedyspeech': + part_tone_ids = tone_ids[i] + if am_dataset in {"aishell3", "vctk"}: + spk_id = paddle.to_tensor(args.spk_id) + mel = am_inference(part_phone_ids, part_tone_ids, + spk_id) + else: + 
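+                                # single-speaker dataset: call the AM without a spk_id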
mel = am_inference(part_phone_ids, part_tone_ids) + elif am_name == 'tacotron2': mel = am_inference(part_phone_ids) - elif am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, part_tone_ids, - spk_id) + # vocoder + wav = voc_inference(mel) + if flags == 0: + wav_all = wav + flags = 1 else: - mel = am_inference(part_phone_ids, part_tone_ids) - elif am_name == 'tacotron2': - mel = am_inference(part_phone_ids) - # vocoder - wav = voc_inference(mel) - if flags == 0: - wav_all = wav - flags = 1 - else: - wav_all = paddle.concat([wav_all, wav]) + wav_all = paddle.concat([wav_all, wav]) + wav = wav_all.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = am_config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) sf.write( - str(output_dir / (utt_id + ".wav")), - wav_all.numpy(), - samplerate=am_config.fs) + str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") def main(): diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py index 4357b282..d23e9cb7 100644 --- a/paddlespeech/t2s/exps/wavernn/synthesize.py +++ b/paddlespeech/t2s/exps/wavernn/synthesize.py @@ -91,7 +91,7 @@ def main(): target=config.inference.target, overlap=config.inference.overlap, mu_law=config.mu_law, - gen_display=True) + gen_display=False) wav = wav.numpy() N += wav.size T += t.elapse diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 5264e068..07f7fa2b 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -63,7 +63,7 @@ class ToneSandhi(): '扫把', '惦记' } self.must_not_neural_tone_words = { - "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子" + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子", "人人", "虎虎" } self.punc = ":,;。?!“”‘’':,;.?!" @@ -77,7 +77,9 @@ class ToneSandhi(): # reduplication words for n. and v. e.g. 
奶奶, 试试, 旺旺
         for j, item in enumerate(word):
-            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
+            if j - 1 >= 0 and item == word[j - 1] and pos[0] in {
+                    "n", "v", "a"
+            } and word not in self.must_not_neural_tone_words:
                 finals[j] = finals[j][:-1] + "5"
         ge_idx = word.find("个")
         if len(word) >= 1 and word[-1] in "吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index a905c412..bb8ed5b4 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -20,7 +20,10 @@ import numpy as np
 import paddle
 from g2pM import G2pM
 from pypinyin import lazy_pinyin
+from pypinyin import load_phrases_dict
+from pypinyin import load_single_dict
 from pypinyin import Style
+from pypinyin_dict.phrase_pinyin_data import large_pinyin
 
 from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
 from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
@@ -41,6 +44,8 @@ class Frontend():
             self.g2pM_model = G2pM()
             self.pinyin2phone = generate_lexicon(
                 with_tone=True, with_erhua=False)
+        else:
+            self.__init__pypinyin()
         self.must_erhua = {"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}
         self.not_erhua = {
             "虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", "有儿", "一儿", "我儿", "俺儿", "妻儿",
@@ -62,6 +67,23 @@
         for tone, id in tone_id:
             self.vocab_tones[tone] = int(id)
 
+    def __init__pypinyin(self):
+        large_pinyin.load()
+
+        load_phrases_dict({u'开户行': [[u'kai1'], [u'hu4'], [u'hang2']]})
+        load_phrases_dict({u'发卡行': [[u'fa4'], [u'ka3'], [u'hang2']]})
+        load_phrases_dict({u'放款行': [[u'fang4'], [u'kuan3'], [u'hang2']]})
+        load_phrases_dict({u'茧行': [[u'jian3'], [u'hang2']]})
+        load_phrases_dict({u'行号': [[u'hang2'], [u'hao4']]})
+        load_phrases_dict({u'各地': [[u'ge4'], [u'di4']]})
+        load_phrases_dict({u'借还款': [[u'jie4'], [u'huan2'], [u'kuan3']]})
+        load_phrases_dict({u'时间为': [[u'shi2'], [u'jian1'], [u'wei2']]})
+        load_phrases_dict({u'为准': [[u'wei2'], [u'zhun3']]})
+        load_phrases_dict({u'色差': [[u'se4'], [u'cha1']]})
+
+        # adjust the preferred pinyin readings for the character 地
+        load_single_dict({ord(u'地'): u'de,di4'})
+
     def _get_initials_finals(self, word: str) -> List[List[str]]:
         initials = []
         finals = []
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index bfa7d2b1..ea518913 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -63,7 +63,10 @@ def replace_time(match) -> str:
 
     result = f"{num2str(hour)}点"
     if minute.lstrip('0'):
-        result += f"{_time_num2str(minute)}分"
+        if int(minute) == 30:
+            result += "半"
+        else:
+            result += f"{_time_num2str(minute)}分"
 
     if second and second.lstrip('0'):
         result += f"{_time_num2str(second)}秒"
@@ -71,7 +74,10 @@
         result += "至"
         result += f"{num2str(hour_2)}点"
         if minute_2.lstrip('0'):
-            result += f"{_time_num2str(minute_2)}分"
+            if int(minute_2) == 30:
+                result += "半"
+            else:
+                result += f"{_time_num2str(minute_2)}分"
 
         if second_2 and second_2.lstrip('0'):
             result += f"{_time_num2str(second_2)}秒"
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
index 27a2f846..a83b42a4 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -28,7 +28,7 @@ UNITS = OrderedDict({
     8: '亿',
 })
 
-COM_QUANTIFIERS = 
'(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') @@ -110,7 +110,7 @@ def replace_default_num(match): # 纯小数 RE_DECIMAL_NUM = re.compile(r'(-?)((\d+)(\.\d+))' r'|(\.(\d+))') # 正整数 + 量词 -RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几])?" + COM_QUANTIFIERS) +RE_POSITIVE_QUANTIFIERS = re.compile(r"(\d+)([多余几\+])?" + COM_QUANTIFIERS) RE_NUMBER = re.compile(r'(-?)((\d+)(\.\d+)?)' r'|(\.(\d+))') @@ -123,6 +123,8 @@ def replace_positive_quantifier(match) -> str: """ number = match.group(1) match_2 = match.group(2) + if match_2 == "+": + match_2 = "多" match_2: str = match_2 if match_2 else "" quantifiers: str = match.group(3) number: str = num2str(number) @@ -151,6 +153,7 @@ def replace_number(match) -> str: # 范围表达式 # match.group(1) and match.group(8) are copy from RE_NUMBER + RE_RANGE = re.compile( r'((-?)((\d+)(\.\d+)?)|(\.(\d+)))[-~]((-?)((\d+)(\.\d+)?)|(\.(\d+)))') diff --git a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py index f9d1b8cb..bc663c70 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py +++ b/paddlespeech/t2s/frontend/zh_normalization/text_normlization.py @@ -63,11 +63,19 @@ class TextNormalizer(): # Only for pure Chinese here if lang == "zh": text = text.replace(" ", "") + # 过滤掉特殊字符 + text = re.sub(r'[《》【】<=>{}()()#&@“”^_|…\\]', '', text) text = self.SENTENCE_SPLITOR.sub(r'\1\n', text) text = text.strip() sentences = [sentence.strip() for sentence in re.split(r'\n+', text)] return sentences + def _post_replace(self, sentence: str) -> str: + sentence = sentence.replace('/', '每') + sentence = sentence.replace('~', '至') + + return sentence + def normalize_sentence(self, sentence: str) -> str: # basic character conversions sentence = tranditional_to_simplified(sentence) @@ -97,6 +105,7 @@ class TextNormalizer(): sentence) sentence = RE_DEFAULT_NUM.sub(replace_default_num, sentence) sentence = RE_NUMBER.sub(replace_number, sentence) + sentence = self._post_replace(sentence) return sentence diff --git a/paddlespeech/t2s/models/melgan/melgan.py b/paddlespeech/t2s/models/melgan/melgan.py index 6a139659..22d8fd9e 100644 --- a/paddlespeech/t2s/models/melgan/melgan.py +++ b/paddlespeech/t2s/models/melgan/melgan.py @@ -66,7 +66,7 @@ class MelGANGenerator(nn.Layer): nonlinear_activation_params (Dict[str, Any], optional): Parameters passed to the linear activation in the upsample network, by default {} pad (str): Padding function module name before dilated convolution layer. - pad_params (dict): Hyperparameters for padding function. + pad_params (dict): Hyperparameters for padding function. 
+            pad_params (dict): Hyperparameters for padding function.
             use_final_nonlinear_activation (nn.Layer): Activation function for the final layer.
             use_weight_norm (bool): Whether to use weight norm.
                 If set to true, it will be applied to all of the conv layers.
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
index 42e8f743..44ccfc60 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@@ -247,7 +247,7 @@ class SpeedySpeechInference(nn.Layer):
         self.normalizer = normalizer
         self.acoustic_model = speedyspeech_model
 
-    def forward(self, phones, tones, durations=None, spk_id=None):
+    def forward(self, phones, tones, spk_id=None, durations=None):
         normalized_mel = self.acoustic_model.inference(
             phones, tones, durations=durations, spk_id=spk_id)
         logmel = self.normalizer.inverse(normalized_mel)
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
index 1320ffa3..95907043 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -509,16 +509,20 @@ class WaveRNN(nn.Layer):
         total_len = num_folds * (target + overlap) + overlap
 
         # Need some silence for the run warmup
-        slience_len = overlap // 2
+        slience_len = 0
+        linear_len = slience_len
         fade_len = overlap - slience_len
         slience = paddle.zeros([slience_len], dtype=paddle.float32)
-        linear = paddle.ones([fade_len], dtype=paddle.float32)
+        linear = paddle.ones([linear_len], dtype=paddle.float32)
 
         # Equal power crossfade
         # fade_in increase from 0 to 1, fade_out reduces from 1 to 0
-        t = paddle.linspace(-1, 1, fade_len, dtype=paddle.float32)
-        fade_in = paddle.sqrt(0.5 * (1 + t))
-        fade_out = paddle.sqrt(0.5 * (1 - t))
+        sigmoid_scale = 2.3
+        t = paddle.linspace(
+            -sigmoid_scale, sigmoid_scale, fade_len, dtype=paddle.float32)
+        # a sigmoid curve should give a smoother crossfade here
+        fade_in = paddle.nn.functional.sigmoid(t)
+        fade_out = 1 - paddle.nn.functional.sigmoid(t)
         # Concat the silence to the fades
         fade_out = paddle.concat([linear, fade_out])
         fade_in = paddle.concat([slience, fade_in])
diff --git a/paddlespeech/t2s/modules/transformer/repeat.py b/paddlespeech/t2s/modules/transformer/repeat.py
index 2073a78b..1e946adf 100644
--- a/paddlespeech/t2s/modules/transformer/repeat.py
+++ b/paddlespeech/t2s/modules/transformer/repeat.py
@@ -36,4 +36,4 @@ def repeat(N, fn):
 
     Returns:
         MultiSequential: Repeated model instance.
""" - return MultiSequential(*[fn(n) for n in range(N)]) + return MultiSequential(* [fn(n) for n in range(N)]) diff --git a/setup.py b/setup.py index 3f3632b3..f86758ba 100644 --- a/setup.py +++ b/setup.py @@ -48,6 +48,7 @@ base = [ "paddlespeech_feat", "praatio==5.0.0", "pypinyin", + "pypinyin-dict", "python-dateutil", "pyworld", "resampy==0.2.2", @@ -62,6 +63,7 @@ base = [ "visualdl", "webrtcvad", "yacs~=0.1.8", + "prettytable", ] server = [ diff --git a/tests/test_tipc/configs/conformer/train_benchmark.txt b/tests/test_tipc/configs/conformer/train_infer_python.txt similarity index 91% rename from tests/test_tipc/configs/conformer/train_benchmark.txt rename to tests/test_tipc/configs/conformer/train_infer_python.txt index 3833f144..33b1debd 100644 --- a/tests/test_tipc/configs/conformer/train_benchmark.txt +++ b/tests/test_tipc/configs/conformer/train_infer_python.txt @@ -54,4 +54,4 @@ batch_size:16|30 fp_items:fp32 iteration:50 --profiler-options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" -flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +flags:null diff --git a/tests/test_tipc/configs/pwgan/train_benchmark.txt b/tests/test_tipc/configs/pwgan/train_infer_python.txt similarity index 91% rename from tests/test_tipc/configs/pwgan/train_benchmark.txt rename to tests/test_tipc/configs/pwgan/train_infer_python.txt index e936da3c..c64984dc 100644 --- a/tests/test_tipc/configs/pwgan/train_benchmark.txt +++ b/tests/test_tipc/configs/pwgan/train_infer_python.txt @@ -54,4 +54,4 @@ batch_size:6|16 fp_items:fp32 iteration:50 --profiler_options:"batch_range=[10,35];state=GPU;tracer_option=Default;profile_path=model.profile" -flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096 +flags:null diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index 0280e5d4..b46b2032 100644 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -26,15 +26,19 @@ if [ ${MODE} = "benchmark_train" ];then curPath=$(readlink -f "$(dirname "$0")") echo "curPath:"${curPath} cd ${curPath}/../.. - pip install . + apt-get install libsndfile1 + pip install pytest-runner kaldiio setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple + pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple cd - if [ ${model_name} == "conformer" ]; then # set the URL for aishell_tiny dataset - URL='None' + URL=${conformer_data_URL:-"None"} echo "URL:"${URL} if [ ${URL} == 'None' ];then echo "please contact author to get the URL.\n" exit + else + wget -P ${curPath}/../../dataset/aishell/ ${URL} fi sed -i "s#^URL_ROOT_TAG#URL_ROOT = '${URL}'#g" ${curPath}/conformer/scripts/aishell_tiny.py cp ${curPath}/conformer/scripts/aishell_tiny.py ${curPath}/../../dataset/aishell/ @@ -42,6 +46,7 @@ if [ ${MODE} = "benchmark_train" ];then source path.sh # download audio data sed -i "s#aishell.py#aishell_tiny.py#g" ./local/data.sh + sed -i "s#python3#python#g" ./local/data.sh bash ./local/data.sh || exit -1 if [ $? 
-ne 0 ]; then exit 1 @@ -56,7 +61,6 @@ if [ ${MODE} = "benchmark_train" ];then sed -i "s#conf/#test_tipc/conformer/benchmark_train/conf/#g" ${curPath}/conformer/benchmark_train/conf/conformer.yaml sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/tuning/decode.yaml sed -i "s#data/#test_tipc/conformer/benchmark_train/data/#g" ${curPath}/conformer/benchmark_train/conf/preprocess.yaml - fi if [ ${model_name} == "pwgan" ]; then @@ -73,4 +77,4 @@ if [ ${MODE} = "benchmark_train" ];then python ../paddlespeech/t2s/exps/gan_vocoder/normalize.py --metadata=dump/test/raw/metadata.jsonl --dumpdir=dump/test/norm --stats=dump/train/feats_stats.npy fi -fi \ No newline at end of file +fi diff --git a/tests/unit/asr/deepspeech2_online_model_test.py b/tests/unit/asr/deepspeech2_online_model_test.py index f623c5ac..f23c4926 100644 --- a/tests/unit/asr/deepspeech2_online_model_test.py +++ b/tests/unit/asr/deepspeech2_online_model_test.py @@ -11,11 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os +import pickle import unittest import numpy as np import paddle +from paddle import inference +from paddlespeech.s2t.models.ds2_online import DeepSpeech2InferModelOnline from paddlespeech.s2t.models.ds2_online import DeepSpeech2ModelOnline @@ -182,5 +186,77 @@ class TestDeepSpeech2ModelOnline(unittest.TestCase): paddle.allclose(final_state_c_box, final_state_c_box_chk), True) +class TestDeepSpeech2StaticModelOnline(unittest.TestCase): + def setUp(self): + export_prefix = "exp/deepspeech2_online/checkpoints/test_export" + if not os.path.exists(os.path.dirname(export_prefix)): + os.makedirs(os.path.dirname(export_prefix), mode=0o755) + infer_model = DeepSpeech2InferModelOnline( + feat_size=161, + dict_size=4233, + num_conv_layers=2, + num_rnn_layers=5, + rnn_size=1024, + num_fc_layers=0, + fc_layers_size_list=[-1], + use_gru=False) + static_model = infer_model.export() + paddle.jit.save(static_model, export_prefix) + + with open("test_data/static_ds2online_inputs.pickle", "rb") as f: + self.data_dict = pickle.load(f) + + self.setup_model(export_prefix) + + def setup_model(self, export_prefix): + deepspeech_config = inference.Config(export_prefix + ".pdmodel", + export_prefix + ".pdiparams") + if ('CUDA_VISIBLE_DEVICES' in os.environ.keys() and + os.environ['CUDA_VISIBLE_DEVICES'].strip() != ''): + deepspeech_config.enable_use_gpu(100, 0) + deepspeech_config.enable_memory_optim() + deepspeech_predictor = inference.create_predictor(deepspeech_config) + self.predictor = deepspeech_predictor + + def test_unit(self): + input_names = self.predictor.get_input_names() + audio_handle = self.predictor.get_input_handle(input_names[0]) + audio_len_handle = self.predictor.get_input_handle(input_names[1]) + h_box_handle = self.predictor.get_input_handle(input_names[2]) + c_box_handle = self.predictor.get_input_handle(input_names[3]) + + x_chunk = self.data_dict["audio_chunk"] + x_chunk_lens = self.data_dict["audio_chunk_lens"] + chunk_state_h_box = self.data_dict["chunk_state_h_box"] + chunk_state_c_box = self.data_dict["chunk_state_c_bos"] + + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(chunk_state_h_box.shape) + h_box_handle.copy_from_cpu(chunk_state_h_box) + + 
c_box_handle.reshape(chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(chunk_state_c_box) + + output_names = self.predictor.get_output_names() + output_handle = self.predictor.get_output_handle(output_names[0]) + output_lens_handle = self.predictor.get_output_handle(output_names[1]) + output_state_h_handle = self.predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.predictor.get_output_handle( + output_names[3]) + self.predictor.run() + + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + chunk_state_h_box = output_state_h_handle.copy_to_cpu() + chunk_state_c_box = output_state_c_handle.copy_to_cpu() + return True + + if __name__ == '__main__': unittest.main() diff --git a/tests/unit/asr/deepspeech2_online_model_test.sh b/tests/unit/asr/deepspeech2_online_model_test.sh new file mode 100644 index 00000000..629238fd --- /dev/null +++ b/tests/unit/asr/deepspeech2_online_model_test.sh @@ -0,0 +1,3 @@ +mkdir -p ./test_data +wget -P ./test_data https://paddlespeech.bj.bcebos.com/datasets/unit_test/asr/static_ds2online_inputs.pickle +python deepspeech2_online_model_test.py diff --git a/tests/unit/server/change_yaml.py b/tests/unit/server/change_yaml.py new file mode 100644 index 00000000..1f063d8f --- /dev/null +++ b/tests/unit/server/change_yaml.py @@ -0,0 +1,105 @@ +#!/usr/bin/python +import argparse +import os + +import yaml + + +def change_device(yamlfile: str, engine: str, device: str): + """Change the settings of the device under the voice task configuration file + + Args: + yaml_name (str): asr or asr_pd or tts or tts_pd + cpu (bool): True means set device to "cpu" + model_type (dict): change model type + """ + tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" + os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + + if device == 'cpu': + set_device = 'cpu' + elif device == 'gpu': + set_device = 'gpu:0' + else: + print("Please set correct device: cpu or gpu.") + + with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: + y = yaml.safe_load(f) + if engine == 'asr_python' or engine == 'tts_python': + y[engine]['device'] = set_device + elif engine == 'asr_inference': + y[engine]['am_predictor_conf']['device'] = set_device + elif engine == 'tts_inference': + y[engine]['am_predictor_conf']['device'] = set_device + y[engine]['voc_predictor_conf']['device'] = set_device + else: + print( + "Please set correct engine: asr_python, tts_python, asr_inference, tts_inference." + ) + + print(yaml.dump(y, default_flow_style=False, sort_keys=False)) + yaml.dump(y, fw, allow_unicode=True) + os.system("rm %s" % (tmp_yamlfile)) + print("Change %s successfully." % (yamlfile)) + + +def change_engine_type(yamlfile: str, engine_type): + """Change the engine type and corresponding configuration file of the speech task in application.yaml + + Args: + task (str): asr or tts + """ + tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml" + os.system("cp %s %s" % (yamlfile, tmp_yamlfile)) + speech_task = engine_type.split("_")[0] + + with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw: + y = yaml.safe_load(f) + engine_list = y['engine_list'] + for engine in engine_list: + if speech_task in engine: + engine_list.remove(engine) + engine_list.append(engine_type) + y['engine_list'] = engine_list + print(yaml.dump(y, default_flow_style=False, sort_keys=False)) + yaml.dump(y, fw, allow_unicode=True) + os.system("rm %s" % (tmp_yamlfile)) + print("Change %s successfully." 
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--config_file',
+        type=str,
+        default='./conf/application.yaml',
+        help='server yaml file.')
+    parser.add_argument(
+        '--change_task',
+        type=str,
+        default=None,
+        help='Change task',
+        choices=[
+            'enginetype-asr_python',
+            'enginetype-asr_inference',
+            'enginetype-tts_python',
+            'enginetype-tts_inference',
+            'device-asr_python-cpu',
+            'device-asr_python-gpu',
+            'device-asr_inference-cpu',
+            'device-asr_inference-gpu',
+            'device-tts_python-cpu',
+            'device-tts_python-gpu',
+            'device-tts_inference-cpu',
+            'device-tts_inference-gpu',
+        ],
+        required=True)
+    args = parser.parse_args()
+
+    types = args.change_task.split("-")
+    if types[0] == "enginetype":
+        change_engine_type(args.config_file, types[1])
+    elif types[0] == "device":
+        change_device(args.config_file, types[1], types[2])
+    else:
+        print("Invalid change task, please check the --change_task argument.")
diff --git a/tests/unit/server/conf/application.yaml b/tests/unit/server/conf/application.yaml
new file mode 100644
index 00000000..6048450b
--- /dev/null
+++ b/tests/unit/server/conf/application.yaml
@@ -0,0 +1,107 @@
+# This is the parameter configuration file for PaddleSpeech Serving.
+
+#################################################################################
+#                             SERVER SETTING                                    #
+#################################################################################
+host: 127.0.0.1
+port: 8090
+
+# The task format in the engine_list is: <speech task>_<engine type>
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
+
+engine_list: ['asr_python', 'tts_python']
+
+
+#################################################################################
+#                               ENGINE CONFIG                                   #
+#################################################################################
+################### speech task: asr; engine_type: python #######################
+asr_python:
+  model: 'conformer_wenetspeech'
+  lang: 'zh'
+  sample_rate: 16000
+  cfg_path: # [optional]
+  ckpt_path: # [optional]
+  decode_method: 'attention_rescoring'
+  force_yes: True
+  device: # set 'gpu:id' or 'cpu'
+
+
+################### speech task: asr; engine_type: inference #######################
+asr_inference:
+  # model_type choices=['deepspeech2offline_aishell']
+  model_type: 'deepspeech2offline_aishell'
+  am_model: # the pdmodel file of am static model [optional]
+  am_params: # the pdiparams file of am static model [optional]
+  lang: 'zh'
+  sample_rate: 16000
+  cfg_path:
+  decode_method:
+  force_yes: True
+
+  am_predictor_conf:
+    device: # set 'gpu:id' or 'cpu'
+    switch_ir_optim: True
+    glog_info: False # True -> print glog
+    summary: True # False -> do not show predictor config
+
+
+################### speech task: tts; engine_type: python #######################
+tts_python:
+  # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
+  #                              'fastspeech2_ljspeech', 'fastspeech2_aishell3',
+  #                              'fastspeech2_vctk']
+  am: 'fastspeech2_csmsc'
+  am_config:
+  am_ckpt:
+  am_stat:
+  phones_dict:
+  tones_dict:
+  speaker_dict:
+  spk_id: 0
+
+  # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
+  #                        'pwgan_vctk', 'mb_melgan_csmsc']
+  voc: 'pwgan_csmsc'
+  voc_config:
+  voc_ckpt:
+  voc_stat:
+
+  # others
+  lang: 'zh'
+  device: # set 'gpu:id' or 'cpu'
+
+
+################### speech task: tts; engine_type: inference #######################
+tts_inference:
+  # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
+  am: 'fastspeech2_csmsc'
+  am_model: # the pdmodel file of your am static model (XX.pdmodel)
+  am_params: # the pdiparams file of your am static model (XX.pdiparams)
+  am_sample_rate: 24000
+  phones_dict:
+  tones_dict:
+  speaker_dict:
+  spk_id: 0
+
+  am_predictor_conf:
+    device: # set 'gpu:id' or 'cpu'
+    switch_ir_optim: True
+    glog_info: False # True -> print glog
+    summary: True # False -> do not show predictor config
+
+  # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
+  voc: 'pwgan_csmsc'
+  voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
+  voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
+  voc_sample_rate: 24000
+
+  voc_predictor_conf:
+    device: # set 'gpu:id' or 'cpu'
+    switch_ir_optim: True
+    glog_info: False # True -> print glog
+    summary: True # False -> do not show predictor config
+
+  # others
+  lang: 'zh'
+
diff --git a/tests/unit/server/test_server_client.sh b/tests/unit/server/test_server_client.sh
new file mode 100644
index 00000000..b48e7111
--- /dev/null
+++ b/tests/unit/server/test_server_client.sh
@@ -0,0 +1,187 @@
+#!/bin/bash
+# bash test_server_client.sh
+
+StartService(){
+    # start the service
+    paddlespeech_server start --config_file $config_file 1>>log/server.log 2>>log/server.log.wf &
+    echo $! > pid
+
+    start_num=$(cat log/server.log.wf | grep "INFO: Uvicorn running on http://" -c)
+    flag="normal"
+    while [[ $start_num -lt $target_start_num && $flag == "normal" ]]
+    do
+        start_num=$(cat log/server.log.wf | grep "INFO: Uvicorn running on http://" -c)
+        # the service failed to start
+        if [ $(cat log/server.log.wf | grep -i "error" -c) -gt $error_time ];then
+            echo "Service failed to start." | tee -a ./log/test_result.log
+            error_time=$(cat log/server.log.wf | grep -i "error" -c)
+            flag="abnormal"
+        fi
+        sleep 1
+    done
+}
+
+ClientTest(){
+    # client tests
+    # test the asr client
+    paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav
+    ((test_times+=1))
+    paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav
+    ((test_times+=1))
+
+    # test the tts client
+    paddlespeech_client tts --server_ip $server_ip --port $port --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
+    ((test_times+=1))
+    paddlespeech_client tts --server_ip $server_ip --port $port --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
+    ((test_times+=1))
+}
+
+GetTestResult() {
+    # determine whether the test round succeeded
+    response_success_time=$(cat log/server.log | grep "200 OK" -c)
+    if (( $response_success_time == $test_times )) ; then
+        echo "Testing succeeded. The service configuration is: asr engine type: $1; tts engine type: $1; device: $2." | tee -a ./log/test_result.log
+    else
+        echo "Testing failed. The service configuration is: asr engine type: $1; tts engine type: $1; device: $2." | tee -a ./log/test_result.log
+    fi
+    test_times=$response_success_time
+}
+
+
+mkdir -p log
+rm -rf log/server.log.wf
+rm -rf log/server.log
+rm -rf log/test_result.log
+
+config_file=./conf/application.yaml
+server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
+port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
+
+echo "Service ip: $server_ip" | tee ./log/test_result.log
+echo "Service port: $port" | tee -a ./log/test_result.log
+
+# check whether a process is already listening on $port
+pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
+if [ "$pid" != "" ]; then
+    echo "The port: $port is occupied, please use another port."
+    exit 1
+fi
+
+# download test audios for the ASR client
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+
+
+target_start_num=0  # the expected number of successful service starts
+test_times=0        # the number of client test requests sent
+error_time=0        # the number of errors seen in server.log.wf on startup failure
+
+# start server: asr engine type: python; tts engine type: python; device: gpu
+echo "Start the service: asr engine type: python; tts engine type: python; device: gpu" | tee -a ./log/test_result.log
+((target_start_num+=1))
+StartService
+
+if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then
+    echo "Service started successfully." | tee -a ./log/test_result.log
+    ClientTest
+    echo "This round of testing is over." | tee -a ./log/test_result.log
+
+    GetTestResult python gpu
+else
+    echo "Service failed to start, no client test."
+    target_start_num=$start_num
+
+fi
+
+kill -9 `cat pid`
+rm -rf pid
+sleep 2s
+echo "**************************************************************************************" | tee -a ./log/test_result.log
+
+
+
+# start server: asr engine type: python; tts engine type: python; device: cpu
+python change_yaml.py --change_task device-asr_python-cpu  # set asr_python device to cpu in application.yaml
+python change_yaml.py --change_task device-tts_python-cpu  # set tts_python device to cpu in application.yaml
+
+echo "Start the service: asr engine type: python; tts engine type: python; device: cpu" | tee -a ./log/test_result.log
+((target_start_num+=1))
+StartService
+
+if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then
+    echo "Service started successfully." | tee -a ./log/test_result.log
+    ClientTest
+    echo "This round of testing is over." | tee -a ./log/test_result.log
+
+    GetTestResult python cpu
+else
+    echo "Service failed to start, no client test."
+    target_start_num=$start_num
+
+fi
+
+kill -9 `cat pid`
+rm -rf pid
+sleep 2s
+echo "**************************************************************************************" | tee -a ./log/test_result.log
+
+
+# start server: asr engine type: inference; tts engine type: inference; device: gpu
+python change_yaml.py --change_task enginetype-asr_inference  # switch the asr engine_type to inference in application.yaml
+python change_yaml.py --change_task enginetype-tts_inference  # switch the tts engine_type to inference in application.yaml
+
+echo "Start the service: asr engine type: inference; tts engine type: inference; device: gpu" | tee -a ./log/test_result.log
+((target_start_num+=1))
+StartService
+
+if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then
+    echo "Service started successfully." | tee -a ./log/test_result.log
+    ClientTest
+    echo "This round of testing is over." | tee -a ./log/test_result.log
+
+    GetTestResult inference gpu
+else
+    echo "Service failed to start, no client test."
+    target_start_num=$start_num
+
+fi
+
+kill -9 `cat pid`
+rm -rf pid
+sleep 2s
+echo "**************************************************************************************" | tee -a ./log/test_result.log
+
+
+# start server: asr engine type: inference; tts engine type: inference; device: cpu
+python change_yaml.py --change_task device-asr_inference-cpu  # set asr_inference device to cpu in application.yaml
+python change_yaml.py --change_task device-tts_inference-cpu  # set tts_inference device to cpu in application.yaml
+
+echo "Start the service: asr engine type: inference; tts engine type: inference; device: cpu" | tee -a ./log/test_result.log
+((target_start_num+=1))
+StartService
+
+if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then
+    echo "Service started successfully." | tee -a ./log/test_result.log
+    ClientTest
+    echo "This round of testing is over." | tee -a ./log/test_result.log
+
+    GetTestResult inference cpu
+else
+    echo "Service failed to start, no client test."
+    target_start_num=$start_num
+
+fi
+
+kill -9 `cat pid`
+rm -rf pid
+sleep 2s
+echo "**************************************************************************************" | tee -a ./log/test_result.log
+
+echo "All tests completed." | tee -a ./log/test_result.log
+
+# show all the test results
+echo "***************** Here are all the test results ********************"
+cat ./log/test_result.log
+
+# restore conf to be the same as demos/speech_server
+rm -rf ./conf
+cp ../../../demos/speech_server/conf/ ./ -rf
\ No newline at end of file