diff --git a/README.md b/README.md
index 9a2fe2aa..46730797 100644
--- a/README.md
+++ b/README.md
@@ -344,9 +344,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
FastSpeech2 |
- AISHELL-3 / VCTK / LJSpeech / CSMSC |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 |
- fastspeech2-aishell3 / fastspeech2-vctk / fastspeech2-ljspeech / fastspeech2-csmsc
+ fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3
|
@@ -359,9 +359,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Parallel WaveGAN |
- LJSpeech / VCTK / CSMSC |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 |
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
|
diff --git a/README_cn.md b/README_cn.md
index 409b7a25..9782240a 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -302,7 +302,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
语音合成模块类型 |
- 模型种类 |
+ 模型种类 |
数据集 |
链接 |
@@ -339,9 +339,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
FastSpeech2 |
- AISHELL-3 / VCTK / LJSpeech / CSMSC |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 |
- fastspeech2-aishell3 / fastspeech2-vctk / fastspeech2-ljspeech / fastspeech2-csmsc
+ fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3
|
@@ -354,9 +354,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
Parallel WaveGAN |
- LJSpeech / VCTK / CSMSC |
+ LJSpeech / VCTK / CSMSC / AISHELL-3 |
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
|
diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md
new file mode 100644
index 00000000..39007f6c
--- /dev/null
+++ b/demos/speech_server/README.md
@@ -0,0 +1,224 @@
+([简体中文](./README_cn.md)|English)
+
+# Speech Server
+
+## Introduction
+This demo is an implementation of starting the voice service and accessing the service. It can be achieved with a single command using `paddlespeech_server` and `paddlespeech_client` or a few lines of code in python.
+
+
+## Usage
+### 1. Installation
+see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+You can choose one way from easy, medium and hard to install paddlespeech.
+
+### 2. Prepare config File
+The configuration file contains the service-related configuration files and the model configuration related to the voice tasks contained in the service. They are all under the `conf` folder.
+
+The input of ASR client demo should be a WAV file(`.wav`), and the sample rate must be the same as the model.
+
+Here are sample files for this ASR client demo that can be downloaded:
+```bash
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+```
+
+### 3. Server Usage
+- Command Line (Recommended)
+
+ ```bash
+ # start the service
+ paddlespeech_server start --config_file ./conf/application.yaml
+ ```
+
+ Usage:
+
+ ```bash
+ paddlespeech_server start --help
+ ```
+ Arguments:
+    - `config_file`: yaml file of the app, default: ./conf/application.yaml
+ - `log_file`: log file. Default: ./log/paddlespeech.log
+
+ Output:
+ ```bash
+ [2022-02-23 11:17:32] [INFO] [server.py:64] Started server process [6384]
+ INFO: Waiting for application startup.
+ [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup.
+ INFO: Application startup complete.
+ [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+ [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+
+ ```
+
+- Python API
+ ```python
+ from paddlespeech.server.bin.paddlespeech_server import ServerExecutor
+
+ server_executor = ServerExecutor()
+ server_executor(
+ config_file="./conf/application.yaml",
+ log_file="./log/paddlespeech.log")
+ ```
+
+ Output:
+ ```bash
+ INFO: Started server process [529]
+ [2022-02-23 14:57:56] [INFO] [server.py:64] Started server process [529]
+ INFO: Waiting for application startup.
+ [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup.
+ INFO: Application startup complete.
+ [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+ [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+
+ ```
+
+
+### 4. ASR Client Usage
+- Command Line (Recommended)
+ ```
+ paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+ ```
+
+ Usage:
+
+ ```bash
+ paddlespeech_client asr --help
+ ```
+ Arguments:
+ - `server_ip`: server ip. Default: 127.0.0.1
+ - `port`: server port. Default: 8090
+ - `input`(required): Audio file to be recognized.
+    - `sample_rate`: Audio sampling rate, default: 16000.
+ - `lang`: Language. Default: "zh_cn".
+ - `audio_format`: Audio format. Default: "wav".
+
+ Output:
+ ```bash
+ [2022-02-23 18:11:22,819] [ INFO] - {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}}
+ [2022-02-23 18:11:22,820] [ INFO] - time cost 0.689145 s.
+
+ ```
+
+- Python API
+ ```python
+ from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor
+
+ asrclient_executor = ASRClientExecutor()
+ asrclient_executor(
+ input="./zh.wav",
+ server_ip="127.0.0.1",
+ port=8090,
+ sample_rate=16000,
+ lang="zh_cn",
+ audio_format="wav")
+ ```
+
+ Output:
+ ```bash
+ {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}}
+ time cost 0.604353 s.
+ ```
+
+### 5. TTS Client Usage
+- Command Line (Recommended)
+ ```bash
+ paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
+ ```
+ Usage:
+
+ ```bash
+ paddlespeech_client tts --help
+ ```
+ Arguments:
+ - `server_ip`: server ip. Default: 127.0.0.1
+ - `port`: server port. Default: 8090
+ - `input`(required): Input text to generate.
+ - `spk_id`: Speaker id for multi-speaker text to speech. Default: 0
+ - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0
+ - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0
+ - `sample_rate`: Sampling rate, choice: [0, 8000, 16000], the default is the same as the model. Default: 0
+ - `output`: Output wave filepath. Default: `output.wav`.
+
+ Output:
+ ```bash
+ [2022-02-23 15:20:37,875] [ INFO] - {'description': 'success.'}
+ [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav.
+ [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s.
+ [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s.
+ [2022-02-23 15:20:37,875] [ INFO] - RTF: 0.096346
+
+
+ ```
+
+- Python API
+ ```python
+ from paddlespeech.server.bin.paddlespeech_client import TTSClientExecutor
+
+ ttsclient_executor = TTSClientExecutor()
+ ttsclient_executor(
+ input="您好,欢迎使用百度飞桨语音合成服务。",
+ server_ip="127.0.0.1",
+ port=8090,
+ spk_id=0,
+ speed=1.0,
+ volume=1.0,
+ sample_rate=0,
+ output="./output.wav")
+ ```
+
+ Output:
+ ```bash
+ {'description': 'success.'}
+ Save synthesized audio successfully on ./output.wav.
+ Audio duration: 3.612500 s.
+ Response time: 0.388317 s.
+ RTF: 0.107493
+
+ ```
+
+
+## Pretrained Models
+### ASR model
+Here is a list of [ASR pretrained models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README.md#4pretrained-models) released by PaddleSpeech, both command line and python interfaces are available:
+
+| Model | Language | Sample Rate
+| :--- | :---: | :---: |
+| conformer_wenetspeech| zh| 16000
+| transformer_librispeech| en| 16000
+
+### TTS model
+Here is a list of [TTS pretrained models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/text_to_speech/README.md#4-pretrained-models) released by PaddleSpeech, both command line and python interfaces are available:
+
+- Acoustic model
+ | Model | Language
+ | :--- | :---: |
+ | speedyspeech_csmsc| zh
+ | fastspeech2_csmsc| zh
+ | fastspeech2_aishell3| zh
+ | fastspeech2_ljspeech| en
+ | fastspeech2_vctk| en
+
+- Vocoder
+ | Model | Language
+ | :--- | :---: |
+ | pwgan_csmsc| zh
+ | pwgan_aishell3| zh
+ | pwgan_ljspeech| en
+ | pwgan_vctk| en
+ | mb_melgan_csmsc| zh
+
+Here is a list of **TTS pretrained static models** released by PaddleSpeech, both command line and python interfaces are available:
+- Acoustic model
+ | Model | Language
+ | :--- | :---: |
+ | speedyspeech_csmsc| zh
+ | fastspeech2_csmsc| zh
+
+- Vocoder
+ | Model | Language
+ | :--- | :---: |
+ | pwgan_csmsc| zh
+ | mb_melgan_csmsc| zh
+ | hifigan_csmsc| zh
diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md
new file mode 100644
index 00000000..f5666070
--- /dev/null
+++ b/demos/speech_server/README_cn.md
@@ -0,0 +1,222 @@
+(简体中文|[English](./README.md))
+
+# 语音服务
+
+## 介绍
+这个demo是一个启动语音服务和访问服务的实现。 它可以通过使用`paddlespeech_server` 和 `paddlespeech_client`的单个命令或 python 的几行代码来实现。
+
+
+## 使用方法
+### 1. 安装
+请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+你可以从 easy,medium,hard 三种方式中选择一种方式安装 PaddleSpeech。
+
+### 2. 准备配置文件
+配置文件包含服务相关的配置文件和服务中包含的语音任务相关的模型配置。 它们都在 `conf` 文件夹下。
+
+这个 ASR client 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
+
+可以下载此 ASR client的示例音频:
+```bash
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+```
+
+### 3. 服务端使用方法
+- 命令行 (推荐使用)
+
+ ```bash
+ # 启动服务
+ paddlespeech_server start --config_file ./conf/application.yaml
+ ```
+
+ 使用方法:
+
+ ```bash
+ paddlespeech_server start --help
+ ```
+ 参数:
+ - `config_file`: 服务的配置文件,默认: ./conf/application.yaml
+ - `log_file`: log 文件. 默认:./log/paddlespeech.log
+
+ 输出:
+ ```bash
+ [2022-02-23 11:17:32] [INFO] [server.py:64] Started server process [6384]
+ INFO: Waiting for application startup.
+ [2022-02-23 11:17:32] [INFO] [on.py:26] Waiting for application startup.
+ INFO: Application startup complete.
+ [2022-02-23 11:17:32] [INFO] [on.py:38] Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+ [2022-02-23 11:17:32] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+
+ ```
+
+- Python API
+ ```python
+ from paddlespeech.server.bin.paddlespeech_server import ServerExecutor
+
+ server_executor = ServerExecutor()
+ server_executor(
+ config_file="./conf/application.yaml",
+ log_file="./log/paddlespeech.log")
+ ```
+
+ 输出:
+ ```bash
+ INFO: Started server process [529]
+ [2022-02-23 14:57:56] [INFO] [server.py:64] Started server process [529]
+ INFO: Waiting for application startup.
+ [2022-02-23 14:57:56] [INFO] [on.py:26] Waiting for application startup.
+ INFO: Application startup complete.
+ [2022-02-23 14:57:56] [INFO] [on.py:38] Application startup complete.
+ INFO: Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+ [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
+
+ ```
+
+### 4. ASR客户端使用方法
+- 命令行 (推荐使用)
+ ```
+ paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+ ```
+
+ 使用帮助:
+
+ ```bash
+ paddlespeech_client asr --help
+ ```
+
+ 参数:
+ - `server_ip`: 服务端ip地址,默认: 127.0.0.1。
+ - `port`: 服务端口,默认: 8090。
+ - `input`(必须输入): 用于识别的音频文件。
+ - `sample_rate`: 音频采样率,默认值:16000。
+ - `lang`: 模型语言,默认值:zh_cn。
+ - `audio_format`: 音频格式,默认值:wav。
+
+ 输出:
+
+ ```bash
+ [2022-02-23 18:11:22,819] [ INFO] - {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}}
+ [2022-02-23 18:11:22,820] [ INFO] - time cost 0.689145 s.
+ ```
+
+- Python API
+ ```python
+ from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor
+
+ asrclient_executor = ASRClientExecutor()
+ asrclient_executor(
+ input="./zh.wav",
+ server_ip="127.0.0.1",
+ port=8090,
+ sample_rate=16000,
+ lang="zh_cn",
+ audio_format="wav")
+ ```
+
+ 输出:
+ ```bash
+ {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}}
+ time cost 0.604353 s.
+
+ ```
+
+### 5. TTS客户端使用方法
+ ```bash
+ paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
+ ```
+ 使用帮助:
+
+ ```bash
+ paddlespeech_client tts --help
+ ```
+
+ 参数:
+ - `server_ip`: 服务端ip地址,默认: 127.0.0.1。
+ - `port`: 服务端口,默认: 8090。
+ - `input`(必须输入): 待合成的文本。
+ - `spk_id`: 说话人 id,用于多说话人语音合成,默认值: 0。
+ - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0
+ - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0
+ - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认与模型相同。 默认值:0
+ - `output`: 输出音频的路径, 默认值:output.wav。
+
+ 输出:
+ ```bash
+ [2022-02-23 15:20:37,875] [ INFO] - {'description': 'success.'}
+ [2022-02-23 15:20:37,875] [ INFO] - Save synthesized audio successfully on output.wav.
+ [2022-02-23 15:20:37,875] [ INFO] - Audio duration: 3.612500 s.
+ [2022-02-23 15:20:37,875] [ INFO] - Response time: 0.348050 s.
+ [2022-02-23 15:20:37,875] [ INFO] - RTF: 0.096346
+ ```
+
+- Python API
+ ```python
+ from paddlespeech.server.bin.paddlespeech_client import TTSClientExecutor
+
+ ttsclient_executor = TTSClientExecutor()
+ ttsclient_executor(
+ input="您好,欢迎使用百度飞桨语音合成服务。",
+ server_ip="127.0.0.1",
+ port=8090,
+ spk_id=0,
+ speed=1.0,
+ volume=1.0,
+ sample_rate=0,
+ output="./output.wav")
+ ```
+
+ 输出:
+ ```bash
+ {'description': 'success.'}
+ Save synthesized audio successfully on ./output.wav.
+ Audio duration: 3.612500 s.
+ Response time: 0.388317 s.
+ RTF: 0.107493
+
+ ```
+
+## Pretrained Models
+### ASR model
+下面是PaddleSpeech发布的[ASR预训练模型](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_recognition/README.md#4pretrained-models)列表,命令行和python接口均可用:
+
+| Model | Language | Sample Rate
+| :--- | :---: | :---: |
+| conformer_wenetspeech| zh| 16000
+| transformer_librispeech| en| 16000
+
+### TTS model
+下面是PaddleSpeech发布的 [TTS预训练模型](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/text_to_speech/README.md#4-pretrained-models) 列表,命令行和python接口均可用:
+
+- Acoustic model
+ | Model | Language
+ | :--- | :---: |
+ | speedyspeech_csmsc| zh
+ | fastspeech2_csmsc| zh
+ | fastspeech2_aishell3| zh
+ | fastspeech2_ljspeech| en
+ | fastspeech2_vctk| en
+
+- Vocoder
+ | Model | Language
+ | :--- | :---: |
+ | pwgan_csmsc| zh
+ | pwgan_aishell3| zh
+ | pwgan_ljspeech| en
+ | pwgan_vctk| en
+ | mb_melgan_csmsc| zh
+
+下面是PaddleSpeech发布的 **TTS预训练静态模型** 列表,命令行和python接口均可用:
+- Acoustic model
+ | Model | Language
+ | :--- | :---: |
+ | speedyspeech_csmsc| zh
+ | fastspeech2_csmsc| zh
+
+- Vocoder
+ | Model | Language
+ | :--- | :---: |
+ | pwgan_csmsc| zh
+ | mb_melgan_csmsc| zh
+ | hifigan_csmsc| zh
diff --git a/demos/speech_server/asr_client.sh b/demos/speech_server/asr_client.sh
new file mode 100644
index 00000000..afe2f821
--- /dev/null
+++ b/demos/speech_server/asr_client.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml
new file mode 100644
index 00000000..c8d71f2f
--- /dev/null
+++ b/demos/speech_server/conf/application.yaml
@@ -0,0 +1,17 @@
+# This is the parameter configuration file for PaddleSpeech Serving.
+
+##################################################################
+# SERVER SETTING #
+##################################################################
+host: '0.0.0.0'
+port: 8090
+
+##################################################################
+# CONFIG FILE #
+##################################################################
+# add engine type (Options: asr, tts) and config file here.
+
+engine_backend:
+ asr: 'conf/asr/asr.yaml'
+ tts: 'conf/tts/tts.yaml'
+
diff --git a/demos/speech_server/conf/asr/asr.yaml b/demos/speech_server/conf/asr/asr.yaml
new file mode 100644
index 00000000..4c3b0a67
--- /dev/null
+++ b/demos/speech_server/conf/asr/asr.yaml
@@ -0,0 +1,7 @@
+model: 'conformer_wenetspeech'
+lang: 'zh'
+sample_rate: 16000
+cfg_path:
+ckpt_path:
+decode_method: 'attention_rescoring'
+force_yes: False
diff --git a/demos/speech_server/conf/tts/tts.yaml b/demos/speech_server/conf/tts/tts.yaml
new file mode 100644
index 00000000..cb4829c8
--- /dev/null
+++ b/demos/speech_server/conf/tts/tts.yaml
@@ -0,0 +1,32 @@
+# This is the parameter configuration file for TTS server.
+
+##################################################################
+# ACOUSTIC MODEL SETTING #
+# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
+# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
+# 'fastspeech2_vctk']
+##################################################################
+am: 'fastspeech2_csmsc'
+am_config:
+am_ckpt:
+am_stat:
+phones_dict:
+tones_dict:
+speaker_dict:
+spk_id: 0
+
+##################################################################
+# VOCODER SETTING #
+# voc choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
+# 'pwgan_vctk', 'mb_melgan_csmsc']
+##################################################################
+voc: 'pwgan_csmsc'
+voc_config:
+voc_ckpt:
+voc_stat:
+
+##################################################################
+# OTHERS #
+##################################################################
+lang: 'zh'
+device: 'gpu:2'
diff --git a/demos/speech_server/conf/tts/tts_pd.yaml b/demos/speech_server/conf/tts/tts_pd.yaml
new file mode 100644
index 00000000..c268c6a3
--- /dev/null
+++ b/demos/speech_server/conf/tts/tts_pd.yaml
@@ -0,0 +1,41 @@
+# This is the parameter configuration file for TTS server.
+# These are the static models that support paddle inference.
+
+##################################################################
+# ACOUSTIC MODEL SETTING #
+# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
+##################################################################
+am: 'fastspeech2_csmsc'
+am_model: # the pdmodel file of am static model
+am_params: # the pdiparams file of am static model
+am_sample_rate: 24000
+phones_dict:
+tones_dict:
+speaker_dict:
+spk_id: 0
+
+am_predictor_conf:
+ use_gpu: True
+ enable_mkldnn: True
+ switch_ir_optim: True
+
+
+##################################################################
+# VOCODER SETTING #
+# voc choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
+##################################################################
+voc: 'pwgan_csmsc'
+voc_model: # the pdmodel file of vocoder static model
+voc_params: # the pdiparams file of vocoder static model
+voc_sample_rate: 24000
+
+voc_predictor_conf:
+ use_gpu: True
+ enable_mkldnn: True
+ switch_ir_optim: True
+
+##################################################################
+# OTHERS #
+##################################################################
+lang: 'zh'
+device: paddle.get_device()
diff --git a/demos/speech_server/server.sh b/demos/speech_server/server.sh
new file mode 100644
index 00000000..d9367ec0
--- /dev/null
+++ b/demos/speech_server/server.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+paddlespeech_server start --config_file ./conf/application.yaml
\ No newline at end of file
diff --git a/demos/speech_server/tts_client.sh b/demos/speech_server/tts_client.sh
new file mode 100644
index 00000000..a756dfd3
--- /dev/null
+++ b/demos/speech_server/tts_client.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md
index 2b35b885..681b2106 100644
--- a/docs/source/tts/tts_papers.md
+++ b/docs/source/tts/tts_papers.md
@@ -3,6 +3,8 @@
### Polyphone
- [【g2pM】g2pM: A Neural Grapheme-to-Phoneme Conversion Package for Mandarin Chinese Based on a New Open Benchmark Dataset](https://arxiv.org/abs/2004.03136)
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
+- [Polyphone Disambiguation in Mandarin Chinese with Semi-Supervised Learning](https://www.isca-speech.org/archive/pdfs/interspeech_2021/shi21d_interspeech.pdf)
+ * github: https://github.com/PaperMechanica/SemiPPL
### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index c83df432..7f648b4c 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import ast
import os
import sys
from collections import OrderedDict
@@ -183,10 +182,15 @@ class ASRExecutor(BaseExecutor):
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
+ '-d',
'--job_dump_result',
- type=ast.literal_eval,
- default=False,
+ action='store_true',
help='Save job result into file.')
+ self.parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@@ -482,7 +486,9 @@ class ASRExecutor(BaseExecutor):
decode_method = parser_args.decode_method
force_yes = parser_args.yes
device = parser_args.device
- job_dump_result = parser_args.job_dump_result
+
+ if not parser_args.verbose:
+ self.disable_task_loggers()
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
@@ -498,7 +504,7 @@ class ASRExecutor(BaseExecutor):
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
- job_dump_result)
+ parser_args.job_dump_result)
if has_exceptions:
return False
diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py
index e5d4b546..ab5eee6e 100644
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import ast
import os
from collections import OrderedDict
from typing import List
@@ -112,10 +111,15 @@ class CLSExecutor(BaseExecutor):
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
+ '-d',
'--job_dump_result',
- type=ast.literal_eval,
- default=False,
+ action='store_true',
help='Save job result into file.')
+ self.parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@@ -243,7 +247,9 @@ class CLSExecutor(BaseExecutor):
ckpt_path = parser_args.ckpt_path
topk = parser_args.topk
device = parser_args.device
- job_dump_result = parser_args.job_dump_result
+
+ if not parser_args.verbose:
+ self.disable_task_loggers()
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
@@ -259,7 +265,7 @@ class CLSExecutor(BaseExecutor):
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
- job_dump_result)
+ parser_args.job_dump_result)
if has_exceptions:
return False
diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py
index d81f8f9f..d77d27b0 100644
--- a/paddlespeech/cli/executor.py
+++ b/paddlespeech/cli/executor.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import logging
import os
import sys
from abc import ABC
@@ -149,10 +150,16 @@ class BaseExecutor(ABC):
job_dump_result (bool, optional): if True, dumps job results into file. Defaults to False.
"""
- raw_text = self._format_task_results(results)
- print(raw_text, end='')
+ if not self._is_job_input(input_) and len(
+ results) == 1: # Only one input sample
+ raw_text = list(results.values())[0]
+ else:
+ raw_text = self._format_task_results(results)
+
+ print(raw_text, end='') # Stdout
- if self._is_job_input(input_) and job_dump_result:
+ if self._is_job_input(
+ input_) and job_dump_result: # Dump to *.job.done
try:
job_output_file = os.path.abspath(input_) + '.done'
sys.stdout = open(job_output_file, 'w')
@@ -209,3 +216,13 @@ class BaseExecutor(ABC):
for k, v in results.items():
ret += f'{k} {v}\n'
return ret
+
+ def disable_task_loggers(self):
+ """
+ Disable all loggers in current task.
+ """
+ loggers = [
+ logging.getLogger(name) for name in logging.root.manager.loggerDict
+ ]
+ for l in loggers:
+ l.disabled = True
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index a11509ea..e64fc57d 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import ast
import os
import subprocess
from collections import OrderedDict
@@ -110,10 +109,15 @@ class STExecutor(BaseExecutor):
default=paddle.get_device(),
help="Choose device to execute model inference.")
self.parser.add_argument(
+ '-d',
'--job_dump_result',
- type=ast.literal_eval,
- default=False,
+ action='store_true',
help='Save job result into file.')
+ self.parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@@ -327,7 +331,9 @@ class STExecutor(BaseExecutor):
config = parser_args.config
ckpt_path = parser_args.ckpt_path
device = parser_args.device
- job_dump_result = parser_args.job_dump_result
+
+ if not parser_args.verbose:
+ self.disable_task_loggers()
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
@@ -343,7 +349,7 @@ class STExecutor(BaseExecutor):
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
- job_dump_result)
+ parser_args.job_dump_result)
if has_exceptions:
return False
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index cc902be2..dcf306c6 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import ast
import os
import re
from collections import OrderedDict
@@ -122,10 +121,15 @@ class TextExecutor(BaseExecutor):
default=paddle.get_device(),
help='Choose device to execute model inference.')
self.parser.add_argument(
+ '-d',
'--job_dump_result',
- type=ast.literal_eval,
- default=False,
+ action='store_true',
help='Save job result into file.')
+ self.parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@@ -270,7 +274,9 @@ class TextExecutor(BaseExecutor):
ckpt_path = parser_args.ckpt_path
punc_vocab = parser_args.punc_vocab
device = parser_args.device
- job_dump_result = parser_args.job_dump_result
+
+ if not parser_args.verbose:
+ self.disable_task_loggers()
task_source = self.get_task_source(parser_args.input)
task_results = OrderedDict()
@@ -286,7 +292,7 @@ class TextExecutor(BaseExecutor):
task_results[id_] = f'{e.__class__.__name__}: {e}'
self.process_task_results(parser_args.input, task_results,
- job_dump_result)
+ parser_args.job_dump_result)
if has_exceptions:
return False
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 3f650c40..ba15d652 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import ast
import os
from collections import OrderedDict
from typing import Any
@@ -400,10 +399,15 @@ class TTSExecutor(BaseExecutor):
self.parser.add_argument(
'--output', type=str, default='output.wav', help='output file name')
self.parser.add_argument(
+ '-d',
'--job_dump_result',
- type=ast.literal_eval,
- default=False,
+ action='store_true',
help='Save job result into file.')
+ self.parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
@@ -693,7 +697,9 @@ class TTSExecutor(BaseExecutor):
lang = args.lang
device = args.device
spk_id = args.spk_id
- job_dump_result = args.job_dump_result
+
+ if not args.verbose:
+ self.disable_task_loggers()
task_source = self.get_task_source(args.input)
task_results = OrderedDict()
@@ -733,7 +739,8 @@ class TTSExecutor(BaseExecutor):
has_exceptions = True
task_results[id_] = f'{e.__class__.__name__}: {e}'
- self.process_task_results(args.input, task_results, job_dump_result)
+ self.process_task_results(args.input, task_results,
+ args.job_dump_result)
if has_exceptions:
return False
diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py
index fb809837..3730d607 100644
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@@ -26,6 +26,8 @@ import soundfile
from ..executor import BaseExecutor
from ..util import cli_client_register
+from ..util import stats_wrapper
+from paddlespeech.cli.log import logger
from paddlespeech.server.utils.audio_process import wav2pcm
from paddlespeech.server.utils.util import wav2base64
@@ -36,8 +38,9 @@ __all__ = ['TTSClientExecutor', 'ASRClientExecutor']
name='paddlespeech_client.tts', description='visit tts service')
class TTSClientExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.parser = argparse.ArgumentParser()
+ super(TTSClientExecutor, self).__init__()
+ self.parser = argparse.ArgumentParser(
+ prog='paddlespeech_client.tts', add_help=True)
self.parser.add_argument(
'--server_ip', type=str, default='127.0.0.1', help='server ip')
self.parser.add_argument(
@@ -46,17 +49,24 @@ class TTSClientExecutor(BaseExecutor):
'--input',
type=str,
default="你好,欢迎使用语音合成服务",
- help='A sentence to be synthesized')
+ help='A sentence to be synthesized.')
self.parser.add_argument(
'--spk_id', type=int, default=0, help='Speaker id')
self.parser.add_argument(
- '--speed', type=float, default=1.0, help='Audio speed')
+ '--speed',
+ type=float,
+ default=1.0,
+ help='Audio speed, the value should be set between 0 and 3')
self.parser.add_argument(
- '--volume', type=float, default=1.0, help='Audio volume')
+ '--volume',
+ type=float,
+ default=1.0,
+ help='Audio volume, the value should be set between 0 and 3')
self.parser.add_argument(
'--sample_rate',
type=int,
default=0,
+ choices=[0, 8000, 16000],
help='Sampling rate, the default is the same as the model')
self.parser.add_argument(
'--output',
@@ -64,36 +74,14 @@ class TTSClientExecutor(BaseExecutor):
default="./output.wav",
help='Synthesized audio file')
- # Request and response
- def tts_client(self, args):
- """ Request and response
- Args:
- input: A sentence to be synthesized
- outfile: Synthetic audio file
- """
- url = 'http://' + args.server_ip + ":" + str(
- args.port) + '/paddlespeech/tts'
- request = {
- "text": args.input,
- "spk_id": args.spk_id,
- "speed": args.speed,
- "volume": args.volume,
- "sample_rate": args.sample_rate,
- "save_path": args.output
- }
-
- response = requests.post(url, json.dumps(request))
- response_dict = response.json()
- print(response_dict["message"])
+ def postprocess(self, response_dict: dict, outfile: str) -> float:
wav_base64 = response_dict["result"]["audio"]
-
audio_data_byte = base64.b64decode(wav_base64)
# from byte
samples, sample_rate = soundfile.read(
io.BytesIO(audio_data_byte), dtype='float32')
# transform audio
- outfile = args.output
if outfile.endswith(".wav"):
soundfile.write(outfile, samples, sample_rate)
elif outfile.endswith(".pcm"):
@@ -102,18 +90,79 @@ class TTSClientExecutor(BaseExecutor):
wav2pcm(temp_wav, outfile, data_type=np.int16)
os.system("rm %s" % (temp_wav))
else:
- print("The format for saving audio only supports wav or pcm")
+ logger.error("The format for saving audio only supports wav or pcm")
- return len(samples), sample_rate
+ duration = len(samples) / sample_rate
+ return duration
def execute(self, argv: List[str]) -> bool:
args = self.parser.parse_args(argv)
- st = time.time()
try:
- samples_length, sample_rate = self.tts_client(args)
+ url = 'http://' + args.server_ip + ":" + str(
+ args.port) + '/paddlespeech/tts'
+ request = {
+ "text": args.input,
+ "spk_id": args.spk_id,
+ "speed": args.speed,
+ "volume": args.volume,
+ "sample_rate": args.sample_rate,
+ "save_path": args.output
+ }
+ st = time.time()
+ response = requests.post(url, json.dumps(request))
+ time_consume = time.time() - st
+
+ response_dict = response.json()
+ duration = self.postprocess(response_dict, args.output)
+
+ logger.info(response_dict["message"])
+ logger.info("Saved synthesized audio successfully to %s." %
+ (args.output))
+ logger.info("Audio duration: %f s." % (duration))
+ logger.info("Response time: %f s." % (time_consume))
+ logger.info("RTF: %f " % (time_consume / duration))
+
+ return True
+ except:
+ logger.error("Failed to synthesize audio.")
+ return False
+
+ @stats_wrapper
+ def __call__(self,
+ input: str,
+ server_ip: str="127.0.0.1",
+ port: int=8090,
+ spk_id: int=0,
+ speed: float=1.0,
+ volume: float=1.0,
+ sample_rate: int=0,
+ output: str="./output.wav"):
+ """
+ Python API to call an executor.
+ """
+
+ url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/tts'
+ request = {
+ "text": input,
+ "spk_id": spk_id,
+ "speed": speed,
+ "volume": volume,
+ "sample_rate": sample_rate,
+ "save_path": output
+ }
+
+ try:
+ st = time.time()
+ response = requests.post(url, json.dumps(request))
time_consume = time.time() - st
- print("Save synthesized audio successfully on %s." % (args.output))
- print("Inference time: %f s." % (time_consume))
+ response_dict = response.json()
+ duration = self.postprocess(response_dict, output)
+
+ print(response_dict["message"])
+ print("Saved synthesized audio successfully to %s." % (output))
+ print("Audio duration: %f s." % (duration))
+ print("Response time: %f s." % (time_consume))
+ print("RTF: %f " % (time_consume / duration))
except:
print("Failed to synthesized audio.")
@@ -122,8 +171,9 @@ class TTSClientExecutor(BaseExecutor):
name='paddlespeech_client.asr', description='visit asr service')
class ASRClientExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.parser = argparse.ArgumentParser()
+ super(ASRClientExecutor, self).__init__()
+ self.parser = argparse.ArgumentParser(
+ prog='paddlespeech_client.asr', add_help=True)
self.parser.add_argument(
'--server_ip', type=str, default='127.0.0.1', help='server ip')
self.parser.add_argument(
@@ -152,11 +202,43 @@ class ASRClientExecutor(BaseExecutor):
"lang": args.lang,
}
time_start = time.time()
+ try:
+ r = requests.post(url=url, data=json.dumps(data))
+ # timestamp taken after the request completes
+ time_end = time.time()
+ logger.info(r.json())
+ logger.info("time cost %f s." % (time_end - time_start))
+ return True
+ except:
+ logger.error("Failed to perform speech recognition.")
+ return False
+
+ @stats_wrapper
+ def __call__(self,
+ input: str,
+ server_ip: str="127.0.0.1",
+ port: int=8090,
+ sample_rate: int=16000,
+ lang: str="zh_cn",
+ audio_format: str="wav"):
+ """
+ Python API to call an executor.
+ """
+
+ url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/asr'
+ audio = wav2base64(input)
+ data = {
+ "audio": audio,
+ "audio_format": audio_format,
+ "sample_rate": sample_rate,
+ "lang": lang,
+ }
+ time_start = time.time()
try:
r = requests.post(url=url, data=json.dumps(data))
# ending Timestamp
time_end = time.time()
print(r.json())
- print('time cost', time_end - time_start, 's')
+ print("time cost %f s." % (time_end - time_start))
except:
print("Failed to speech recognition.")
\ No newline at end of file
diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py
index 5b70e227..7c88d8a0 100644
--- a/paddlespeech/server/bin/paddlespeech_server.py
+++ b/paddlespeech/server/bin/paddlespeech_server.py
@@ -19,6 +19,7 @@ from fastapi import FastAPI
from ..executor import BaseExecutor
from ..util import cli_server_register
+from ..util import stats_wrapper
from paddlespeech.server.engine.engine_factory import EngineFactory
from paddlespeech.server.restful.api import setup_router
from paddlespeech.server.utils.config import get_config
@@ -33,8 +34,9 @@ app = FastAPI(
name='paddlespeech_server.start', description='Start the service')
class ServerExecutor(BaseExecutor):
def __init__(self):
- super().__init__()
- self.parser = argparse.ArgumentParser()
+ super(ServerExecutor, self).__init__()
+ self.parser = argparse.ArgumentParser(
+ prog='paddlespeech_server.start', add_help=True)
self.parser.add_argument(
"--config_file",
action="store",
@@ -74,4 +76,15 @@ class ServerExecutor(BaseExecutor):
config = get_config(args.config_file)
if self.init(config):
- uvicorn.run(app, host=config.host, port=config.port, debug=True)
\ No newline at end of file
+ uvicorn.run(app, host=config.host, port=config.port, debug=True)
+
+ @stats_wrapper
+ def __call__(self,
+ config_file: str="./conf/application.yaml",
+ log_file: str="./log/paddlespeech.log"):
+ """
+ Python API to call an executor.
+ """
+ config = get_config(config_file)
+ if self.init(config):
+ uvicorn.run(app, host=config.host, port=config.port, debug=True)
diff --git a/paddlespeech/server/executor.py b/paddlespeech/server/executor.py
index 192e1f17..fa2d01a9 100644
--- a/paddlespeech/server/executor.py
+++ b/paddlespeech/server/executor.py
@@ -16,6 +16,7 @@ from abc import ABC
from abc import abstractmethod
from typing import List
+
class BaseExecutor(ABC):
"""
An abstract executor of paddlespeech server tasks.
@@ -36,3 +37,10 @@ class BaseExecutor(ABC):
int: Result of the command execution. `True` for a success and `False` for a failure.
"""
pass
+
+ @abstractmethod
+ def __call__(self, *arg, **kwargs):
+ """
+ Python API to call an executor.
+ """
+ pass
diff --git a/paddlespeech/server/util.py b/paddlespeech/server/util.py
index 58e86b27..48c4b8cb 100644
--- a/paddlespeech/server/util.py
+++ b/paddlespeech/server/util.py
@@ -30,9 +30,12 @@ from paddle.framework import load
import paddleaudio
from . import download
-from .. import __version__
from .entry import client_commands
from .entry import server_commands
+try:
+ from .. import __version__
+except ImportError:
+ __version__ = "0.0.0" # for develop branch
requests.adapters.DEFAULT_RETRIES = 3