Merge branch 'PaddlePaddle:develop' into develop

pull/3242/head
jiamingkong 2 years ago committed by GitHub
commit 3ef28dee45
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -3,11 +3,7 @@ repos:
rev: v0.16.0
hooks:
- id: yapf
name: yapf
language: python
entry: yapf
args: [-i, -vv]
types: [python]
files: \.py$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks

@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer
paddlespeech_server start --config_file ./conf/application.yaml
```
> **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file
Usage:
```bash
@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded:
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
```
**Note:** The response time will be slightly longer when using the client for the first time
@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
```
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
# Chinese and English mixed speech recognition, using `./conf/conformer_talcs_application.yaml` config file
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
```
Usage:

@ -37,6 +37,8 @@
paddlespeech_server start --config_file ./conf/application.yaml
```
> **注意:** 中英文混合语音识别请使用 `./conf/conformer_talcs_application.yaml` 配置文件
使用方法:
```bash
@ -79,6 +81,8 @@
[2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
```
### 4. ASR 客户端使用方法
ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
@ -87,6 +91,7 @@ ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
```
**注意:** 初次使用客户端时响应时间会略长
@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
`127.0.0.1` 不能访问,则需要使用实际服务 IP 地址
```
```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
# 中英文混合语音识别 , 请使用 `./conf/conformer_talcs_application.yaml` 配置文件
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
```
使用帮助:

@ -0,0 +1,163 @@
# This is the parameter configuration file for PaddleSpeech Offline Serving.
#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 8090
# The task format in the engin_list is: <speech task>_<engine type>
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python']
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
#################################################################################
# ENGINE CONFIG #
#################################################################################
################################### ASR #########################################
################### speech task: asr; engine_type: python #######################
asr_python:
model: 'conformer_talcs'
lang: 'zh_en'
sample_rate: 16000
cfg_path: # [optional]
ckpt_path: # [optional]
decode_method: 'attention_rescoring'
force_yes: True
codeswitch: True
device: # set 'gpu:id' or 'cpu'
################### speech task: asr; engine_type: inference #######################
asr_inference:
# model_type choices=['deepspeech2offline_aishell']
model_type: 'deepspeech2offline_aishell'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
sample_rate: 16000
cfg_path:
decode_method:
force_yes: True
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:
# am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
# 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
# 'fastspeech2_vctk', 'fastspeech2_mix',
# 'tacotron2_csmsc', 'tacotron2_ljspeech']
am: 'fastspeech2_csmsc'
am_config:
am_ckpt:
am_stat:
phones_dict:
tones_dict:
speaker_dict:
# voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
# 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
# 'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3',
# 'hifigan_vctk', 'wavernn_csmsc']
voc: 'mb_melgan_csmsc'
voc_config:
voc_ckpt:
voc_stat:
# others
lang: 'zh'
device: # set 'gpu:id' or 'cpu'
################### speech task: tts; engine_type: inference #######################
tts_inference:
# am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
am: 'fastspeech2_csmsc'
am_model: # the pdmodel file of your am static model (XX.pdmodel)
am_params: # the pdiparams file of your am static model (XX.pdipparams)
am_sample_rate: 24000
phones_dict:
tones_dict:
speaker_dict:
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
# voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
voc: 'mb_melgan_csmsc'
voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
voc_params: # the pdiparams file of your vocoder static model (XX.pdipparams)
voc_sample_rate: 24000
voc_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
# others
lang: 'zh'
################################### CLS #########################################
################### speech task: cls; engine_type: python #######################
cls_python:
# model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
model: 'panns_cnn14'
cfg_path: # [optional] Config of cls task.
ckpt_path: # [optional] Checkpoint file of model.
label_file: # [optional] Label file of cls task.
device: # set 'gpu:id' or 'cpu'
################### speech task: cls; engine_type: inference #######################
cls_inference:
# model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
model_type: 'panns_cnn14'
cfg_path:
model_path: # the pdmodel file of am static model [optional]
params_path: # the pdiparams file of am static model [optional]
label_file: # [optional] Label file of cls task.
predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
################################### Text #########################################
################### text task: punc; engine_type: python #######################
text_python:
task: punc
model_type: 'ernie_linear_p3_wudao'
lang: 'zh'
sample_rate: 16000
cfg_path: # [optional]
ckpt_path: # [optional]
vocab_file: # [optional]
device: # set 'gpu:id' or 'cpu'
################################### Vector ######################################
################### Vector task: spk; engine_type: python #######################
vector_python:
task: spk
model_type: 'ecapatdnn_voxceleb12'
sample_rate: 16000
cfg_path: # [optional]
ckpt_path: # [optional]
device: # set 'gpu:id' or 'cpu'

@ -1,5 +1,7 @@
# Released Models
> !!! Since PaddlePaddle support 0-D tensor from 2.5.0, PaddleSpeech Static model will not work for it, please re-export static model.
## Speech-to-Text Models
### Speech Recognition Model

@ -252,7 +252,7 @@ class STExecutor(BaseExecutor):
norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name]
self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0)
self._inputs["audio_len"] = paddle.to_tensor(
self._inputs["audio"].shape[1], dtype="int64")
self._inputs["audio"].shape[1:2], dtype="int64")
else:
raise ValueError("Wrong model type.")

@ -491,7 +491,7 @@ class TTSExecutor(BaseExecutor):
# multi speaker
if am_dataset in {'aishell3', 'vctk', 'mix', 'canton'}:
mel = self.am_inference(
part_phone_ids, spk_id=paddle.to_tensor(spk_id))
part_phone_ids, spk_id=paddle.to_tensor([spk_id]))
else:
mel = self.am_inference(part_phone_ids)
self.am_time += (time.time() - am_st)

@ -274,7 +274,7 @@ asr_dynamic_pretrained_models = {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.5.0.model.tar.gz',
'md5':
'a0adb2b204902982718bc1d8917f7038',
'38924b8adc28ef458847c3571e87e3cb',
'cfg_path':
'model.yaml',
'ckpt_path':

@ -67,13 +67,19 @@ class ASREngine(BaseEngine):
logger.error(e)
return False
cs = False
if self.config.lang == "zh_en" :
cs=True
self.executor._init_from_path(
model_type=self.config.model,
lang=self.config.lang,
sample_rate=self.config.sample_rate,
cfg_path=self.config.cfg_path,
decode_method=self.config.decode_method,
ckpt_path=self.config.ckpt_path)
ckpt_path=self.config.ckpt_path,
codeswitch=cs )
logger.info("Initialize ASR server engine successfully on device: %s." %
(self.device))

@ -783,7 +783,7 @@ class FastSpeech2(nn.Layer):
x = paddle.cast(text, 'int64')
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(x)[0]
ilens = paddle.shape(x)[0:1]
xs = x.unsqueeze(0)

@ -181,7 +181,7 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
bs = paddle.shape(lengths)[0]
bs = paddle.shape(lengths)
if xs is None:
maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
else:

Loading…
Cancel
Save