Merge pull request #1859 from lym0302/update_readme

[server] improve server code
Author: liangym (committed by GitHub)
Commit: e7a35485e4

@@ -43,12 +43,12 @@ tts_online:
     device: 'cpu'  # set 'gpu:id' or 'cpu'
     # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
     # when am_pad is set to 12, streaming synthetic audio is the same as non-streaming synthetic audio
-    am_block: 42
+    am_block: 72
     am_pad: 12
     # voc_pad and voc_block are only used by the voc model for streaming voc inference;
     # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, streaming synthetic audio is the same as non-streaming synthetic audio; pad can be set as low as 7 and streaming synthetic audio still sounds normal
     # when the voc model is hifigan_csmsc and voc_pad is set to 20, streaming synthetic audio is the same as non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio sounds normal
-    voc_block: 14
+    voc_block: 36
     voc_pad: 14
@@ -91,12 +91,12 @@ tts_online-onnx:
     lang: 'zh'
     # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
     # when am_pad is set to 12, streaming synthetic audio is the same as non-streaming synthetic audio
-    am_block: 42
+    am_block: 72
     am_pad: 12
     # voc_pad and voc_block are only used by the voc model for streaming voc inference;
     # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, streaming synthetic audio is the same as non-streaming synthetic audio; pad can be set as low as 7 and streaming synthetic audio still sounds normal
     # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, streaming synthetic audio is the same as non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio sounds normal
-    voc_block: 14
+    voc_block: 36
     voc_pad: 14
     # voc_upsample should be the same as n_shift in the voc config.
     voc_upsample: 300

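The block/pad pairs in these configs drive chunked streaming inference: the model processes `block` frames extended by up to `pad` frames of context on each side and keeps only the central `block` frames, which is why a large enough pad matches the non-streaming output exactly. A minimal sketch of that windowing (the helper name and the decoder call are illustrative, not code from this PR):

import numpy as np

def stream_chunks(feats: np.ndarray, block: int, pad: int):
    """Yield (padded_chunk, keep_start, keep_end) windows over axis 0.

    Hypothetical helper: each chunk covers `block` frames plus up to
    `pad` frames of context on each side; the caller keeps only the
    central `block` frames of the model output.
    """
    n = feats.shape[0]
    for start in range(0, n, block):
        end = min(start + block, n)
        left = max(0, start - pad)
        right = min(n, end + pad)
        yield feats[left:right], start - left, end - left

# e.g. with am_block=72, am_pad=12:
# for chunk, s, e in stream_chunks(mel, block=72, pad=12):
#     out = am_decoder(chunk)   # illustrative decoder call
#     keep = out[s:e]           # frames actually emitted
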
@@ -31,6 +31,7 @@ from ..util import stats_wrapper
 from paddlespeech.cli.log import logger
 from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
 from paddlespeech.server.utils.audio_process import wav2pcm
+from paddlespeech.server.utils.util import compute_delay
 from paddlespeech.server.utils.util import wav2base64

 __all__ = [
@@ -221,7 +222,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
         play = args.play

         try:
-            res = self(
+            self(
                 input=input_,
                 server_ip=server_ip,
                 port=port,
@@ -257,17 +258,42 @@ class TTSOnlineClientExecutor(BaseExecutor):
             logger.info("tts http client start")
             from paddlespeech.server.utils.audio_handler import TTSHttpHandler
             handler = TTSHttpHandler(server_ip, port, play)
-            handler.run(input, spk_id, speed, volume, sample_rate, output)
+            first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+                input, spk_id, speed, volume, sample_rate, output)
+            delay_time_list = compute_delay(receive_time_list,
+                                            chunk_duration_list)
         elif protocol == "websocket":
             from paddlespeech.server.utils.audio_handler import TTSWsHandler
             logger.info("tts websocket client start")
             handler = TTSWsHandler(server_ip, port, play)
             loop = asyncio.get_event_loop()
-            loop.run_until_complete(handler.run(input, output))
+            first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+                handler.run(input, output))
+            delay_time_list = compute_delay(receive_time_list,
+                                            chunk_duration_list)
         else:
             logger.error("Please set correct protocol, http or websocket")
+            return False
+
+        logger.info(f"sentence: {input}")
+        logger.info(f"duration: {duration} s")
+        logger.info(f"first response: {first_response} s")
+        logger.info(f"final response: {final_response} s")
+        logger.info(f"RTF: {final_response/duration}")
+        if output is not None:
+            if save_audio_success:
+                logger.info(f"Audio successfully saved in {output}")
+            else:
+                logger.error("Audio save failed.")
+
+        if delay_time_list != []:
+            logger.info(
+                f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+            )
+        else:
+            logger.info("The sentence has no delay in streaming synthesis.")

 @cli_client_register(

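The handlers now return packet arrival times and per-chunk audio durations, from which the client derives per-packet delay. compute_delay itself is not shown in this diff; the sketch below is one plausible reading of its contract, counting a chunk as delayed when it arrives after the audio received so far has finished playing (assumed logic, not the PR's implementation):

def compute_delay(receive_time_list, chunk_duration_list):
    """Return the delay in seconds for each chunk that arrived late.

    Assumed semantics: playback starts when the first chunk arrives;
    a chunk is late if it arrives after all previously received audio
    has already been played out.
    """
    if not receive_time_list:
        return []
    delay_time_list = []
    playable_until = receive_time_list[0]
    for recv, dur in zip(receive_time_list, chunk_duration_list):
        if recv > playable_until:
            delay_time_list.append(recv - playable_until)  # under-run gap
            playable_until = recv + dur
        else:
            playable_until += dur
    return delay_time_list
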
@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for PaddleSpeech Offline Serving.

 #################################################################################
 #                             SERVER SETTING                                    #
@@ -7,9 +7,7 @@ host: 127.0.0.1
 port: 8090

 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
-# protocol = ['websocket', 'http'] (only one can be selected).
-# http only support offline engine type.
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
 protocol: 'http'
 engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
@@ -50,24 +48,6 @@ asr_inference:
     summary: True  # False -> do not show predictor config

-################### speech task: asr; engine_type: online #######################
-asr_online:
-    model_type: 'deepspeech2online_aishell'
-    am_model:   # the pdmodel file of am static model [optional]
-    am_params:  # the pdiparams file of am static model [optional]
-    lang: 'zh'
-    sample_rate: 16000
-    cfg_path:
-    decode_method:
-    force_yes: True
-
-    am_predictor_conf:
-        device:  # set 'gpu:id' or 'cpu'
-        switch_ir_optim: True
-        glog_info: False  # True -> print glog
-        summary: True  # False -> do not show predictor config
-
 ################################### TTS #########################################
 ################### speech task: tts; engine_type: python #######################
 tts_python:

@@ -43,12 +43,12 @@ tts_online:
     device: 'cpu'  # set 'gpu:id' or 'cpu'
     # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
     # when am_pad is set to 12, streaming synthetic audio is the same as non-streaming synthetic audio
-    am_block: 42
+    am_block: 72
     am_pad: 12
     # voc_pad and voc_block are only used by the voc model for streaming voc inference;
     # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, streaming synthetic audio is the same as non-streaming synthetic audio; pad can be set as low as 7 and streaming synthetic audio still sounds normal
     # when the voc model is hifigan_csmsc and voc_pad is set to 20, streaming synthetic audio is the same as non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio sounds normal
-    voc_block: 14
+    voc_block: 36
     voc_pad: 14
@@ -91,12 +91,12 @@ tts_online-onnx:
     lang: 'zh'
     # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
     # when am_pad is set to 12, streaming synthetic audio is the same as non-streaming synthetic audio
-    am_block: 42
+    am_block: 72
     am_pad: 12
     # voc_pad and voc_block are only used by the voc model for streaming voc inference;
     # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, streaming synthetic audio is the same as non-streaming synthetic audio; pad can be set as low as 7 and streaming synthetic audio still sounds normal
     # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, streaming synthetic audio is the same as non-streaming synthetic audio; with voc_pad set to 14, streaming synthetic audio sounds normal
-    voc_block: 14
+    voc_block: 36
     voc_pad: 14
     # voc_upsample should be the same as n_shift in the voc config.
     voc_upsample: 300

@@ -20,10 +20,9 @@ import paddle
 from numpy import float32
 from yacs.config import CfgNode

+from .pretrained_models import pretrained_models
 from paddlespeech.cli.asr.infer import ASRExecutor
-from paddlespeech.cli.asr.infer import model_alias
 from paddlespeech.cli.log import logger
-from paddlespeech.cli.utils import download_and_decompress
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.speech import SpeechSegment
@@ -40,45 +39,6 @@ from paddlespeech.server.utils.paddle_predictor import init_predictor

 __all__ = ['ASREngine']

-pretrained_models = {
-    "deepspeech2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
-        'md5':
-        '98b87b171b7240b7cae6e07d8d0bc9be',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_1',
-        'model':
-        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
-        'params':
-        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "conformer_online_multicn-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
-        'md5':
-        '0ac93d390552336f2a906aec9e33c5fa',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/chunk_conformer/checkpoints/multi_cn',
-        'model':
-        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
-        'params':
-        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-}
-

 # ASR server connection process class
 class PaddleASRConnectionHanddler:
@@ -625,24 +585,7 @@ class PaddleASRConnectionHanddler:
 class ASRServerExecutor(ASRExecutor):
     def __init__(self):
         super().__init__()
-        pass
-
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
+        self.pretrained_models = pretrained_models

     def _init_from_path(self,
                         model_type: str='deepspeech2online_aishell',
@@ -658,20 +601,20 @@ class ASRServerExecutor(ASRExecutor):
         """
         self.model_type = model_type
         self.sample_rate = sample_rate
+        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
+        tag = model_type + '-' + lang + '-' + sample_rate_str
         if cfg_path is None or am_model is None or am_params is None:
-            sample_rate_str = '16k' if sample_rate == 16000 else '8k'
-            tag = model_type + '-' + lang + '-' + sample_rate_str
             logger.info(f"Load the pretrained model, tag = {tag}")
             res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
             self.res_path = res_path

-            self.cfg_path = os.path.join(res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.am_model = os.path.join(res_path,
-                                         pretrained_models[tag]['model'])
-            self.am_params = os.path.join(res_path,
-                                          pretrained_models[tag]['params'])
+            self.cfg_path = os.path.join(
+                res_path, self.pretrained_models[tag]['cfg_path'])
+            self.am_model = os.path.join(res_path,
+                                         self.pretrained_models[tag]['model'])
+            self.am_params = os.path.join(res_path,
+                                          self.pretrained_models[tag]['params'])
             logger.info(res_path)
         else:
             self.cfg_path = os.path.abspath(cfg_path)
@@ -699,8 +642,8 @@ class ASRServerExecutor(ASRExecutor):
         self.text_feature = TextFeaturizer(
             unit_type=self.config.unit_type, vocab=self.vocab)

-        lm_url = pretrained_models[tag]['lm_url']
-        lm_md5 = pretrained_models[tag]['lm_md5']
+        lm_url = self.pretrained_models[tag]['lm_url']
+        lm_md5 = self.pretrained_models[tag]['lm_md5']
         logger.info(f"Start to load language model {lm_url}")
         self.download_lm(
             lm_url,
@@ -773,7 +716,7 @@ class ASRServerExecutor(ASRExecutor):
         model_name = model_type[:model_type.rindex(
             '_')]  # model_type: {model_name}_{dataset}
         logger.info(f"model name: {model_name}")
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
         model_conf = self.config
         model = model_class.from_config(model_conf)
         self.model = model

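The ASR hunks above follow a refactor repeated for the CLS and TTS engines in this PR: the module-level pretrained_models table moves to a sibling pretrained_models.py, the per-engine copy of _get_pretrained_path is deleted, and each executor installs its table on self so the shared lookup inherited from the CLI executor resolves against it. A minimal sketch of the pattern (base-class body assumed, not copied from the repo):

import os

MODEL_HOME = os.path.expanduser("~/.paddlespeech/models")  # assumed default

class BaseExecutorSketch:
    pretrained_models: dict = {}

    def _get_pretrained_path(self, tag: str) -> str:
        # the real method also downloads and decompresses the archive
        assert tag in self.pretrained_models, (
            f'The model "{tag}" is not supported. Supported models: '
            f'{list(self.pretrained_models)}')
        return os.path.join(MODEL_HOME, tag)

class ASRServerExecutorSketch(BaseExecutorSketch):
    def __init__(self):
        # table now lives in the engine's pretrained_models.py
        self.pretrained_models = {
            "deepspeech2online_aishell-zh-16k": {"cfg_path": "model.yaml"},
        }
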
@@ -0,0 +1,52 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
"deepspeech2online_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
'md5':
'98b87b171b7240b7cae6e07d8d0bc9be',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2_online/checkpoints/avg_1',
'model':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
"conformer_online_multicn-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
'md5':
'0ac93d390552336f2a906aec9e33c5fa',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/chunk_conformer/checkpoints/multi_cn',
'model':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'params':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}

@@ -19,6 +19,7 @@ from typing import Optional
 import paddle
 from yacs.config import CfgNode

+from .pretrained_models import pretrained_models
 from paddlespeech.cli.asr.infer import ASRExecutor
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
@@ -31,32 +32,11 @@ from paddlespeech.server.utils.paddle_predictor import run_model

 __all__ = ['ASREngine']

-pretrained_models = {
-    "deepspeech2offline_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '932c3593d62fe5c741b59b31318aa314',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'model':
-        'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
-        'params':
-        'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-}
-

 class ASRServerExecutor(ASRExecutor):
     def __init__(self):
         super().__init__()
-        pass
+        self.pretrained_models = pretrained_models

     def _init_from_path(self,
                         model_type: str='wenetspeech',
@@ -71,18 +51,18 @@ class ASRServerExecutor(ASRExecutor):
         Init model and other resources from a specific path.
         """
+        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
+        tag = model_type + '-' + lang + '-' + sample_rate_str
         if cfg_path is None or am_model is None or am_params is None:
-            sample_rate_str = '16k' if sample_rate == 16000 else '8k'
-            tag = model_type + '-' + lang + '-' + sample_rate_str
             res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
             self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.am_model = os.path.join(res_path,
-                                         pretrained_models[tag]['model'])
-            self.am_params = os.path.join(res_path,
-                                          pretrained_models[tag]['params'])
+            self.cfg_path = os.path.join(
+                res_path, self.pretrained_models[tag]['cfg_path'])
+            self.am_model = os.path.join(res_path,
+                                         self.pretrained_models[tag]['model'])
+            self.am_params = os.path.join(res_path,
+                                          self.pretrained_models[tag]['params'])
             logger.info(res_path)
             logger.info(self.cfg_path)
             logger.info(self.am_model)
@@ -109,8 +89,8 @@ class ASRServerExecutor(ASRExecutor):
         self.text_feature = TextFeaturizer(
             unit_type=self.config.unit_type, vocab=self.vocab)

-        lm_url = pretrained_models[tag]['lm_url']
-        lm_md5 = pretrained_models[tag]['lm_md5']
+        lm_url = self.pretrained_models[tag]['lm_url']
+        lm_md5 = self.pretrained_models[tag]['lm_md5']
         self.download_lm(
             lm_url,
             os.path.dirname(self.config.decode.lang_model_path), lm_md5)

@@ -0,0 +1,34 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
"deepspeech2offline_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
'md5':
'932c3593d62fe5c741b59b31318aa314',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2/checkpoints/avg_1',
'model':
'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}

@@ -20,83 +20,20 @@ import numpy as np
 import paddle
 import yaml

+from .pretrained_models import pretrained_models
 from paddlespeech.cli.cls.infer import CLSExecutor
 from paddlespeech.cli.log import logger
-from paddlespeech.cli.utils import download_and_decompress
-from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.paddle_predictor import init_predictor
 from paddlespeech.server.utils.paddle_predictor import run_model

 __all__ = ['CLSEngine']

-pretrained_models = {
-    "panns_cnn6-32k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
-        'md5':
-        'da087c31046d23281d8ec5188c1967da',
-        'cfg_path':
-        'panns.yaml',
-        'model_path':
-        'inference.pdmodel',
-        'params_path':
-        'inference.pdiparams',
-        'label_file':
-        'audioset_labels.txt',
-    },
-    "panns_cnn10-32k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
-        'md5':
-        '5460cc6eafbfaf0f261cc75b90284ae1',
-        'cfg_path':
-        'panns.yaml',
-        'model_path':
-        'inference.pdmodel',
-        'params_path':
-        'inference.pdiparams',
-        'label_file':
-        'audioset_labels.txt',
-    },
-    "panns_cnn14-32k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
-        'md5':
-        'ccc80b194821274da79466862b2ab00f',
-        'cfg_path':
-        'panns.yaml',
-        'model_path':
-        'inference.pdmodel',
-        'params_path':
-        'inference.pdiparams',
-        'label_file':
-        'audioset_labels.txt',
-    },
-}
-

 class CLSServerExecutor(CLSExecutor):
     def __init__(self):
         super().__init__()
-        pass
+        self.pretrained_models = pretrained_models

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
     def _init_from_path(
             self,
@@ -113,14 +50,14 @@ class CLSServerExecutor(CLSExecutor):
         if cfg_path is None or model_path is None or params_path is None or label_file is None:
             tag = model_type + '-' + '32k'
             self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.model_path = os.path.join(self.res_path,
-                                           pretrained_models[tag]['model_path'])
-            self.params_path = os.path.join(
-                self.res_path, pretrained_models[tag]['params_path'])
-            self.label_file = os.path.join(self.res_path,
-                                           pretrained_models[tag]['label_file'])
+            self.cfg_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
+            self.model_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['model_path'])
+            self.params_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['params_path'])
+            self.label_file = os.path.join(
+                self.res_path, self.pretrained_models[tag]['label_file'])
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.model_path = os.path.abspath(model_path)

@@ -0,0 +1,58 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
"panns_cnn6-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
'md5':
'da087c31046d23281d8ec5188c1967da',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
"panns_cnn10-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
'md5':
'5460cc6eafbfaf0f261cc75b90284ae1',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
"panns_cnn14-32k": {
'url':
'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
'md5':
'ccc80b194821274da79466862b2ab00f',
'cfg_path':
'panns.yaml',
'model_path':
'inference.pdmodel',
'params_path':
'inference.pdiparams',
'label_file':
'audioset_labels.txt',
},
}

@@ -0,0 +1,69 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# support online model
pretrained_models = {
# fastspeech2
"fastspeech2_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
'md5':
'fd3ad38d83273ad51f0ea4f4abf3ab4e',
'ckpt': ['fastspeech2_csmsc.onnx'],
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
"fastspeech2_cnndecoder_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
'md5':
'5f70e1a6bcd29d72d54e7931aa86f266',
'ckpt': [
'fastspeech2_csmsc_am_encoder_infer.onnx',
'fastspeech2_csmsc_am_decoder.onnx',
'fastspeech2_csmsc_am_postnet.onnx',
],
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
# mb_melgan
"mb_melgan_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
'md5':
'5b83ec746e8414bc29032d954ffd07ec',
'ckpt':
'mb_melgan_csmsc.onnx',
'sample_rate':
24000,
},
# hifigan
"hifigan_csmsc_onnx-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
'md5':
'1a7dc0385875889e46952e50c0994a6b',
'ckpt':
'hifigan_csmsc.onnx',
'sample_rate':
24000,
},
}

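In the fastspeech2_cnndecoder entry above, the acoustic model ships as three ONNX graphs (encoder, decoder, postnet) so the decoder can run chunk by chunk. A rough sketch of chaining such sessions with onnxruntime; the tensor names, the output handling, and the absence of a residual connection are assumptions, not taken from the repo:

import numpy as np
import onnxruntime as ort

# file names follow the 'ckpt' list in the entry above
encoder = ort.InferenceSession("fastspeech2_csmsc_am_encoder_infer.onnx")
decoder = ort.InferenceSession("fastspeech2_csmsc_am_decoder.onnx")
postnet = ort.InferenceSession("fastspeech2_csmsc_am_postnet.onnx")

def synthesize_mel(phone_ids: np.ndarray, block: int, pad: int):
    """Run the encoder once, then decoder + postnet per padded chunk."""
    (hidden,) = encoder.run(None, {"text": phone_ids})   # input name assumed
    pieces, n = [], hidden.shape[1]
    for start in range(0, n, block):
        left = max(0, start - pad)
        right = min(n, start + block + pad)
        (mel,) = decoder.run(None, {"xs": hidden[:, left:right]})  # name assumed
        (mel,) = postnet.run(None, {"xs": mel})
        # keep only the central block frames of each padded chunk
        pieces.append(mel[:, start - left:start - left + block])
    return np.concatenate(pieces, axis=1)
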
@@ -20,10 +20,9 @@ from typing import Optional
 import numpy as np
 import paddle

+from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
-from paddlespeech.cli.utils import download_and_decompress
-from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
 from paddlespeech.server.utils.onnx_infer import get_sess
@@ -34,83 +33,6 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend

 __all__ = ['TTSEngine']

-# support online model
-pretrained_models = {
-    # fastspeech2
-    "fastspeech2_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
-        'md5':
-        'fd3ad38d83273ad51f0ea4f4abf3ab4e',
-        'ckpt': ['fastspeech2_csmsc.onnx'],
-        'phones_dict':
-        'phone_id_map.txt',
-        'sample_rate':
-        24000,
-    },
-    "fastspeech2_cnndecoder_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
-        'md5':
-        '5f70e1a6bcd29d72d54e7931aa86f266',
-        'ckpt': [
-            'fastspeech2_csmsc_am_encoder_infer.onnx',
-            'fastspeech2_csmsc_am_decoder.onnx',
-            'fastspeech2_csmsc_am_postnet.onnx',
-        ],
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'sample_rate':
-        24000,
-    },
-    # mb_melgan
-    "mb_melgan_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
-        'md5':
-        '5b83ec746e8414bc29032d954ffd07ec',
-        'ckpt':
-        'mb_melgan_csmsc.onnx',
-        'sample_rate':
-        24000,
-    },
-    # hifigan
-    "hifigan_csmsc_onnx-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
-        'md5':
-        '1a7dc0385875889e46952e50c0994a6b',
-        'ckpt':
-        'hifigan_csmsc.onnx',
-        'sample_rate':
-        24000,
-    },
-}
-
-model_alias = {
-    # acoustic model
-    "fastspeech2":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    # voc
-    "mb_melgan":
-    "paddlespeech.t2s.models.melgan:MelGANGenerator",
-    "mb_melgan_inference":
-    "paddlespeech.t2s.models.melgan:MelGANInference",
-    "hifigan":
-    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
-    "hifigan_inference":
-    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
-}
-
-__all__ = ['TTSEngine']
-

 class TTSServerExecutor(TTSExecutor):
     def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample):
@@ -122,23 +44,6 @@ class TTSServerExecutor(TTSExecutor):
         self.voc_upsample = voc_upsample
         self.pretrained_models = pretrained_models
-        self.model_alias = model_alias
-
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        #Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path

     def _init_from_path(
             self,
@@ -173,10 +78,10 @@ class TTSServerExecutor(TTSExecutor):
             am_res_path = self._get_pretrained_path(am_tag)
             self.am_res_path = am_res_path
             self.am_ckpt = os.path.join(
-                am_res_path, pretrained_models[am_tag]['ckpt'][0])
+                am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
             # must have phones_dict in acoustic
             self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])

         else:
             self.am_ckpt = os.path.abspath(am_ckpt[0])
@@ -192,16 +97,16 @@ class TTSServerExecutor(TTSExecutor):
             am_res_path = self._get_pretrained_path(am_tag)
             self.am_res_path = am_res_path
             self.am_encoder_infer = os.path.join(
-                am_res_path, pretrained_models[am_tag]['ckpt'][0])
+                am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
             self.am_decoder = os.path.join(
-                am_res_path, pretrained_models[am_tag]['ckpt'][1])
+                am_res_path, self.pretrained_models[am_tag]['ckpt'][1])
             self.am_postnet = os.path.join(
-                am_res_path, pretrained_models[am_tag]['ckpt'][2])
+                am_res_path, self.pretrained_models[am_tag]['ckpt'][2])
             # must have phones_dict in acoustic
             self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
             self.am_stat = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speech_stats'])
+                am_res_path, self.pretrained_models[am_tag]['speech_stats'])

         else:
             self.am_encoder_infer = os.path.abspath(am_ckpt[0])
@@ -229,8 +134,8 @@ class TTSServerExecutor(TTSExecutor):
         if voc_ckpt is None:
             voc_res_path = self._get_pretrained_path(voc_tag)
             self.voc_res_path = voc_res_path
-            self.voc_ckpt = os.path.join(voc_res_path,
-                                         pretrained_models[voc_tag]['ckpt'])
+            self.voc_ckpt = os.path.join(
+                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
         else:
             self.voc_ckpt = os.path.abspath(voc_ckpt)
             self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt))
@@ -283,7 +188,6 @@ class TTSServerExecutor(TTSExecutor):
         """
         Model inference and result stored in self.output.
         """
-        #import pdb;pdb.set_trace()
         am_block = self.am_block
         am_pad = self.am_pad
@@ -453,10 +357,21 @@ class TTSEngine(BaseEngine):
             self.config.am_block, self.config.am_pad, self.config.voc_block,
             self.config.voc_pad, self.config.voc_upsample)

-        if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device:
-            paddle.set_device("cpu")
-        else:
-            paddle.set_device(self.config.am_sess_conf.device)
+        try:
+            if self.config.am_sess_conf.device is not None:
+                self.device = self.config.am_sess_conf.device
+            elif self.config.voc_sess_conf.device is not None:
+                self.device = self.config.voc_sess_conf.device
+            else:
+                self.device = paddle.get_device()
+            paddle.set_device(self.device)
+        except BaseException as e:
+            logger.error(
+                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
+            )
+            logger.error("Initialize TTS server engine Failed on device: %s." %
+                         (self.device))
+            return False

         try:
             self.executor._init_from_path(
@@ -480,16 +395,17 @@ class TTSEngine(BaseEngine):
                          (self.config.voc_sess_conf.device))
             return False

-        logger.info("Initialize TTS server engine successfully on device: %s." %
-                    (self.config.voc_sess_conf.device))
-
         # warm up
         try:
             self.warm_up()
+            logger.info("Warm up successfully.")
         except Exception as e:
             logger.error("Failed to warm up on tts engine.")
             return False

+        logger.info("Initialize TTS server engine successfully on device: %s." %
+                    (self.config.voc_sess_conf.device))
+
         return True

     def warm_up(self):
@@ -499,9 +415,7 @@ class TTSEngine(BaseEngine):
         sentence = "您好,欢迎使用语音合成服务。"
         if self.config.lang == 'en':
             sentence = "Hello and welcome to the speech synthesis service."
-        logger.info(
-            "*******************************warm up ********************************"
-        )
+        logger.info("Start to warm up.")
         for i in range(3):
             for wav in self.executor.infer(
                     text=sentence,
@@ -512,9 +426,6 @@ class TTSEngine(BaseEngine):
                 f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
             )
             break
-        logger.info(
-            "**********************************************************************"
-        )

     def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
         # Convert byte to text

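The device-selection hunk above replaces the old "cpu"-substring check with an explicit fallback chain inside try/except: AM session device first, then vocoder session device, then whatever paddle reports. The same logic extracted as a standalone sketch (paddle is a dependency of this repo):

import paddle

def pick_device(am_device, voc_device):
    """Mirror of the fallback introduced above: prefer the AM session's
    device, then the vocoder session's, else paddle's current default."""
    if am_device is not None:
        return am_device
    if voc_device is not None:
        return voc_device
    return paddle.get_device()

# the engine then calls paddle.set_device(pick_device(...)) inside
# try/except so a busy or invalid device fails the whole init cleanly
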
@@ -0,0 +1,73 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# support online model
pretrained_models = {
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
'md5':
'637d28a5e53aa60275612ba4393d5f22',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_76000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_cnndecoder_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
'md5':
'6eb28e22ace73e0ebe7845f86478f89f',
'config':
'cnndecoder.yaml',
'ckpt':
'snapshot_iter_153000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'ee5f0604e20091f0d495b6ec4618b90d',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
}

@@ -22,10 +22,9 @@ import paddle
 import yaml
 from yacs.config import CfgNode

+from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
-from paddlespeech.cli.utils import download_and_decompress
-from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
@@ -37,87 +36,6 @@ from paddlespeech.t2s.modules.normalizer import ZScore

 __all__ = ['TTSEngine']

-# support online model
-pretrained_models = {
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
-        'md5':
-        '637d28a5e53aa60275612ba4393d5f22',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_76000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_cnndecoder_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip',
-        'md5':
-        '6eb28e22ace73e0ebe7845f86478f89f',
-        'config':
-        'cnndecoder.yaml',
-        'ckpt':
-        'snapshot_iter_153000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'ee5f0604e20091f0d495b6ec4618b90d',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # hifigan
-    "hifigan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'dd40a3d88dfcf64513fba2f0f961ada6',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_2500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-}
-
-model_alias = {
-    # acoustic model
-    "fastspeech2":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
-    "fastspeech2_inference":
-    "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
-    # voc
-    "mb_melgan":
-    "paddlespeech.t2s.models.melgan:MelGANGenerator",
-    "mb_melgan_inference":
-    "paddlespeech.t2s.models.melgan:MelGANInference",
-    "hifigan":
-    "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
-    "hifigan_inference":
-    "paddlespeech.t2s.models.hifigan:HiFiGANInference",
-}
-
-__all__ = ['TTSEngine']
-

 class TTSServerExecutor(TTSExecutor):
     def __init__(self, am_block, am_pad, voc_block, voc_pad):
@@ -126,6 +44,7 @@ class TTSServerExecutor(TTSExecutor):
         self.am_pad = am_pad
         self.voc_block = voc_block
         self.voc_pad = voc_pad
+        self.pretrained_models = pretrained_models

     def get_model_info(self,
                        field: str,
@@ -146,7 +65,7 @@ class TTSServerExecutor(TTSExecutor):
             [Tensor]: standard deviation
         """
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)

         if field == "am":
             odim = self.am_config.n_mels
@@ -169,22 +88,6 @@ class TTSServerExecutor(TTSExecutor):

         return model, model_mu, model_std

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
     def _init_from_path(
             self,
             am: str='fastspeech2_csmsc',
@@ -210,15 +113,15 @@ class TTSServerExecutor(TTSExecutor):
         if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
             am_res_path = self._get_pretrained_path(am_tag)
             self.am_res_path = am_res_path
-            self.am_config = os.path.join(am_res_path,
-                                          pretrained_models[am_tag]['config'])
-            self.am_ckpt = os.path.join(am_res_path,
-                                        pretrained_models[am_tag]['ckpt'])
+            self.am_config = os.path.join(
+                am_res_path, self.pretrained_models[am_tag]['config'])
+            self.am_ckpt = os.path.join(am_res_path,
+                                        self.pretrained_models[am_tag]['ckpt'])
             self.am_stat = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speech_stats'])
+                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
             # must have phones_dict in acoustic
             self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
             print("self.phones_dict:", self.phones_dict)
             logger.info(am_res_path)
             logger.info(self.am_config)
@@ -239,12 +142,12 @@ class TTSServerExecutor(TTSExecutor):
         if voc_ckpt is None or voc_config is None or voc_stat is None:
             voc_res_path = self._get_pretrained_path(voc_tag)
             self.voc_res_path = voc_res_path
-            self.voc_config = os.path.join(voc_res_path,
-                                           pretrained_models[voc_tag]['config'])
-            self.voc_ckpt = os.path.join(voc_res_path,
-                                         pretrained_models[voc_tag]['ckpt'])
+            self.voc_config = os.path.join(
+                voc_res_path, self.pretrained_models[voc_tag]['config'])
+            self.voc_ckpt = os.path.join(
+                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
             self.voc_stat = os.path.join(
-                voc_res_path, pretrained_models[voc_tag]['speech_stats'])
+                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
             logger.info(voc_res_path)
             logger.info(self.voc_config)
             logger.info(self.voc_ckpt)
@@ -286,7 +189,7 @@ class TTSServerExecutor(TTSExecutor):
                                                     self.am_ckpt, self.am_stat)
         am_normalizer = ZScore(am_mu, am_std)
         am_inference_class = dynamic_import(self.am_name + '_inference',
-                                            model_alias)
+                                            self.model_alias)
         self.am_inference = am_inference_class(am_normalizer, am)
         self.am_inference.eval()
         print("acoustic model done!")
@@ -297,7 +200,7 @@ class TTSServerExecutor(TTSExecutor):
                                                      self.voc_ckpt, self.voc_stat)
         voc_normalizer = ZScore(voc_mu, voc_std)
         voc_inference_class = dynamic_import(self.voc_name + '_inference',
-                                             model_alias)
+                                             self.model_alias)
         self.voc_inference = voc_inference_class(voc_normalizer, voc)
         self.voc_inference.eval()
         print("voc done!")
@@ -477,7 +380,7 @@ class TTSEngine(BaseEngine):
         ), "Please set correct voc_block and voc_pad, they should be more than 0."

         try:
-            if self.config.device:
+            if self.config.device is not None:
                 self.device = self.config.device
             else:
                 self.device = paddle.get_device()
@@ -513,16 +416,16 @@ class TTSEngine(BaseEngine):
                          (self.device))
             return False

-        logger.info("Initialize TTS server engine successfully on device: %s." %
-                    (self.device))
-
         # warm up
         try:
             self.warm_up()
+            logger.info("Warm up successfully.")
         except Exception as e:
             logger.error("Failed to warm up on tts engine.")
             return False

+        logger.info("Initialize TTS server engine successfully on device: %s." %
+                    (self.device))
+
         return True

     def warm_up(self):
@@ -532,9 +435,7 @@ class TTSEngine(BaseEngine):
         sentence = "您好,欢迎使用语音合成服务。"
         if self.config.lang == 'en':
             sentence = "Hello and welcome to the speech synthesis service."
-        logger.info(
-            "*******************************warm up ********************************"
-        )
+        logger.info("Start to warm up.")
         for i in range(3):
             for wav in self.executor.infer(
                     text=sentence,
@@ -545,9 +446,6 @@ class TTSEngine(BaseEngine):
                 f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
             )
             break
-        logger.info(
-            "**********************************************************************"
-        )

     def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
         # Convert byte to text

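Several hunks above swap the module-level model_alias for self.model_alias in dynamic_import calls; the removed table maps short names to "module.path:ClassName" strings. Below is a sketch of that lookup, assuming dynamic_import resolves the colon notation this way (the repo's own implementation is not shown in this diff):

import importlib

def dynamic_import_sketch(name: str, alias: dict):
    """Resolve "pkg.module:ClassName" entries like those in the removed
    model_alias table above."""
    module_path, class_name = alias[name].split(":")
    return getattr(importlib.import_module(module_path), class_name)

# e.g.
# alias = {"fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2"}
# model_class = dynamic_import_sketch("fastspeech2", alias)
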
@@ -0,0 +1,87 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Static model applied on paddle inference
pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip',
'md5':
'f10cbdedf47dc7a9668d2264494e1823',
'model':
'speedyspeech_csmsc.pdmodel',
'params':
'speedyspeech_csmsc.pdiparams',
'phones_dict':
'phone_id_map.txt',
'tones_dict':
'tone_id_map.txt',
'sample_rate':
24000,
},
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip',
'md5':
'9788cd9745e14c7a5d12d32670b2a5a7',
'model':
'fastspeech2_csmsc.pdmodel',
'params':
'fastspeech2_csmsc.pdiparams',
'phones_dict':
'phone_id_map.txt',
'sample_rate':
24000,
},
# pwgan
"pwgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip',
'md5':
'e3504aed9c5a290be12d1347836d2742',
'model':
'pwgan_csmsc.pdmodel',
'params':
'pwgan_csmsc.pdiparams',
'sample_rate':
24000,
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip',
'md5':
'ac6eee94ba483421d750433f4c3b8d36',
'model':
'mb_melgan_csmsc.pdmodel',
'params':
'mb_melgan_csmsc.pdiparams',
'sample_rate':
24000,
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip',
'md5':
'7edd8c436b3a5546b3a7cb8cff9d5a0c',
'model':
'hifigan_csmsc.pdmodel',
'params':
'hifigan_csmsc.pdiparams',
'sample_rate':
24000,
},
}

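Each entry in the static-model table above pairs a *.pdmodel graph with *.pdiparams weights for the paddle-inference engine. A minimal sketch of loading such a pair with the Paddle Inference API (file paths are illustrative; in the engine they come from the table):

from paddle.inference import Config, create_predictor

config = Config("fastspeech2_csmsc.pdmodel", "fastspeech2_csmsc.pdiparams")
config.disable_gpu()  # or config.enable_use_gpu(mem_mb, gpu_id)
predictor = create_predictor(config)

# feed inputs by name, then run
input_names = predictor.get_input_names()
handle = predictor.get_input_handle(input_names[0])
# handle.copy_from_cpu(phone_ids)  # numpy array of token ids
# predictor.run()
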
@@ -23,10 +23,9 @@ import paddle
 import soundfile as sf
 from scipy.io import wavfile

+from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
-from paddlespeech.cli.utils import download_and_decompress
-from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import change_speed
 from paddlespeech.server.utils.errors import ErrorCode
@ -38,101 +37,11 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine'] __all__ = ['TTSEngine']
-# Static model applied on paddle inference
-pretrained_models = { ... }  # (deleted block: identical to the pretrained_models dict shown above, now moved to pretrained_models.py)
class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
-        pass
+        self.pretrained_models = pretrained_models

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
-            tag)
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
    def _init_from_path(
            self,
@@ -161,14 +70,14 @@ class TTSServerExecutor(TTSExecutor):
        if am_model is None or am_params is None or phones_dict is None:
            am_res_path = self._get_pretrained_path(am_tag)
            self.am_res_path = am_res_path
-            self.am_model = os.path.join(am_res_path,
-                                         pretrained_models[am_tag]['model'])
-            self.am_params = os.path.join(am_res_path,
-                                          pretrained_models[am_tag]['params'])
+            self.am_model = os.path.join(
+                am_res_path, self.pretrained_models[am_tag]['model'])
+            self.am_params = os.path.join(
+                am_res_path, self.pretrained_models[am_tag]['params'])
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
-            self.am_sample_rate = pretrained_models[am_tag]['sample_rate']
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
+            self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate']

            logger.info(am_res_path)
            logger.info(self.am_model)
@@ -183,17 +92,17 @@ class TTSServerExecutor(TTSExecutor):
        # for speedyspeech
        self.tones_dict = None
-        if 'tones_dict' in pretrained_models[am_tag]:
+        if 'tones_dict' in self.pretrained_models[am_tag]:
            self.tones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['tones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['tones_dict'])
            if tones_dict:
                self.tones_dict = tones_dict

        # for multi speaker fastspeech2
        self.speaker_dict = None
-        if 'speaker_dict' in pretrained_models[am_tag]:
+        if 'speaker_dict' in self.pretrained_models[am_tag]:
            self.speaker_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speaker_dict'])
+                am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
            if speaker_dict:
                self.speaker_dict = speaker_dict
@@ -202,11 +111,12 @@ class TTSServerExecutor(TTSExecutor):
        if voc_model is None or voc_params is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
-            self.voc_model = os.path.join(voc_res_path,
-                                          pretrained_models[voc_tag]['model'])
-            self.voc_params = os.path.join(voc_res_path,
-                                           pretrained_models[voc_tag]['params'])
-            self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate']
+            self.voc_model = os.path.join(
+                voc_res_path, self.pretrained_models[voc_tag]['model'])
+            self.voc_params = os.path.join(
+                voc_res_path, self.pretrained_models[voc_tag]['params'])
+            self.voc_sample_rate = self.pretrained_models[voc_tag][
+                'sample_rate']
            logger.info(voc_res_path)
            logger.info(self.voc_model)
            logger.info(self.voc_params)
@@ -352,8 +262,24 @@ class TTSEngine(BaseEngine):
    def init(self, config: dict) -> bool:
        self.executor = TTSServerExecutor()
        self.config = config
try:
if self.config.am_predictor_conf.device is not None:
self.device = self.config.am_predictor_conf.device
elif self.config.voc_predictor_conf.device is not None:
self.device = self.config.voc_predictor_conf.device
else:
self.device = paddle.get_device()
paddle.set_device(self.device)
except BaseException as e:
logger.error(
"Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
)
logger.error("Initialize TTS server engine Failed on device: %s." %
(self.device))
return False
        self.executor._init_from_path(
            am=self.config.am,
            am_model=self.config.am_model,
@@ -370,9 +296,35 @@ class TTSEngine(BaseEngine):
            am_predictor_conf=self.config.am_predictor_conf,
            voc_predictor_conf=self.config.voc_predictor_conf, )
# warm up
try:
self.warm_up()
logger.info("Warm up successfully.")
except Exception as e:
logger.error("Failed to warm up on tts engine.")
return False
logger.info("Initialize TTS server engine successfully.") logger.info("Initialize TTS server engine successfully.")
return True return True
def warm_up(self):
"""warm up
"""
if self.config.lang == 'zh':
sentence = "您好,欢迎使用语音合成服务。"
if self.config.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
logger.info("Start to warm up.")
for i in range(3):
st = time.time()
self.executor.infer(
text=sentence,
lang=self.config.lang,
am=self.config.am,
spk_id=0, )
            logger.info(
                f"The response time of warm-up round {i}: {time.time() - st} s")
    def postprocess(self,
                    wav,
                    original_fs: int,

@@ -51,15 +51,15 @@ class TTSEngine(BaseEngine):
    def init(self, config: dict) -> bool:
        self.executor = TTSServerExecutor()
+        self.config = config
        try:
-            self.config = config
-            if self.config.device:
+            if self.config.device is not None:
                self.device = self.config.device
            else:
                self.device = paddle.get_device()
            paddle.set_device(self.device)
-        except BaseException:
+        except BaseException as e:
            logger.error(
                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
            )
@@ -87,10 +87,36 @@ class TTSEngine(BaseEngine):
                         (self.device))
            return False
# warm up
try:
self.warm_up()
logger.info("Warm up successfully.")
except Exception as e:
logger.error("Failed to warm up on tts engine.")
return False
logger.info("Initialize TTS server engine successfully on device: %s." % logger.info("Initialize TTS server engine successfully on device: %s." %
(self.device)) (self.device))
return True return True
def warm_up(self):
"""warm up
"""
if self.config.lang == 'zh':
sentence = "您好,欢迎使用语音合成服务。"
if self.config.lang == 'en':
sentence = "Hello and welcome to the speech synthesis service."
logger.info("Start to warm up.")
for i in range(3):
st = time.time()
self.executor.infer(
text=sentence,
lang=self.config.lang,
am=self.config.am,
spk_id=0, )
            logger.info(
                f"The response time of warm-up round {i}: {time.time() - st} s")
    def postprocess(self,
                    wav,
                    original_fs: int,

@@ -128,7 +128,7 @@ def tts(request_body: TTSRequest):
    return response


-@router.post("/paddlespeech/streaming/tts")
+@router.post("/paddlespeech/tts/streaming")
async def stream_tts(request_body: TTSRequest):
    text = request_body.text
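
A quick manual check of the renamed endpoint; the JSON fields mirror the request parameters used by the clients later in this PR, while the host and port (127.0.0.1:8092) are assumptions:

import base64
import json

import requests

params = {
    "text": "您好,欢迎使用语音合成服务。",
    "spk_id": 0,
    "speed": 1.0,
    "volume": 1.0,
    "sample_rate": 0,  # 0 keeps the model's native sample rate
    "save_path": ""
}
url = "http://127.0.0.1:8092/paddlespeech/tts/streaming"
pcm = b""
# each streamed chunk is base64-encoded 16-bit mono PCM at 24 kHz
with requests.post(url, json.dumps(params), stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=None):
        pcm += base64.b64decode(chunk)
print(f"received {len(pcm) / 2.0 / 24000:.2f} s of audio")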

@@ -14,6 +14,7 @@
import argparse

from paddlespeech.server.utils.audio_handler import TTSHttpHandler
+from paddlespeech.server.utils.util import compute_delay

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
@@ -43,5 +44,25 @@ if __name__ == "__main__":
    print("tts http client start")
    handler = TTSHttpHandler(args.server, args.port, args.play)
-    handler.run(args.text, args.spk_id, args.speed, args.volume,
-                args.sample_rate, args.output)
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+        args.text, args.spk_id, args.speed, args.volume, args.sample_rate,
+        args.output)
delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
print(f"sentence: {args.text}")
print(f"duration: {duration} s")
print(f"first response: {first_response} s")
print(f"final response: {final_response} s")
print(f"RTF: {final_response/duration}")
if args.output is not None:
if save_audio_success:
print(f"Audio successfully saved in {args.output}")
else:
print("Audio save failed.")
if delay_time_list != []:
print(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
print("The sentence has no delay in streaming synthesis.")

@@ -15,6 +15,7 @@ import argparse
import asyncio

from paddlespeech.server.utils.audio_handler import TTSWsHandler
+from paddlespeech.server.utils.util import compute_delay

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
@@ -35,4 +36,24 @@ if __name__ == "__main__":
    print("tts websocket client start")
    handler = TTSWsHandler(args.server, args.port, args.play)
    loop = asyncio.get_event_loop()
-    loop.run_until_complete(handler.run(args.text, args.output))
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+        handler.run(args.text, args.output))
delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
print(f"sentence: {args.text}")
print(f"duration: {duration} s")
print(f"first response: {first_response} s")
print(f"final response: {final_response} s")
print(f"RTF: {final_response/duration}")
if args.output is not None:
if save_audio_success:
print(f"Audio successfully saved in {args.output}")
else:
print("Audio save failed.")
if delay_time_list != []:
print(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
print("The sentence has no delay in streaming synthesis.")

@@ -259,7 +259,8 @@ class TTSWsHandler:
        """
        self.server = server
        self.port = port
-        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
+        self.url = "ws://" + self.server + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming"
        self.play = play
        if self.play:
            import pyaudio
@@ -295,6 +296,8 @@ class TTSWsHandler:
            output (str): save audio path
        """
        all_bytes = b''
+        receive_time_list = []
+        chunk_duration_list = []

        # 1. Send websocket handshake protocol
        async with websockets.connect(self.url) as ws:
@@ -309,14 +312,15 @@ class TTSWsHandler:
            # 3. Process the received response
            message = await ws.recv()
-            logger.info(f"sentence: {text}")
-            logger.info(f"first response: {time.time() - st} s")
+            first_response = time.time() - st
            message = json.loads(message)
            status = message["status"]

            while (status == 1):
+                receive_time_list.append(time.time())
                audio = message["audio"]
                audio = base64.b64decode(audio)  # bytes
+                chunk_duration_list.append(len(audio) / 2.0 / 24000)
                all_bytes += audio
                if self.play:
                    self.mutex.acquire()
@@ -334,15 +338,11 @@ class TTSWsHandler:
                if status == 2:
                    final_response = time.time() - st
                    duration = len(all_bytes) / 2.0 / 24000
-                    logger.info(f"final response: {final_response} s")
-                    logger.info(f"audio duration: {duration} s")
-                    logger.info(f"RTF: {final_response / duration}")
                    if output is not None:
-                        if save_audio(all_bytes, output):
-                            logger.info(f"audio saved to: {output}")
-                        else:
-                            logger.error("save audio error")
+                        save_audio_success = save_audio(all_bytes, output)
+                    else:
+                        save_audio_success = False
                else:
                    logger.error("infer error")
@@ -352,6 +352,8 @@ class TTSWsHandler:
            self.stream.close()
            self.p.terminate()

+        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list

class TTSHttpHandler:
    def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
@@ -365,7 +367,7 @@ class TTSHttpHandler:
        self.server = server
        self.port = port
        self.url = "http://" + str(self.server) + ":" + str(
-            self.port) + "/paddlespeech/streaming/tts"
+            self.port) + "/paddlespeech/tts/streaming"
        self.play = play

        if self.play:
@@ -423,13 +425,16 @@ class TTSHttpHandler:
        all_bytes = b''
        first_flag = 1
+        receive_time_list = []
+        chunk_duration_list = []

        # 2. Send request
        st = time.time()
        html = requests.post(self.url, json.dumps(params), stream=True)

        # 3. Process the received response
-        for chunk in html.iter_content(chunk_size=1024):
+        for chunk in html.iter_content(chunk_size=None):
+            receive_time_list.append(time.time())
            audio = base64.b64decode(chunk)  # bytes
            if first_flag:
                first_response = time.time() - st
@@ -443,21 +448,15 @@ class TTSHttpHandler:
                    self.t.start()
                    self.start_play = False
            all_bytes += audio
+            chunk_duration_list.append(len(audio) / 2.0 / 24000)

        final_response = time.time() - st
        duration = len(all_bytes) / 2.0 / 24000
-        logger.info(f"sentence: {text}")
-        logger.info(f"first response: {first_response} s")
-        logger.info(f"final response: {final_response} s")
-        logger.info(f"audio duration: {duration} s")
-        logger.info(f"RTF: {final_response / duration}")

        if output is not None:
-            if save_audio(all_bytes, output):
-                logger.info(f"audio saved to: {output}")
-            else:
-                logger.error("save audio error")
+            save_audio_success = save_audio(all_bytes, output)
+        else:
+            save_audio_success = False

        if self.play:
            self.t.join()
@@ -465,6 +464,8 @@ class TTSHttpHandler:
            self.stream.close()
            self.p.terminate()

+        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list

class VectorHttpHandler:
    def __init__(self, server_ip=None, port=None):

@@ -75,3 +75,74 @@ def get_chunks(data, block_size, pad_size, step):
    else:
        print("Please set correct type to get chunks, am or voc")
    return chunks
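
For intuition, a simplified re-derivation of the block/pad chunking that get_chunks performs; this sketch is an assumption based on the signature above, not the library code itself:

def chunks_with_padding(n_frames: int, block: int, pad: int):
    # Yield (start, end) ranges: each chunk covers one block plus up to
    # `pad` frames of context on each side, clipped at the boundaries.
    for block_start in range(0, n_frames, block):
        start = max(0, block_start - pad)
        end = min(n_frames, block_start + block + pad)
        yield start, end

# e.g. with the streaming values used in this PR (voc_block=36, voc_pad=14):
# list(chunks_with_padding(100, 36, 14)) == [(0, 50), (22, 86), (58, 100)]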
def compute_delay(receive_time_list, chunk_duration_list):
"""compute delay
Args:
receive_time_list (list): Time to receive each packet
chunk_duration_list (list): The audio duration corresponding to each packet
Returns:
[list]: Delay time list
"""
assert (len(receive_time_list) == len(chunk_duration_list))
delay_time_list = []
play_time = receive_time_list[0] + chunk_duration_list[0]
for i in range(1, len(receive_time_list)):
receive_time = receive_time_list[i]
delay_time = receive_time - play_time
        # the packet is delayed
if delay_time > 0:
play_time = play_time + delay_time + chunk_duration_list[i]
delay_time_list.append(delay_time)
        # the packet is not delayed
else:
play_time = play_time + chunk_duration_list[i]
return delay_time_list
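
A small usage sketch with made-up timings; chunk durations are computed by the handlers above as bytes / 2.0 / 24000, i.e. seconds of 16-bit mono PCM at 24 kHz:

# three packets carrying 0.45 s of audio each
receive_time_list = [0.30, 0.80, 1.40]
chunk_duration_list = [0.45, 0.45, 0.45]
# packet 0 finishes playing at 0.75 s; packet 1 arrives 0.05 s late and
# finishes at 1.25 s; packet 2 then arrives 0.15 s late.
print(compute_delay(receive_time_list, chunk_duration_list))
# -> approximately [0.05, 0.15]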
def count_engine(logfile: str="./nohup.out"):
"""For inference on the statistical engine side
Args:
logfile (str, optional): server log. Defaults to "./nohup.out".
"""
first_response_list = []
final_response_list = []
duration_list = []
with open(logfile, "r") as f:
for line in f.readlines():
if "- first response time:" in line:
                first_response = float(line.split(" ")[-2])
first_response_list.append(first_response)
elif "- final response time:" in line:
                final_response = float(line.split(" ")[-2])
final_response_list.append(final_response)
elif "- The durations of audio is:" in line:
                duration = float(line.split(" ")[-2])
duration_list.append(duration)
assert (len(first_response_list) == len(final_response_list) and
len(final_response_list) == len(duration_list))
avg_first_response = sum(first_response_list) / len(first_response_list)
avg_final_response = sum(final_response_list) / len(final_response_list)
avg_duration = sum(duration_list) / len(duration_list)
RTF = sum(final_response_list) / sum(duration_list)
print(
"************************* engine result ***************************************"
)
print(
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
)
print(
f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
)
print(
f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
)
print(
f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
)
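
A sanity check for count_engine with hypothetical log lines containing the three markers it greps for (the surrounding text is arbitrary):

with open("./nohup.out", "w") as f:
    f.write("xxx - first response time: 0.1 s\n"
            "xxx - final response time: 0.5 s\n"
            "xxx - The durations of audio is: 2.0 s\n")
count_engine("./nohup.out")  # test num: 1, RTF = 0.5 / 2.0 = 0.25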

@@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool
router = APIRouter()

-@router.websocket('/ws/tts')
+@router.websocket('/paddlespeech/tts/streaming')
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
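
For reference, a minimal standalone client for the renamed websocket route. The message shapes (base64 text in, JSON with "status" and "audio" out, status 0/1/2 for accepted/streaming/finished) follow the TTSWsHandler logic above; whether the final message also carries audio is handler-dependent, and the host and port are assumptions:

import asyncio
import base64
import json

import websockets


async def synthesize(text: str,
                     url="ws://127.0.0.1:8092/paddlespeech/tts/streaming"):
    pcm = b""
    async with websockets.connect(url) as ws:
        text_b64 = str(base64.b64encode(text.encode("utf-8")), "utf8")
        await ws.send(json.dumps({"text": text_b64}))
        while True:
            msg = json.loads(await ws.recv())
            status = msg["status"]
            if status == 1:  # streaming chunk of base64 PCM
                pcm += base64.b64decode(msg["audio"])
            elif status == 2:  # end of stream
                return pcm
            elif status != 0:  # 0 means the connection was accepted
                raise RuntimeError("infer error")

# pcm = asyncio.get_event_loop().run_until_complete(synthesize("您好"))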

@@ -19,7 +19,7 @@ def change_device(yamlfile: str, engine: str, device: str):
    if device == 'cpu':
        set_device = 'cpu'
    elif device == 'gpu':
-        set_device = 'gpu:0'
+        set_device = 'gpu:3'
    else:
        print("Please set correct device: cpu or gpu.")

@@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for PaddleSpeech Offline Serving.

#################################################################################
#                             SERVER SETTING                                    #
@@ -7,8 +7,8 @@ host: 127.0.0.1
port: 8090

# The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
-protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python']

@@ -39,6 +39,7 @@ ClientTest(){
    ((test_times+=1))
    paddlespeech_client cls --server_ip $server_ip --port $port --input ./zh.wav
+    ((test_times+=1))
}

GetTestResult() {
@@ -58,6 +59,7 @@ rm -rf log/server.log.wf
rm -rf log/server.log
rm -rf log/test_result.log

+cp ../../../../demos/speech_server/conf/application.yaml ./conf/
config_file=./conf/application.yaml
server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
@@ -191,5 +193,4 @@ echo "***************** Here are all the test results ********************"
cat ./log/test_result.log

# Restoring conf is the same as demos/speech_server
-rm -rf ./conf
-cp ../../../demos/speech_server/conf/ ./ -rf
+cp ../../../../demos/speech_server/conf/application.yaml ./conf/

@@ -39,9 +39,9 @@ tts_online:
    # others
    lang: 'zh'
    device: 'cpu' # set 'gpu:id' or 'cpu'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
@@ -80,9 +80,9 @@ tts_online-onnx:
    # others
    lang: 'zh'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
    voc_upsample: 300
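
Back-of-envelope granularity under the new values, assuming each vocoder frame is upsampled by voc_upsample samples:

voc_block, voc_upsample, sr = 36, 300, 24000
chunk_seconds = voc_block * voc_upsample / sr  # 36 * 300 / 24000 = 0.45 s per streamed chunk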

@@ -1,100 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import json
import os
import time
import requests
from paddlespeech.server.utils.audio_process import pcm2wav
def save_audio(buffer, audio_path) -> bool:
if args.save_path.endswith("pcm"):
with open(args.save_path, "wb") as f:
f.write(buffer)
elif args.save_path.endswith("wav"):
with open("./tmp.pcm", "wb") as f:
f.write(buffer)
pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
os.system("rm ./tmp.pcm")
else:
print("Only supports saved audio format is pcm or wav")
return False
return True
def test(args):
params = {
"text": args.text,
"spk_id": args.spk_id,
"speed": args.speed,
"volume": args.volume,
"sample_rate": args.sample_rate,
"save_path": ''
}
buffer = b''
flag = 1
url = "http://" + str(args.server) + ":" + str(
args.port) + "/paddlespeech/streaming/tts"
st = time.time()
html = requests.post(url, json.dumps(params), stream=True)
for chunk in html.iter_content(chunk_size=1024):
chunk = base64.b64decode(chunk) # bytes
if flag:
first_response = time.time() - st
print(f"首包响应:{first_response} s")
flag = 0
buffer += chunk
final_response = time.time() - st
duration = len(buffer) / 2.0 / 24000
print(f"尾包响应:{final_response} s")
print(f"音频时长:{duration} s")
print(f"RTF: {final_response / duration}")
if args.save_path is not None:
if save_audio(buffer, args.save_path):
print("音频保存至:", args.save_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
'--text',
type=str,
default="您好,欢迎使用语音合成服务。",
help='A sentence to be synthesized')
parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
parser.add_argument(
'--volume', type=float, default=1.0, help='Audio volume')
parser.add_argument(
'--sample_rate',
type=int,
default=0,
help='Sampling rate, the default is the same as the model')
parser.add_argument(
"--server", type=str, help="server ip", default="127.0.0.1")
parser.add_argument("--port", type=int, help="server port", default=8092)
parser.add_argument(
"--save_path", type=str, help="save audio path", default=None)
args = parser.parse_args()
test(args)

@@ -10,7 +10,6 @@ bash test.sh tts_online $log_all_dir/log_tts_online_cpu
python change_yaml.py --change_type engine_type --target_key engine_list --target_value tts_online-onnx
bash test.sh tts_online-onnx $log_all_dir/log_tts_online-onnx_cpu

-python change_yaml.py --change_type device --target_key device --target_value gpu:3
bash test.sh tts_online $log_all_dir/log_tts_online_gpu

@@ -39,9 +39,9 @@ tts_online:
    # others
    lang: 'zh'
    device: 'cpu' # set 'gpu:id' or 'cpu'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
@@ -80,9 +80,9 @@ tts_online-onnx:
    # others
    lang: 'zh'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
    voc_upsample: 300

@@ -1,126 +0,0 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import _thread as thread
import argparse
import base64
import json
import ssl
import time
import websocket
flag = 1
st = 0.0
all_bytes = b''
class WsParam(object):
    # initialize
def __init__(self, text, server="127.0.0.1", port=8090):
self.server = server
self.port = port
self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
self.text = text
    # generate the url
def create_url(self):
return self.url
def on_message(ws, message):
global flag
global st
global all_bytes
try:
message = json.loads(message)
audio = message["audio"]
audio = base64.b64decode(audio) # bytes
status = message["status"]
all_bytes += audio
if status == 0:
print("create successfully.")
elif status == 1:
if flag:
print(f"首包响应:{time.time() - st} s")
flag = 0
elif status == 2:
final_response = time.time() - st
duration = len(all_bytes) / 2.0 / 24000
print(f"尾包响应:{final_response} s")
print(f"音频时长:{duration} s")
print(f"RTF: {final_response / duration}")
with open("./out.pcm", "wb") as f:
f.write(all_bytes)
print("ws is closed")
ws.close()
else:
print("infer error")
except Exception as e:
print("receive msg,but parse exception:", e)
# handle a websocket error
def on_error(ws, error):
print("### error:", error)
# handle the websocket closing
def on_close(ws):
print("### closed ###")
# handle the websocket connection being established
def on_open(ws):
def run(*args):
global st
text_base64 = str(
base64.b64encode((wsParam.text).encode('utf-8')), "UTF8")
d = {"text": text_base64}
d = json.dumps(d)
print("Start sending text data")
st = time.time()
ws.send(d)
thread.start_new_thread(run, ())
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--text",
type=str,
help="A sentence to be synthesized",
default="您好,欢迎使用语音合成服务。")
parser.add_argument(
"--server", type=str, help="server ip", default="127.0.0.1")
parser.add_argument("--port", type=int, help="server port", default=8092)
args = parser.parse_args()
print("***************************************")
print("Server ip: ", args.server)
print("Server port: ", args.port)
print("Sentence to be synthesized: ", args.text)
print("***************************************")
wsParam = WsParam(text=args.text, server=args.server, port=args.port)
websocket.enableTrace(False)
wsUrl = wsParam.create_url()
ws = websocket.WebSocketApp(
wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
ws.on_open = on_open
ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

@@ -12,117 +12,35 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
-import base64
-import json
+import asyncio
import os
-import time

-import requests
-from paddlespeech.server.utils.audio_process import pcm2wav
+from paddlespeech.server.utils.util import compute_delay
from paddlespeech.t2s.exps.syn_utils import get_sentences


-def save_audio(buffer, audio_path) -> bool:
-    if audio_path.endswith("pcm"):
-        with open(audio_path, "wb") as f:
-            f.write(buffer)
-    elif audio_path.endswith("wav"):
-        with open("./tmp.pcm", "wb") as f:
-            f.write(buffer)
-        pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
-        os.system("rm ./tmp.pcm")
-    else:
-        print("Only supports saved audio format is pcm or wav")
-        return False
-    return True


def test(args, text, utt_id):
-    params = {
-        "text": text,
-        "spk_id": args.spk_id,
-        "speed": args.speed,
-        "volume": args.volume,
-        "sample_rate": args.sample_rate,
-        "save_path": ''
-    }
-    buffer = b''
-    flag = 1
-    url = "http://" + str(args.server) + ":" + str(
-        args.port) + "/paddlespeech/streaming/tts"
-    st = time.time()
-    html = requests.post(url, json.dumps(params), stream=True)
-    for chunk in html.iter_content(chunk_size=1024):
-        chunk = base64.b64decode(chunk)  # bytes
-        if flag:
-            first_response = time.time() - st
-            print(f"first response: {first_response} s")
-            flag = 0
-        buffer += chunk
-    final_response = time.time() - st
-    duration = len(buffer) / 2.0 / 24000
-    print(f"sentence: {text}")
-    print(f"final response: {final_response} s")
-    print(f"audio duration: {duration} s")
-    print(f"RTF: {final_response / duration}")
-    save_path = str(args.output_dir + "/" + utt_id + ".wav")
-    save_audio(buffer, save_path)
-    print("audio saved to:", save_path)
-
-    return first_response, final_response, duration
-
-
-def count_engine(logfile: str="./nohup.out"):
-    ...  # (deleted: identical to the count_engine() just added to paddlespeech/server/utils/util.py above)
+    output = str(args.output_dir + "/" + utt_id + ".wav")
+    if args.protocol == "http":
+        print("tts http client start")
+        from paddlespeech.server.utils.audio_handler import TTSHttpHandler
+        handler = TTSHttpHandler(args.server_ip, args.port, args.play)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+            text, args.spk_id, args.speed, args.volume, args.sample_rate,
+            output)
+    elif args.protocol == "websocket":
+        from paddlespeech.server.utils.audio_handler import TTSWsHandler
+        print("tts websocket client start")
+        handler = TTSWsHandler(args.server_ip, args.port, args.play)
+        loop = asyncio.get_event_loop()
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+            handler.run(text, output))
+    else:
+        print("Please set correct protocol, http or websocket")
+
+    return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list


if __name__ == "__main__":
@@ -142,10 +60,18 @@ if __name__ == "__main__":
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
-        "--server", type=str, help="server ip", default="127.0.0.1")
+        "--server_ip", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
+    parser.add_argument(
+        "--protocol",
+        type=str,
+        choices=['http', 'websocket'],
+        help="server protocol",
+        default="http")
    parser.add_argument(
        "--output_dir", type=str, default="./output", help="output dir")
+    parser.add_argument(
+        "--play", type=bool, help="whether to play audio", default=False)

    args = parser.parse_args()
@@ -155,13 +81,35 @@ if __name__ == "__main__":
    first_response_list = []
    final_response_list = []
    duration_list = []
+    all_delay_list = []
+    packet_count = 0.0

    sentences = get_sentences(text_file=args.text, lang="zh")
    for utt_id, sentence in sentences:
-        first_response, final_response, duration = test(args, sentence, utt_id)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = test(
+            args, sentence, utt_id)
+        delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
        first_response_list.append(first_response)
        final_response_list.append(final_response)
        duration_list.append(duration)
+        packet_count += len(receive_time_list)
+
+        print(f"sentence: {sentence}")
+        print(f"first response time: {first_response} s")
+        print(f"final response time: {final_response} s")
+        print(f"audio duration: {duration} s")
+        print(f"RTF of this sentence: {final_response/duration}")
+        if delay_time_list != []:
+            for t in delay_time_list:
+                all_delay_list.append(t)
+            print(
+                f"Delay in this sentence's streaming synthesis: total packets: {len(receive_time_list)}, delayed packets: {len(delay_time_list)}, minimum delay: {min(delay_time_list)} s, maximum delay: {max(delay_time_list)} s, average delay: {sum(delay_time_list)/len(delay_time_list)} s, delay rate: {len(delay_time_list)/len(receive_time_list)}"
+            )
+        else:
+            print("This sentence has no delay in streaming synthesis.")

    assert (len(first_response_list) == len(final_response_list) and
            len(final_response_list) == len(duration_list))
@@ -170,19 +118,35 @@ avg_first_response = sum(first_response_list) / len(first_response_list)
    avg_final_response = sum(final_response_list) / len(final_response_list)
    avg_duration = sum(duration_list) / len(duration_list)
    RTF = sum(final_response_list) / sum(duration_list)
+    if all_delay_list != []:
+        delay_count = len(all_delay_list)
+        avg_delay = sum(all_delay_list) / len(all_delay_list)
+        delay_ratio = len(all_delay_list) / packet_count
+        min_delay = min(all_delay_list)
+        max_delay = max(all_delay_list)
+    else:
+        delay_count = 0.0
+        avg_delay = 0.0
+        delay_ratio = 0.0
+        min_delay = 0.0
+        max_delay = 0.0

    print(
        "************************* server/client result ***************************************"
    )
    print(
-        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
+        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration} s, RTF: {RTF}."
    )
+    print(
+        f"test num: {len(duration_list)}, packet count: {packet_count}, delay count: {delay_count}, avg delay time: {avg_delay} s, delay ratio: {delay_ratio}"
+    )
    print(
        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
    )
    print(
-        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
+        f"min first response: {min(first_response_list)} s, max first response: {max(first_response_list)} s."
    )
    print(
-        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
+        f"min final response: {min(final_response_list)} s, max final response: {max(final_response_list)} s."
    )
+    print(f"min delay: {min_delay} s, max delay: {max_delay} s")
