improve server code, test=doc

3 years ago · a6934228ce
parent 8cb2199181
commit a6934228ce
30 changed files with 812 additions and 706 deletions
--- a/demos/streaming_tts_server/conf/tts_online_application.yaml
+++ b/demos/streaming_tts_server/conf/tts_online_application.yaml
@ -43,12 +43,12 @@ tts_online:
    device: 'cpu' # set 'gpu:id' or 'cpu'
    # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming synthesized audio
-    am_block: 42
+    am_block: 72
    am_pad: 12
    # voc_pad and voc_block are used by the voc model for streaming voc inference.
    # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming synthesized audio; voc_pad can be reduced to a minimum of 7, at which the streaming synthesized audio still sounds normal
    # when the voc model is hifigan_csmsc and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming synthesized audio; with voc_pad set to 14 the streaming synthesized audio still sounds normal
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
@ -91,12 +91,12 @@ tts_online-onnx:
    lang: 'zh'
    # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming synthesized audio
-    am_block: 42
+    am_block: 72
    am_pad: 12
    # voc_pad and voc_block are used by the voc model for streaming voc inference.
    # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming synthesized audio; voc_pad can be reduced to a minimum of 7, at which the streaming synthesized audio still sounds normal
    # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming synthesized audio; with voc_pad set to 14 the streaming synthesized audio still sounds normal
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@ -31,6 +31,7 @@ from ..util import stats_wrapper
 from paddlespeech.cli.log import logger
 from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
 from paddlespeech.server.utils.audio_process import wav2pcm
 from paddlespeech.server.utils.util import compute_delay
 from paddlespeech.server.utils.util import wav2base64
 __all__ = [
@ -221,7 +222,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
        play = args.play
        try:
-            res = self(
+            self(
                input=input_,
                server_ip=server_ip,
                port=port,
@ -257,17 +258,42 @@ class TTSOnlineClientExecutor(BaseExecutor):
            logger.info("tts http client start")
            from paddlespeech.server.utils.audio_handler import TTSHttpHandler
            handler = TTSHttpHandler(server_ip, port, play)
-            handler.run(input, spk_id, speed, volume, sample_rate, output)
+            first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
                input, spk_id, speed, volume, sample_rate, output)
            delay_time_list = compute_delay(receive_time_list,
                                            chunk_duration_list)
        elif protocol == "websocket":
            from paddlespeech.server.utils.audio_handler import TTSWsHandler
            logger.info("tts websocket client start")
            handler = TTSWsHandler(server_ip, port, play)
            loop = asyncio.get_event_loop()
-            loop.run_until_complete(handler.run(input, output))
+            first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
                handler.run(input, output))
            delay_time_list = compute_delay(receive_time_list,
                                            chunk_duration_list)
        else:
            logger.error("Please set correct protocol, http or websocket")
            return False
        logger.info(f"sentence: {input}")
        logger.info(f"duration: {duration} s")
        logger.info(f"first response: {first_response} s")
        logger.info(f"final response: {final_response} s")
        logger.info(f"RTF: {final_response/duration}")
        if output is not None:
            if save_audio_success:
                logger.info(f"Audio successfully saved in {output}")
            else:
                logger.error("Audio save failed.")
        if delay_time_list != []:
            logger.info(
                f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
            )
        else:
            logger.info("The sentence has no delay in streaming synthesis.")
@cli_client_register(
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for PaddleSpeech Offline Serving.
 #################################################################################
 #                             SERVER SETTING                                    #
@ -7,9 +7,7 @@ host: 127.0.0.1
 port: 8090
 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
 # protocol = ['websocket', 'http'] (only one can be selected). 
 # http only supports the offline engine types.
 protocol: 'http'
 engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
@ -50,24 +48,6 @@ asr_inference:
        summary: True  # False -> do not show predictor config
 ################### speech task: asr; engine_type: online #######################
 asr_online:
    model_type: 'deepspeech2online_aishell'
    am_model: # the pdmodel file of am static model [optional]
    am_params:  # the pdiparams file of am static model [optional]
    lang: 'zh'
    sample_rate: 16000
    cfg_path: 
    decode_method: 
    force_yes: True
    am_predictor_conf:
        device:  # set 'gpu:id' or 'cpu'
        switch_ir_optim: True
        glog_info: False  # True -> print glog
        summary: True  # False -> do not show predictor config
 ################################### TTS #########################################
 ################### speech task: tts; engine_type: python #######################
 tts_python: 
--- a/paddlespeech/server/conf/tts_online_application.yaml
+++ b/paddlespeech/server/conf/tts_online_application.yaml
@ -43,12 +43,12 @@ tts_online:
    device: 'cpu' # set 'gpu:id' or 'cpu'
    # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming synthesized audio
-    am_block: 42
+    am_block: 72
    am_pad: 12
    # voc_pad and voc_block are used by the voc model for streaming voc inference.
    # when the voc model is mb_melgan_csmsc and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming synthesized audio; voc_pad can be reduced to a minimum of 7, at which the streaming synthesized audio still sounds normal
    # when the voc model is hifigan_csmsc and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming synthesized audio; with voc_pad set to 14 the streaming synthesized audio still sounds normal
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
@ -91,12 +91,12 @@ tts_online-onnx:
    lang: 'zh'
    # am_block and am_pad are only used by the fastspeech2_cnndecoder_onnx model for streaming am inference;
    # when am_pad is set to 12, the streaming synthesized audio is the same as the non-streaming synthesized audio
-    am_block: 42
+    am_block: 72
    am_pad: 12
    # voc_pad and voc_block are used by the voc model for streaming voc inference.
    # when the voc model is mb_melgan_csmsc_onnx and voc_pad is set to 14, the streaming synthesized audio is the same as the non-streaming synthesized audio; voc_pad can be reduced to a minimum of 7, at which the streaming synthesized audio still sounds normal
    # when the voc model is hifigan_csmsc_onnx and voc_pad is set to 20, the streaming synthesized audio is the same as the non-streaming synthesized audio; with voc_pad set to 14 the streaming synthesized audio still sounds normal
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
    # voc_upsample should be the same as n_shift in the voc config.
    voc_upsample: 300
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@ -20,10 +20,9 @@ import paddle
 from numpy import float32
 from yacs.config import CfgNode
 from .pretrained_models import pretrained_models
 from paddlespeech.cli.asr.infer import ASRExecutor
 from paddlespeech.cli.asr.infer import model_alias
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import download_and_decompress
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.speech import SpeechSegment
@ -40,45 +39,6 @@ from paddlespeech.server.utils.paddle_predictor import init_predictor
 __all__ = ['ASREngine']
 # Pretrained ASR model resources for the online engine, keyed by a tag of
 # the form '<model_type>-<lang>-<sample_rate>' (e.g. '...-zh-16k').
 # Each entry gives the archive 'url' and 'md5', paths that are joined onto
 # the decompressed resource directory ('cfg_path', 'model', 'params'; also
 # 'ckpt_path'), and the language model 'lm_url' / 'lm_md5'.
 pretrained_models = {
    "deepspeech2online_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
        'md5':
        '98b87b171b7240b7cae6e07d8d0bc9be',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2_online/checkpoints/avg_1',
        'model':
        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
        'params':
        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
    "conformer_online_multicn-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
        'md5':
        '0ac93d390552336f2a906aec9e33c5fa',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/chunk_conformer/checkpoints/multi_cn',
        # NOTE(review): 'model' and 'params' both point to the same
        # .pdparams file for this entry - confirm this is intended.
        'model':
        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
        'params':
        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
 }
 # ASR server connection process class
 class PaddleASRConnectionHanddler:
@ -626,24 +586,7 @@ class PaddleASRConnectionHanddler:
 class ASRServerExecutor(ASRExecutor):
    def __init__(self):
        super().__init__()
-        pass
+        self.pretrained_models = pretrained_models
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """
        Resolve the local path of the pretrained resources for ``tag``.

        Args:
            tag: Key into the module-level ``pretrained_models`` dict.

        Returns:
            The path returned by ``download_and_decompress``, made absolute
            with ``os.path.abspath``.

        Raises:
            AssertionError: If ``tag`` is not a key of ``pretrained_models``.
        """
        # NOTE(review): this reads the module-level ``pretrained_models``,
        # not ``self.pretrained_models`` - confirm which one is intended.
        support_models = list(pretrained_models.keys())
        # Reject unknown tags; the message lists every supported tag.
        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
            tag, '\n\t\t'.join(support_models))
        # Directory handed to download_and_decompress: MODEL_HOME/<tag>.
        res_path = os.path.join(MODEL_HOME, tag)
        decompressed_path = download_and_decompress(pretrained_models[tag],
                                                    res_path)
        decompressed_path = os.path.abspath(decompressed_path)
        logger.info(
            'Use pretrained model stored in: {}'.format(decompressed_path))
        return decompressed_path
    def _init_from_path(self,
                        model_type: str='deepspeech2online_aishell',
@ -659,20 +602,20 @@ class ASRServerExecutor(ASRExecutor):
        """
        self.model_type = model_type
        self.sample_rate = sample_rate
        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
        tag = model_type + '-' + lang + '-' + sample_rate_str
        if cfg_path is None or am_model is None or am_params is None:
            sample_rate_str = '16k' if sample_rate == 16000 else '8k'
            tag = model_type + '-' + lang + '-' + sample_rate_str
            logger.info(f"Load the pretrained model, tag = {tag}")
            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                res_path, self.pretrained_models[tag]['cfg_path'])
            self.am_model = os.path.join(res_path,
-                                         pretrained_models[tag]['model'])
+                                         self.pretrained_models[tag]['model'])
            self.am_params = os.path.join(res_path,
-                                          pretrained_models[tag]['params'])
+                                          self.pretrained_models[tag]['params'])
            logger.info(res_path)
        else:
            self.cfg_path = os.path.abspath(cfg_path)
@ -700,8 +643,8 @@ class ASRServerExecutor(ASRExecutor):
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type, vocab=self.vocab)
-                lm_url = pretrained_models[tag]['lm_url']
+                lm_url = self.pretrained_models[tag]['lm_url']
-                lm_md5 = pretrained_models[tag]['lm_md5']
+                lm_md5 = self.pretrained_models[tag]['lm_md5']
                logger.info(f"Start to load language model {lm_url}")
                self.download_lm(
                    lm_url,
@ -774,7 +717,7 @@ class ASRServerExecutor(ASRExecutor):
            model_name = model_type[:model_type.rindex(
                '_')]  # model_type: {model_name}_{dataset}
            logger.info(f"model name: {model_name}")
-            model_class = dynamic_import(model_name, model_alias)
+            model_class = dynamic_import(model_name, self.model_alias)
            model_conf = self.config
            model = model_class.from_config(model_conf)
            self.model = model
--- a/paddlespeech/server/engine/asr/online/pretrained_models.py
+++ b/paddlespeech/server/engine/asr/online/pretrained_models.py
@ -0,0 +1,52 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Pretrained ASR model resources for the online engine, keyed by a tag of
 # the form '<model_type>-<lang>-<sample_rate>' (e.g. '...-zh-16k').
 # Each entry gives the archive 'url' and 'md5', paths that are joined onto
 # the decompressed resource directory ('cfg_path', 'model', 'params'; also
 # 'ckpt_path'), and the language model 'lm_url' / 'lm_md5'.
 pretrained_models = {
    "deepspeech2online_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
        'md5':
        '98b87b171b7240b7cae6e07d8d0bc9be',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2_online/checkpoints/avg_1',
        'model':
        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
        'params':
        'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
    "conformer_online_multicn-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
        'md5':
        '0ac93d390552336f2a906aec9e33c5fa',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/chunk_conformer/checkpoints/multi_cn',
        # NOTE(review): 'model' and 'params' both point to the same
        # .pdparams file for this entry - confirm this is intended.
        'model':
        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
        'params':
        'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
 }
--- a/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/paddleinference/asr_engine.py
@ -19,6 +19,7 @@ from typing import Optional
 import paddle
 from yacs.config import CfgNode
 from .pretrained_models import pretrained_models
 from paddlespeech.cli.asr.infer import ASRExecutor
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import MODEL_HOME
@ -31,32 +32,11 @@ from paddlespeech.server.utils.paddle_predictor import run_model
 __all__ = ['ASREngine']
 # Pretrained ASR model resources for the paddleinference engine, keyed by a
 # tag of the form '<model_type>-<lang>-<sample_rate>'.
 # Each entry gives the archive 'url' and 'md5', paths that are joined onto
 # the decompressed resource directory ('cfg_path', 'model', 'params'; also
 # 'ckpt_path'), and the language model 'lm_url' / 'lm_md5'.
 pretrained_models = {
    "deepspeech2offline_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
        'md5':
        '932c3593d62fe5c741b59b31318aa314',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2/checkpoints/avg_1',
        'model':
        'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
        'params':
        'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
 }
 class ASRServerExecutor(ASRExecutor):
    def __init__(self):
        super().__init__()
-        pass
+        self.pretrained_models = pretrained_models
    def _init_from_path(self,
                        model_type: str='wenetspeech',
@ -71,18 +51,18 @@ class ASRServerExecutor(ASRExecutor):
        Init model and other resources from a specific path.
        """
        sample_rate_str = '16k' if sample_rate == 16000 else '8k'
        tag = model_type + '-' + lang + '-' + sample_rate_str
        if cfg_path is None or am_model is None or am_params is None:
            sample_rate_str = '16k' if sample_rate == 16000 else '8k'
            tag = model_type + '-' + lang + '-' + sample_rate_str
            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                res_path, self.pretrained_models[tag]['cfg_path'])
            self.am_model = os.path.join(res_path,
-                                         pretrained_models[tag]['model'])
+                                         self.pretrained_models[tag]['model'])
            self.am_params = os.path.join(res_path,
-                                          pretrained_models[tag]['params'])
+                                          self.pretrained_models[tag]['params'])
            logger.info(res_path)
            logger.info(self.cfg_path)
            logger.info(self.am_model)
@ -109,8 +89,8 @@ class ASRServerExecutor(ASRExecutor):
                self.text_feature = TextFeaturizer(
                    unit_type=self.config.unit_type, vocab=self.vocab)
-                lm_url = pretrained_models[tag]['lm_url']
+                lm_url = self.pretrained_models[tag]['lm_url']
-                lm_md5 = pretrained_models[tag]['lm_md5']
+                lm_md5 = self.pretrained_models[tag]['lm_md5']
                self.download_lm(
                    lm_url,
                    os.path.dirname(self.config.decode.lang_model_path), lm_md5)
--- a/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
+++ b/paddlespeech/server/engine/asr/paddleinference/pretrained_models.py
@ -0,0 +1,34 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Pretrained ASR model resources for the paddleinference engine, keyed by a
 # tag of the form '<model_type>-<lang>-<sample_rate>'.
 # Each entry gives the archive 'url' and 'md5', paths that are joined onto
 # the decompressed resource directory ('cfg_path', 'model', 'params'; also
 # 'ckpt_path'), and the language model 'lm_url' / 'lm_md5'.
 pretrained_models = {
    "deepspeech2offline_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
        'md5':
        '932c3593d62fe5c741b59b31318aa314',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2/checkpoints/avg_1',
        'model':
        'exp/deepspeech2/checkpoints/avg_1.jit.pdmodel',
        'params':
        'exp/deepspeech2/checkpoints/avg_1.jit.pdiparams',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
 }
--- a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
+++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py
@ -20,83 +20,20 @@ import numpy as np
 import paddle
 import yaml
 from .pretrained_models import pretrained_models
 from paddlespeech.cli.cls.infer import CLSExecutor
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.utils import download_and_decompress
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.paddle_predictor import init_predictor
 from paddlespeech.server.utils.paddle_predictor import run_model
 __all__ = ['CLSEngine']
 # Pretrained audio-classification (cls) model resources, keyed by a tag of
 # the form '<model_type>-32k'.
 # Each entry gives the archive 'url' and 'md5' plus file names that are
 # joined onto the decompressed resource directory: 'cfg_path',
 # 'model_path', 'params_path' and 'label_file'.
 pretrained_models = {
    "panns_cnn6-32k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
        'md5':
        'da087c31046d23281d8ec5188c1967da',
        'cfg_path':
        'panns.yaml',
        'model_path':
        'inference.pdmodel',
        'params_path':
        'inference.pdiparams',
        'label_file':
        'audioset_labels.txt',
    },
    "panns_cnn10-32k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
        'md5':
        '5460cc6eafbfaf0f261cc75b90284ae1',
        'cfg_path':
        'panns.yaml',
        'model_path':
        'inference.pdmodel',
        'params_path':
        'inference.pdiparams',
        'label_file':
        'audioset_labels.txt',
    },
    "panns_cnn14-32k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
        'md5':
        'ccc80b194821274da79466862b2ab00f',
        'cfg_path':
        'panns.yaml',
        'model_path':
        'inference.pdmodel',
        'params_path':
        'inference.pdiparams',
        'label_file':
        'audioset_labels.txt',
    },
 }
 class CLSServerExecutor(CLSExecutor):
    def __init__(self):
        super().__init__()
-        pass
+        self.pretrained_models = pretrained_models
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """
        Resolve the local path of the pretrained resources for ``tag``.

        Args:
            tag: Key into the module-level ``pretrained_models`` dict.

        Returns:
            The path returned by ``download_and_decompress``, made absolute
            with ``os.path.abspath``.

        Raises:
            AssertionError: If ``tag`` is not a key of ``pretrained_models``.
        """
        # NOTE(review): this reads the module-level ``pretrained_models``,
        # not ``self.pretrained_models`` - confirm which one is intended.
        support_models = list(pretrained_models.keys())
        # Reject unknown tags; the message lists every supported tag.
        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
            tag, '\n\t\t'.join(support_models))
        # Directory handed to download_and_decompress: MODEL_HOME/<tag>.
        res_path = os.path.join(MODEL_HOME, tag)
        decompressed_path = download_and_decompress(pretrained_models[tag],
                                                    res_path)
        decompressed_path = os.path.abspath(decompressed_path)
        logger.info(
            'Use pretrained model stored in: {}'.format(decompressed_path))
        return decompressed_path
    def _init_from_path(
            self,
@ -113,14 +50,14 @@ class CLSServerExecutor(CLSExecutor):
        if cfg_path is None or model_path is None or params_path is None or label_file is None:
            tag = model_type + '-' + '32k'
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
+            self.cfg_path = os.path.join(
-                                         pretrained_models[tag]['cfg_path'])
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
-            self.model_path = os.path.join(self.res_path,
+            self.model_path = os.path.join(
-                                           pretrained_models[tag]['model_path'])
+                self.res_path, self.pretrained_models[tag]['model_path'])
            self.params_path = os.path.join(
-                self.res_path, pretrained_models[tag]['params_path'])
+                self.res_path, self.pretrained_models[tag]['params_path'])
-            self.label_file = os.path.join(self.res_path,
+            self.label_file = os.path.join(
-                                           pretrained_models[tag]['label_file'])
+                self.res_path, self.pretrained_models[tag]['label_file'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.model_path = os.path.abspath(model_path)
--- a/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
+++ b/paddlespeech/server/engine/cls/paddleinference/pretrained_models.py
@ -0,0 +1,58 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Pretrained audio-classification (cls) model resources, keyed by a tag of
 # the form '<model_type>-32k'.
 # Each entry gives the archive 'url' and 'md5' plus file names that are
 # joined onto the decompressed resource directory: 'cfg_path',
 # 'model_path', 'params_path' and 'label_file'.
 pretrained_models = {
    "panns_cnn6-32k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz',
        'md5':
        'da087c31046d23281d8ec5188c1967da',
        'cfg_path':
        'panns.yaml',
        'model_path':
        'inference.pdmodel',
        'params_path':
        'inference.pdiparams',
        'label_file':
        'audioset_labels.txt',
    },
    "panns_cnn10-32k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz',
        'md5':
        '5460cc6eafbfaf0f261cc75b90284ae1',
        'cfg_path':
        'panns.yaml',
        'model_path':
        'inference.pdmodel',
        'params_path':
        'inference.pdiparams',
        'label_file':
        'audioset_labels.txt',
    },
    "panns_cnn14-32k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz',
        'md5':
        'ccc80b194821274da79466862b2ab00f',
        'cfg_path':
        'panns.yaml',
        'model_path':
        'inference.pdmodel',
        'params_path':
        'inference.pdiparams',
        'label_file':
        'audioset_labels.txt',
    },
 }
--- a/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
+++ b/paddlespeech/server/engine/tts/online/onnx/pretrained_models.py
@ -0,0 +1,69 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Supported pretrained ONNX models for the online TTS engine.
 # Each entry gives the archive 'url' and 'md5', the ONNX file name(s) in
 # 'ckpt' and the 'sample_rate'; the fastspeech2 entries also list
 # 'phones_dict' (and 'speech_stats' for the cnndecoder variant).
 # NOTE(review): 'ckpt' is a list for the fastspeech2 entries but a plain
 # string for the mb_melgan / hifigan entries - readers must handle both.
 # NOTE(review): keys look like '<model>_onnx-<lang>' - confirm against the
 # code that builds the tag.
 pretrained_models = {
    # fastspeech2
    "fastspeech2_csmsc_onnx-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip',
        'md5':
        'fd3ad38d83273ad51f0ea4f4abf3ab4e',
        'ckpt': ['fastspeech2_csmsc.onnx'],
        'phones_dict':
        'phone_id_map.txt',
        'sample_rate':
        24000,
    },
    "fastspeech2_cnndecoder_csmsc_onnx-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip',
        'md5':
        '5f70e1a6bcd29d72d54e7931aa86f266',
        'ckpt': [
            'fastspeech2_csmsc_am_encoder_infer.onnx',
            'fastspeech2_csmsc_am_decoder.onnx',
            'fastspeech2_csmsc_am_postnet.onnx',
        ],
        'speech_stats':
        'speech_stats.npy',
        'phones_dict':
        'phone_id_map.txt',
        'sample_rate':
        24000,
    },
    # mb_melgan
    "mb_melgan_csmsc_onnx-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip',
        'md5':
        '5b83ec746e8414bc29032d954ffd07ec',
        'ckpt':
        'mb_melgan_csmsc.onnx',
        'sample_rate':
        24000,
    },
    # hifigan
    "hifigan_csmsc_onnx-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip',
        'md5':
        '1a7dc0385875889e46952e50c0994a6b',
        'ckpt':
        'hifigan_csmsc.onnx',
        'sample_rate':
        24000,
    },
 }
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@ -20,10 +20,9 @@ from typing import Optional
 import numpy as np
 import paddle
 from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
 from paddlespeech.cli.utils import download_and_decompress
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
 from paddlespeech.server.utils.onnx_infer import get_sess
@ -34,83 +33,6 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend
 __all__ = ['TTSEngine']
 # Pretrained ONNX models known to the streaming TTS engine.
 # Each key is a model tag; each value records the archive URL, its md5 and
 # the resource file names that the executor joins onto the download path.
 pretrained_models = {
    # acoustic model: fastspeech2
    "fastspeech2_csmsc_onnx-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip",
        "md5": "fd3ad38d83273ad51f0ea4f4abf3ab4e",
        "ckpt": ["fastspeech2_csmsc.onnx"],
        "phones_dict": "phone_id_map.txt",
        "sample_rate": 24000,
    },
    # acoustic model: fastspeech2 with cnn decoder (encoder/decoder/postnet)
    "fastspeech2_cnndecoder_csmsc_onnx-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip",
        "md5": "5f70e1a6bcd29d72d54e7931aa86f266",
        "ckpt": [
            "fastspeech2_csmsc_am_encoder_infer.onnx",
            "fastspeech2_csmsc_am_decoder.onnx",
            "fastspeech2_csmsc_am_postnet.onnx",
        ],
        "speech_stats": "speech_stats.npy",
        "phones_dict": "phone_id_map.txt",
        "sample_rate": 24000,
    },
    # vocoder: mb_melgan
    "mb_melgan_csmsc_onnx-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip",
        "md5": "5b83ec746e8414bc29032d954ffd07ec",
        "ckpt": "mb_melgan_csmsc.onnx",
        "sample_rate": 24000,
    },
    # vocoder: hifigan
    "hifigan_csmsc_onnx-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_onnx_0.2.0.zip",
        "md5": "1a7dc0385875889e46952e50c0994a6b",
        "ckpt": "hifigan_csmsc.onnx",
        "sample_rate": 24000,
    },
 }
 # Short model name -> "module.path:ClassName" string.
 model_alias = {
    # acoustic models
    "fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
    "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
    # vocoders
    "mb_melgan": "paddlespeech.t2s.models.melgan:MelGANGenerator",
    "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference",
    "hifigan": "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
    "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference",
 }
 __all__ = ['TTSEngine']
 class TTSServerExecutor(TTSExecutor):
    def __init__(self, am_block, am_pad, voc_block, voc_pad, voc_upsample):
@ -122,23 +44,6 @@ class TTSServerExecutor(TTSExecutor):
        self.voc_upsample = voc_upsample
        self.pretrained_models = pretrained_models
        self.model_alias = model_alias
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """Resolve a model tag to the absolute path of its local resources.

        Args:
            tag: Key of the module-level ``pretrained_models`` table.

        Returns:
            Absolute form of the path returned by ``download_and_decompress``.

        Raises:
            AssertionError: If ``tag`` is not a key of ``pretrained_models``.
        """
        known_tags = list(pretrained_models.keys())
        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
            tag, '\n\t\t'.join(known_tags))

        # Resources for a tag are placed under MODEL_HOME/<tag>.
        target_dir = os.path.join(MODEL_HOME, tag)
        model_dir = os.path.abspath(
            download_and_decompress(pretrained_models[tag], target_dir))
        logger.info('Use pretrained model stored in: {}'.format(model_dir))
        return model_dir
    def _init_from_path(
            self,
@ -173,10 +78,10 @@ class TTSServerExecutor(TTSExecutor):
                am_res_path = self._get_pretrained_path(am_tag)
                self.am_res_path = am_res_path
                self.am_ckpt = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['ckpt'][0])
+                    am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
                # must have phones_dict in acoustic
                self.phones_dict = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['phones_dict'])
+                    am_res_path, self.pretrained_models[am_tag]['phones_dict'])
            else:
                self.am_ckpt = os.path.abspath(am_ckpt[0])
@ -192,16 +97,16 @@ class TTSServerExecutor(TTSExecutor):
                am_res_path = self._get_pretrained_path(am_tag)
                self.am_res_path = am_res_path
                self.am_encoder_infer = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['ckpt'][0])
+                    am_res_path, self.pretrained_models[am_tag]['ckpt'][0])
                self.am_decoder = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['ckpt'][1])
+                    am_res_path, self.pretrained_models[am_tag]['ckpt'][1])
                self.am_postnet = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['ckpt'][2])
+                    am_res_path, self.pretrained_models[am_tag]['ckpt'][2])
                # must have phones_dict in acoustic
                self.phones_dict = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['phones_dict'])
+                    am_res_path, self.pretrained_models[am_tag]['phones_dict'])
                self.am_stat = os.path.join(
-                    am_res_path, pretrained_models[am_tag]['speech_stats'])
+                    am_res_path, self.pretrained_models[am_tag]['speech_stats'])
            else:
                self.am_encoder_infer = os.path.abspath(am_ckpt[0])
@ -229,8 +134,8 @@ class TTSServerExecutor(TTSExecutor):
        if voc_ckpt is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
-            self.voc_ckpt = os.path.join(voc_res_path,
+            self.voc_ckpt = os.path.join(
-                                         pretrained_models[voc_tag]['ckpt'])
+                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
        else:
            self.voc_ckpt = os.path.abspath(voc_ckpt)
            self.voc_res_path = os.path.dirname(os.path.abspath(self.voc_ckpt))
@ -283,7 +188,6 @@ class TTSServerExecutor(TTSExecutor):
        """
        Model inference and result stored in self.output.
        """
        #import pdb;pdb.set_trace()
        am_block = self.am_block
        am_pad = self.am_pad
@ -453,10 +357,21 @@ class TTSEngine(BaseEngine):
            self.config.am_block, self.config.am_pad, self.config.voc_block,
            self.config.voc_pad, self.config.voc_upsample)
-        if "cpu" in self.config.am_sess_conf.device or "cpu" in self.config.voc_sess_conf.device:
+        try:
-            paddle.set_device("cpu")
+            if self.config.am_sess_conf.device is not None:
-        else:
+                self.device = self.config.am_sess_conf.device
-            paddle.set_device(self.config.am_sess_conf.device)
+            elif self.config.voc_sess_conf.device is not None:
                self.device = self.config.voc_sess_conf.device
            else:
                self.device = paddle.get_device()
            paddle.set_device(self.device)
        except BaseException as e:
            logger.error(
                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
            )
            logger.error("Initialize TTS server engine Failed on device: %s." %
                         (self.device))
            return False
        try:
            self.executor._init_from_path(
@ -480,16 +395,17 @@ class TTSEngine(BaseEngine):
                         (self.config.voc_sess_conf.device))
            return False
        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.config.voc_sess_conf.device))
        # warm up
        try:
            self.warm_up()
            logger.info("Warm up successfully.")
        except Exception as e:
            logger.error("Failed to warm up on tts engine.")
            return False
        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.config.voc_sess_conf.device))
        return True
    def warm_up(self):
@ -499,9 +415,7 @@ class TTSEngine(BaseEngine):
            sentence = "您好，欢迎使用语音合成服务。"
        if self.config.lang == 'en':
            sentence = "Hello and welcome to the speech synthesis service."
-        logger.info(
+        logger.info("Start to warm up.")
            "*******************************warm up ********************************"
        )
        for i in range(3):
            for wav in self.executor.infer(
                    text=sentence,
@ -512,9 +426,6 @@ class TTSEngine(BaseEngine):
                    f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
                )
                break
        logger.info(
            "**********************************************************************"
        )
    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
        # Convert byte to text
--- a/paddlespeech/server/engine/tts/online/python/pretrained_models.py
+++ b/paddlespeech/server/engine/tts/online/python/pretrained_models.py
@ -0,0 +1,73 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Pretrained dygraph checkpoints known to the streaming (online) TTS engine.
 # Each key is a model tag; each value records the archive URL, its md5 and
 # the names of the config / checkpoint / statistics files it refers to.
 pretrained_models = {
    # acoustic model: fastspeech2
    "fastspeech2_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip",
        "md5": "637d28a5e53aa60275612ba4393d5f22",
        "config": "default.yaml",
        "ckpt": "snapshot_iter_76000.pdz",
        "speech_stats": "speech_stats.npy",
        "phones_dict": "phone_id_map.txt",
    },
    # acoustic model: fastspeech2 with cnn decoder
    "fastspeech2_cnndecoder_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip",
        "md5": "6eb28e22ace73e0ebe7845f86478f89f",
        "config": "cnndecoder.yaml",
        "ckpt": "snapshot_iter_153000.pdz",
        "speech_stats": "speech_stats.npy",
        "phones_dict": "phone_id_map.txt",
    },
    # vocoder: mb_melgan
    "mb_melgan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip",
        "md5": "ee5f0604e20091f0d495b6ec4618b90d",
        "config": "default.yaml",
        "ckpt": "snapshot_iter_1000000.pdz",
        "speech_stats": "feats_stats.npy",
    },
    # vocoder: hifigan
    "hifigan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip",
        "md5": "dd40a3d88dfcf64513fba2f0f961ada6",
        "config": "default.yaml",
        "ckpt": "snapshot_iter_2500000.pdz",
        "speech_stats": "feats_stats.npy",
    },
 }
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@ -22,10 +22,9 @@ import paddle
 import yaml
 from yacs.config import CfgNode
 from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
 from paddlespeech.cli.utils import download_and_decompress
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import float2pcm
@ -37,87 +36,6 @@ from paddlespeech.t2s.modules.normalizer import ZScore
 __all__ = ['TTSEngine']
 # Pretrained dygraph checkpoints known to the streaming (online) TTS engine.
 # Each key is a model tag; each value records the archive URL, its md5 and
 # the names of the config / checkpoint / statistics files it refers to.
 pretrained_models = {
    # acoustic model: fastspeech2
    "fastspeech2_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip",
        "md5": "637d28a5e53aa60275612ba4393d5f22",
        "config": "default.yaml",
        "ckpt": "snapshot_iter_76000.pdz",
        "speech_stats": "speech_stats.npy",
        "phones_dict": "phone_id_map.txt",
    },
    # acoustic model: fastspeech2 with cnn decoder
    "fastspeech2_cnndecoder_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip",
        "md5": "6eb28e22ace73e0ebe7845f86478f89f",
        "config": "cnndecoder.yaml",
        "ckpt": "snapshot_iter_153000.pdz",
        "speech_stats": "speech_stats.npy",
        "phones_dict": "phone_id_map.txt",
    },
    # vocoder: mb_melgan
    "mb_melgan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip",
        "md5": "ee5f0604e20091f0d495b6ec4618b90d",
        "config": "default.yaml",
        "ckpt": "snapshot_iter_1000000.pdz",
        "speech_stats": "feats_stats.npy",
    },
    # vocoder: hifigan
    "hifigan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip",
        "md5": "dd40a3d88dfcf64513fba2f0f961ada6",
        "config": "default.yaml",
        "ckpt": "snapshot_iter_2500000.pdz",
        "speech_stats": "feats_stats.npy",
    },
 }
 # Short model name -> "module.path:ClassName" string.
 model_alias = {
    # acoustic models
    "fastspeech2": "paddlespeech.t2s.models.fastspeech2:FastSpeech2",
    "fastspeech2_inference": "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
    # vocoders
    "mb_melgan": "paddlespeech.t2s.models.melgan:MelGANGenerator",
    "mb_melgan_inference": "paddlespeech.t2s.models.melgan:MelGANInference",
    "hifigan": "paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
    "hifigan_inference": "paddlespeech.t2s.models.hifigan:HiFiGANInference",
 }
 __all__ = ['TTSEngine']
 class TTSServerExecutor(TTSExecutor):
    def __init__(self, am_block, am_pad, voc_block, voc_pad):
@ -126,6 +44,7 @@ class TTSServerExecutor(TTSExecutor):
        self.am_pad = am_pad
        self.voc_block = voc_block
        self.voc_pad = voc_pad
        self.pretrained_models = pretrained_models
    def get_model_info(self,
                       field: str,
@ -146,7 +65,7 @@ class TTSServerExecutor(TTSExecutor):
            [Tensor]: standard deviation
        """
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        if field == "am":
            odim = self.am_config.n_mels
@ -169,22 +88,6 @@ class TTSServerExecutor(TTSExecutor):
        return model, model_mu, model_std
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """Resolve a model tag to the absolute path of its local resources.

        Args:
            tag: Key of the module-level ``pretrained_models`` table.

        Returns:
            Absolute form of the path returned by ``download_and_decompress``.

        Raises:
            AssertionError: If ``tag`` is not a key of ``pretrained_models``.
        """
        known_tags = list(pretrained_models.keys())
        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
            tag, '\n\t\t'.join(known_tags))

        # Resources for a tag are placed under MODEL_HOME/<tag>.
        target_dir = os.path.join(MODEL_HOME, tag)
        model_dir = os.path.abspath(
            download_and_decompress(pretrained_models[tag], target_dir))
        logger.info('Use pretrained model stored in: {}'.format(model_dir))
        return model_dir
    def _init_from_path(
            self,
            am: str='fastspeech2_csmsc',
@ -210,15 +113,15 @@ class TTSServerExecutor(TTSExecutor):
        if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
            am_res_path = self._get_pretrained_path(am_tag)
            self.am_res_path = am_res_path
-            self.am_config = os.path.join(am_res_path,
+            self.am_config = os.path.join(
-                                          pretrained_models[am_tag]['config'])
+                am_res_path, self.pretrained_models[am_tag]['config'])
            self.am_ckpt = os.path.join(am_res_path,
-                                        pretrained_models[am_tag]['ckpt'])
+                                        self.pretrained_models[am_tag]['ckpt'])
            self.am_stat = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speech_stats'])
+                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
            print("self.phones_dict:", self.phones_dict)
            logger.info(am_res_path)
            logger.info(self.am_config)
@ -239,12 +142,12 @@ class TTSServerExecutor(TTSExecutor):
        if voc_ckpt is None or voc_config is None or voc_stat is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
-            self.voc_config = os.path.join(voc_res_path,
+            self.voc_config = os.path.join(
-                                           pretrained_models[voc_tag]['config'])
+                voc_res_path, self.pretrained_models[voc_tag]['config'])
-            self.voc_ckpt = os.path.join(voc_res_path,
+            self.voc_ckpt = os.path.join(
-                                         pretrained_models[voc_tag]['ckpt'])
+                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
            self.voc_stat = os.path.join(
-                voc_res_path, pretrained_models[voc_tag]['speech_stats'])
+                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
            logger.info(voc_res_path)
            logger.info(self.voc_config)
            logger.info(self.voc_ckpt)
@ -286,7 +189,7 @@ class TTSServerExecutor(TTSExecutor):
                                                    self.am_ckpt, self.am_stat)
            am_normalizer = ZScore(am_mu, am_std)
            am_inference_class = dynamic_import(self.am_name + '_inference',
-                                                model_alias)
+                                                self.model_alias)
            self.am_inference = am_inference_class(am_normalizer, am)
            self.am_inference.eval()
        print("acoustic model done!")
@ -297,7 +200,7 @@ class TTSServerExecutor(TTSExecutor):
                                                   self.voc_ckpt, self.voc_stat)
        voc_normalizer = ZScore(voc_mu, voc_std)
        voc_inference_class = dynamic_import(self.voc_name + '_inference',
-                                             model_alias)
+                                             self.model_alias)
        self.voc_inference = voc_inference_class(voc_normalizer, voc)
        self.voc_inference.eval()
        print("voc done!")
@ -477,7 +380,7 @@ class TTSEngine(BaseEngine):
        ), "Please set correct voc_block and voc_pad, they should be more than 0."
        try:
-            if self.config.device:
+            if self.config.device is not None:
                self.device = self.config.device
            else:
                self.device = paddle.get_device()
@ -513,16 +416,16 @@ class TTSEngine(BaseEngine):
                         (self.device))
            return False
        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.device))
        # warm up
        try:
            self.warm_up()
            logger.info("Warm up successfully.")
        except Exception as e:
            logger.error("Failed to warm up on tts engine.")
            return False
        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.device))
        return True
    def warm_up(self):
@ -532,9 +435,7 @@ class TTSEngine(BaseEngine):
            sentence = "您好，欢迎使用语音合成服务。"
        if self.config.lang == 'en':
            sentence = "Hello and welcome to the speech synthesis service."
-        logger.info(
+        logger.info("Start to warm up.")
            "*******************************warm up ********************************"
        )
        for i in range(3):
            for wav in self.executor.infer(
                    text=sentence,
@ -545,9 +446,6 @@ class TTSEngine(BaseEngine):
                    f"The first response time of the {i} warm up: {self.executor.first_response_time} s"
                )
                break
        logger.info(
            "**********************************************************************"
        )
    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
        # Convert byte to text
--- a/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py
+++ b/paddlespeech/server/engine/tts/paddleinference/pretrained_models.py
@ -0,0 +1,87 @@
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Static (exported) models used by the paddle-inference TTS engine.
 # Each key is a model tag; each value records the archive URL, its md5 and
 # the names of the model / params / dictionary files it refers to.
 pretrained_models = {
    # acoustic model: speedyspeech (also needs a tone dictionary)
    "speedyspeech_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip",
        "md5": "f10cbdedf47dc7a9668d2264494e1823",
        "model": "speedyspeech_csmsc.pdmodel",
        "params": "speedyspeech_csmsc.pdiparams",
        "phones_dict": "phone_id_map.txt",
        "tones_dict": "tone_id_map.txt",
        "sample_rate": 24000,
    },
    # acoustic model: fastspeech2
    "fastspeech2_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip",
        "md5": "9788cd9745e14c7a5d12d32670b2a5a7",
        "model": "fastspeech2_csmsc.pdmodel",
        "params": "fastspeech2_csmsc.pdiparams",
        "phones_dict": "phone_id_map.txt",
        "sample_rate": 24000,
    },
    # vocoder: pwgan
    "pwgan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip",
        "md5": "e3504aed9c5a290be12d1347836d2742",
        "model": "pwgan_csmsc.pdmodel",
        "params": "pwgan_csmsc.pdiparams",
        "sample_rate": 24000,
    },
    # vocoder: mb_melgan
    "mb_melgan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip",
        "md5": "ac6eee94ba483421d750433f4c3b8d36",
        "model": "mb_melgan_csmsc.pdmodel",
        "params": "mb_melgan_csmsc.pdiparams",
        "sample_rate": 24000,
    },
    # vocoder: hifigan
    "hifigan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip",
        "md5": "7edd8c436b3a5546b3a7cb8cff9d5a0c",
        "model": "hifigan_csmsc.pdmodel",
        "params": "hifigan_csmsc.pdiparams",
        "sample_rate": 24000,
    },
 }
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@ -23,10 +23,9 @@ import paddle
 import soundfile as sf
 from scipy.io import wavfile
 from .pretrained_models import pretrained_models
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
 from paddlespeech.cli.utils import download_and_decompress
 from paddlespeech.cli.utils import MODEL_HOME
 from paddlespeech.server.engine.base_engine import BaseEngine
 from paddlespeech.server.utils.audio_process import change_speed
 from paddlespeech.server.utils.errors import ErrorCode
@ -38,101 +37,11 @@ from paddlespeech.t2s.frontend.zh_frontend import Frontend
 __all__ = ['TTSEngine']
 # Static (exported) models used by the paddle-inference TTS engine.
 # Each key is a model tag; each value records the archive URL, its md5 and
 # the names of the model / params / dictionary files it refers to.
 pretrained_models = {
    # acoustic model: speedyspeech (also needs a tone dictionary)
    "speedyspeech_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip",
        "md5": "f10cbdedf47dc7a9668d2264494e1823",
        "model": "speedyspeech_csmsc.pdmodel",
        "params": "speedyspeech_csmsc.pdiparams",
        "phones_dict": "phone_id_map.txt",
        "tones_dict": "tone_id_map.txt",
        "sample_rate": 24000,
    },
    # acoustic model: fastspeech2
    "fastspeech2_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip",
        "md5": "9788cd9745e14c7a5d12d32670b2a5a7",
        "model": "fastspeech2_csmsc.pdmodel",
        "params": "fastspeech2_csmsc.pdiparams",
        "phones_dict": "phone_id_map.txt",
        "sample_rate": 24000,
    },
    # vocoder: pwgan
    "pwgan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip",
        "md5": "e3504aed9c5a290be12d1347836d2742",
        "model": "pwgan_csmsc.pdmodel",
        "params": "pwgan_csmsc.pdiparams",
        "sample_rate": 24000,
    },
    # vocoder: mb_melgan
    "mb_melgan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip",
        "md5": "ac6eee94ba483421d750433f4c3b8d36",
        "model": "mb_melgan_csmsc.pdmodel",
        "params": "mb_melgan_csmsc.pdiparams",
        "sample_rate": 24000,
    },
    # vocoder: hifigan
    "hifigan_csmsc-zh": {
        "url": "https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip",
        "md5": "7edd8c436b3a5546b3a7cb8cff9d5a0c",
        "model": "hifigan_csmsc.pdmodel",
        "params": "hifigan_csmsc.pdiparams",
        "sample_rate": 24000,
    },
 }
 class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
-        pass
+        self.pretrained_models = pretrained_models
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        """Resolve a model tag to the absolute path of its local resources.

        Args:
            tag: Key of the module-level ``pretrained_models`` table.

        Returns:
            Absolute form of the path returned by ``download_and_decompress``.

        Raises:
            AssertionError: If ``tag`` is not a key of ``pretrained_models``.
        """
        assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format(
            tag)

        # Resources for a tag are placed under MODEL_HOME/<tag>.
        target_dir = os.path.join(MODEL_HOME, tag)
        model_dir = os.path.abspath(
            download_and_decompress(pretrained_models[tag], target_dir))
        logger.info('Use pretrained model stored in: {}'.format(model_dir))
        return model_dir
    def _init_from_path(
            self,
@ -161,14 +70,14 @@ class TTSServerExecutor(TTSExecutor):
        if am_model is None or am_params is None or phones_dict is None:
            am_res_path = self._get_pretrained_path(am_tag)
            self.am_res_path = am_res_path
-            self.am_model = os.path.join(am_res_path,
+            self.am_model = os.path.join(
-                                         pretrained_models[am_tag]['model'])
+                am_res_path, self.pretrained_models[am_tag]['model'])
-            self.am_params = os.path.join(am_res_path,
+            self.am_params = os.path.join(
-                                          pretrained_models[am_tag]['params'])
+                am_res_path, self.pretrained_models[am_tag]['params'])
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['phones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
-            self.am_sample_rate = pretrained_models[am_tag]['sample_rate']
+            self.am_sample_rate = self.pretrained_models[am_tag]['sample_rate']
            logger.info(am_res_path)
            logger.info(self.am_model)
@ -183,17 +92,17 @@ class TTSServerExecutor(TTSExecutor):
        # for speedyspeech
        self.tones_dict = None
-        if 'tones_dict' in pretrained_models[am_tag]:
+        if 'tones_dict' in self.pretrained_models[am_tag]:
            self.tones_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['tones_dict'])
+                am_res_path, self.pretrained_models[am_tag]['tones_dict'])
            if tones_dict:
                self.tones_dict = tones_dict
        # for multi speaker fastspeech2
        self.speaker_dict = None
-        if 'speaker_dict' in pretrained_models[am_tag]:
+        if 'speaker_dict' in self.pretrained_models[am_tag]:
            self.speaker_dict = os.path.join(
-                am_res_path, pretrained_models[am_tag]['speaker_dict'])
+                am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
            if speaker_dict:
                self.speaker_dict = speaker_dict
@ -202,11 +111,12 @@ class TTSServerExecutor(TTSExecutor):
        if voc_model is None or voc_params is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
-            self.voc_model = os.path.join(voc_res_path,
+            self.voc_model = os.path.join(
-                                          pretrained_models[voc_tag]['model'])
+                voc_res_path, self.pretrained_models[voc_tag]['model'])
-            self.voc_params = os.path.join(voc_res_path,
+            self.voc_params = os.path.join(
-                                           pretrained_models[voc_tag]['params'])
+                voc_res_path, self.pretrained_models[voc_tag]['params'])
-            self.voc_sample_rate = pretrained_models[voc_tag]['sample_rate']
+            self.voc_sample_rate = self.pretrained_models[voc_tag][
                'sample_rate']
            logger.info(voc_res_path)
            logger.info(self.voc_model)
            logger.info(self.voc_params)
@ -352,8 +262,24 @@ class TTSEngine(BaseEngine):
    def init(self, config: dict) -> bool:
        self.executor = TTSServerExecutor()
        self.config = config
        try:
            if self.config.am_predictor_conf.device is not None:
                self.device = self.config.am_predictor_conf.device
            elif self.config.voc_predictor_conf.device is not None:
                self.device = self.config.voc_predictor_conf.device
            else:
                self.device = paddle.get_device()
            paddle.set_device(self.device)
        except BaseException as e:
            logger.error(
                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
            )
            logger.error("Initialize TTS server engine Failed on device: %s." %
                         (self.device))
            return False
        self.executor._init_from_path(
            am=self.config.am,
            am_model=self.config.am_model,
@ -370,9 +296,35 @@ class TTSEngine(BaseEngine):
            am_predictor_conf=self.config.am_predictor_conf,
            voc_predictor_conf=self.config.voc_predictor_conf, )
        # warm up
        try:
            self.warm_up()
            logger.info("Warm up successfully.")
        except Exception as e:
            logger.error("Failed to warm up on tts engine.")
            return False
        logger.info("Initialize TTS server engine successfully.")
        return True
    def warm_up(self):
        """Warm up the engine by running inference three times.

        A fixed sentence is chosen from ``self.config.lang`` and passed to
        ``self.executor.infer``; the elapsed time of every pass is logged.

        Raises:
            ValueError: if ``self.config.lang`` is neither ``'zh'`` nor ``'en'``.
        """
        warm_up_sentences = {
            'zh': "您好，欢迎使用语音合成服务。",
            'en': "Hello and welcome to the speech synthesis service.",
        }
        lang = self.config.lang
        if lang not in warm_up_sentences:
            # Fail with a clear message instead of an UnboundLocalError on
            # `sentence` further down.
            raise ValueError(
                f"Unsupported lang for TTS warm up: {lang!r}; expected 'zh' or 'en'."
            )
        sentence = warm_up_sentences[lang]

        logger.info("Start to warm up.")
        for i in range(3):
            st = time.time()
            self.executor.infer(
                text=sentence,
                lang=lang,
                am=self.config.am,
                spk_id=0, )
            logger.info(
                f"The response time of the {i} warm up: {time.time() - st} s")
    def postprocess(self,
                    wav,
                    original_fs: int,
--- a/paddlespeech/server/engine/tts/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/python/tts_engine.py
@ -51,15 +51,15 @@ class TTSEngine(BaseEngine):
    def init(self, config: dict) -> bool:
        self.executor = TTSServerExecutor()
        self.config = config
        try:
-            self.config = config
+            if self.config.device is not None:
            if self.config.device:
                self.device = self.config.device
            else:
                self.device = paddle.get_device()
            paddle.set_device(self.device)
-        except BaseException:
+        except BaseException as e:
            logger.error(
                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
            )
@ -87,10 +87,36 @@ class TTSEngine(BaseEngine):
                         (self.device))
            return False
        # warm up
        try:
            self.warm_up()
            logger.info("Warm up successfully.")
        except Exception as e:
            logger.error("Failed to warm up on tts engine.")
            return False
        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.device))
        return True
    def warm_up(self):
        """Warm up the engine by running inference three times.

        A fixed sentence is chosen from ``self.config.lang`` and passed to
        ``self.executor.infer``; the elapsed time of every pass is logged.

        Raises:
            ValueError: if ``self.config.lang`` is neither ``'zh'`` nor ``'en'``.
        """
        warm_up_sentences = {
            'zh': "您好，欢迎使用语音合成服务。",
            'en': "Hello and welcome to the speech synthesis service.",
        }
        lang = self.config.lang
        if lang not in warm_up_sentences:
            # Fail with a clear message instead of an UnboundLocalError on
            # `sentence` further down.
            raise ValueError(
                f"Unsupported lang for TTS warm up: {lang!r}; expected 'zh' or 'en'."
            )
        sentence = warm_up_sentences[lang]

        logger.info("Start to warm up.")
        for i in range(3):
            st = time.time()
            self.executor.infer(
                text=sentence,
                lang=lang,
                am=self.config.am,
                spk_id=0, )
            logger.info(
                f"The response time of the {i} warm up: {time.time() - st} s")
    def postprocess(self,
                    wav,
                    original_fs: int,
--- a/paddlespeech/server/restful/tts_api.py
+++ b/paddlespeech/server/restful/tts_api.py
@ -128,7 +128,7 @@ def tts(request_body: TTSRequest):
    return response
-@router.post("/paddlespeech/streaming/tts")
+@router.post("/paddlespeech/tts/streaming")
 async def stream_tts(request_body: TTSRequest):
    text = request_body.text
--- a/paddlespeech/server/tests/tts/online/http_client.py
+++ b/paddlespeech/server/tests/tts/online/http_client.py
@ -14,6 +14,7 @@
 import argparse
 from paddlespeech.server.utils.audio_handler import TTSHttpHandler
 from paddlespeech.server.utils.util import compute_delay
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
@ -43,5 +44,25 @@ if __name__ == "__main__":
    print("tts http client start")
    handler = TTSHttpHandler(args.server, args.port, args.play)
-    handler.run(args.text, args.spk_id, args.speed, args.volume,
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
-                args.sample_rate, args.output)
+        args.text, args.spk_id, args.speed, args.volume, args.sample_rate,
        args.output)
    delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
    print(f"sentence: {args.text}")
    print(f"duration: {duration} s")
    print(f"first response: {first_response} s")
    print(f"final response: {final_response} s")
    print(f"RTF: {final_response/duration}")
    if args.output is not None:
        if save_audio_success:
            print(f"Audio successfully saved in {args.output}")
        else:
            print("Audio save failed.")
    if delay_time_list != []:
        print(
            f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
        )
    else:
        print("The sentence has no delay in streaming synthesis.")
--- a/paddlespeech/server/tests/tts/online/ws_client.py
+++ b/paddlespeech/server/tests/tts/online/ws_client.py
@ -15,6 +15,7 @@ import argparse
 import asyncio
 from paddlespeech.server.utils.audio_handler import TTSWsHandler
 from paddlespeech.server.utils.util import compute_delay
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
@ -35,4 +36,24 @@ if __name__ == "__main__":
    print("tts websocket client start")
    handler = TTSWsHandler(args.server, args.port, args.play)
    loop = asyncio.get_event_loop()
-    loop.run_until_complete(handler.run(args.text, args.output))
+    first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
        handler.run(args.text, args.output))
    delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
    print(f"sentence: {args.text}")
    print(f"duration: {duration} s")
    print(f"first response: {first_response} s")
    print(f"final response: {final_response} s")
    print(f"RTF: {final_response/duration}")
    if args.output is not None:
        if save_audio_success:
            print(f"Audio successfully saved in {args.output}")
        else:
            print("Audio save failed.")
    if delay_time_list != []:
        print(
            f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
        )
    else:
        print("The sentence has no delay in streaming synthesis.")
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@ -262,7 +262,8 @@ class TTSWsHandler:
        """
        self.server = server
        self.port = port
-        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
+        self.url = "ws://" + self.server + ":" + str(
            self.port) + "/paddlespeech/tts/streaming"
        self.play = play
        if self.play:
            import pyaudio
@ -298,6 +299,8 @@ class TTSWsHandler:
            output (str): save audio path
        """
        all_bytes = b''
        receive_time_list = []
        chunk_duration_list = []
        # 1. Send websocket handshake protocol
        async with websockets.connect(self.url) as ws:
@ -312,14 +315,15 @@ class TTSWsHandler:
            # 3. Process the received response 
            message = await ws.recv()
-            logger.info(f"句子：{text}")
+            first_response = time.time() - st
            logger.info(f"首包响应：{time.time() - st} s")
            message = json.loads(message)
            status = message["status"]
            while (status == 1):
                receive_time_list.append(time.time())
                audio = message["audio"]
                audio = base64.b64decode(audio)  # bytes
                chunk_duration_list.append(len(audio) / 2.0 / 24000)
                all_bytes += audio
                if self.play:
                    self.mutex.acquire()
@ -337,15 +341,11 @@ class TTSWsHandler:
            if status == 2:
                final_response = time.time() - st
                duration = len(all_bytes) / 2.0 / 24000
                logger.info(f"尾包响应：{final_response} s")
                logger.info(f"音频时长：{duration} s")
                logger.info(f"RTF: {final_response / duration}")
                if output is not None:
-                    if save_audio(all_bytes, output):
+                    save_audio_success = save_audio(all_bytes, output)
-                        logger.info(f"音频保存至：{output}")
+                else:
-                    else:
+                    save_audio_success = False
                        logger.error("save audio error")
            else:
                logger.error("infer error")
@ -355,6 +355,8 @@ class TTSWsHandler:
                self.stream.close()
                self.p.terminate()
        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
 class TTSHttpHandler:
    def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
@ -368,7 +370,7 @@ class TTSHttpHandler:
        self.server = server
        self.port = port
        self.url = "http://" + str(self.server) + ":" + str(
-            self.port) + "/paddlespeech/streaming/tts"
+            self.port) + "/paddlespeech/tts/streaming"
        self.play = play
        if self.play:
@ -426,13 +428,16 @@ class TTSHttpHandler:
        all_bytes = b''
        first_flag = 1
        receive_time_list = []
        chunk_duration_list = []
        # 2. Send request
        st = time.time()
        html = requests.post(self.url, json.dumps(params), stream=True)
        # 3. Process the received response 
-        for chunk in html.iter_content(chunk_size=1024):
+        for chunk in html.iter_content(chunk_size=None):
            receive_time_list.append(time.time())
            audio = base64.b64decode(chunk)  # bytes
            if first_flag:
                first_response = time.time() - st
@ -446,21 +451,15 @@ class TTSHttpHandler:
                    self.t.start()
                    self.start_play = False
            all_bytes += audio
            chunk_duration_list.append(len(audio) / 2.0 / 24000)
        final_response = time.time() - st
        duration = len(all_bytes) / 2.0 / 24000
        logger.info(f"句子：{text}")
        logger.info(f"首包响应：{first_response} s")
        logger.info(f"尾包响应：{final_response} s")
        logger.info(f"音频时长：{duration} s")
        logger.info(f"RTF: {final_response / duration}")
        if output is not None:
-            if save_audio(all_bytes, output):
+            save_audio_success = save_audio(all_bytes, output)
-                logger.info(f"音频保存至：{output}")
+        else:
-            else:
+            save_audio_success = False
                logger.error("save audio error")
        if self.play:
            self.t.join()
@ -468,6 +467,8 @@ class TTSHttpHandler:
            self.stream.close()
            self.p.terminate()
        return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
 class VectorHttpHandler:
    def __init__(self, server_ip=None, port=None):
--- a/paddlespeech/server/utils/util.py
+++ b/paddlespeech/server/utils/util.py
@ -75,3 +75,74 @@ def get_chunks(data, block_size, pad_size, step):
        else:
            print("Please set correct type to get chunks, am or voc")
    return chunks
 def compute_delay(receive_time_list, chunk_duration_list):
    """Compute the delay of every packet that was received late.

    A running ``play_time`` starts at the first packet's receive time plus its
    audio duration. Each later packet received after ``play_time`` counts as
    delayed by the difference; ``play_time`` then advances by that delay (if
    any) plus the packet's own duration.

    Args:
        receive_time_list (list): Time at which each packet was received.
        chunk_duration_list (list): Audio duration corresponding to each packet.

    Returns:
        list: Delay times of the delayed packets, in arrival order. Empty when
            no packet was delayed or when no packet was received at all.

    Raises:
        AssertionError: if the two lists differ in length.
    """
    assert (len(receive_time_list) == len(chunk_duration_list))
    delay_time_list = []
    # Nothing received: there is no first packet to start the timeline from.
    if not receive_time_list:
        return delay_time_list

    play_time = receive_time_list[0] + chunk_duration_list[0]
    for receive_time, chunk_duration in zip(receive_time_list[1:],
                                            chunk_duration_list[1:]):
        delay_time = receive_time - play_time
        if delay_time > 0:
            # Delayed: the packet was received after play_time.
            play_time = play_time + delay_time + chunk_duration
            delay_time_list.append(delay_time)
        else:
            # Not delayed: the packet was received at or before play_time.
            play_time = play_time + chunk_duration

    return delay_time_list
 def count_engine(logfile: str="./nohup.out"):
    """Print engine-side response statistics collected from a server log.

    Lines containing "- first response time:", "- final response time:" or
    "- The durations of audio is:" are parsed; the value is taken from the
    second-to-last space-separated token of the line. Averages, extrema and
    the overall RTF (sum of final responses / sum of durations) are printed.

    Args:
        logfile (str, optional): server log. Defaults to "./nohup.out".

    Raises:
        AssertionError: if the three kinds of record do not occur equally often.
    """
    first_response_list = []
    final_response_list = []
    duration_list = []
    # Marker text -> list collecting the value logged on lines containing it.
    # Order matters: only the first matching marker is used for a line.
    collectors = (
        ("- first response time:", first_response_list),
        ("- final response time:", final_response_list),
        ("- The durations of audio is:", duration_list), )

    with open(logfile, "r") as f:
        for line in f:
            for marker, values in collectors:
                if marker in line:
                    values.append(float(line.split(" ")[-2]))
                    break

    assert (len(first_response_list) == len(final_response_list) and
            len(final_response_list) == len(duration_list))

    # No records: avoid dividing by zero / calling min() on empty lists.
    if not duration_list:
        print(f"No engine statistics found in {logfile}")
        return

    avg_first_response = sum(first_response_list) / len(first_response_list)
    avg_final_response = sum(final_response_list) / len(final_response_list)
    avg_duration = sum(duration_list) / len(duration_list)
    RTF = sum(final_response_list) / sum(duration_list)

    print(
        "************************* engine result ***************************************"
    )
    print(
        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
    )
    print(
        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
    )
    print(
        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
    )
    print(
        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
    )
--- a/paddlespeech/server/ws/tts_socket.py
+++ b/paddlespeech/server/ws/tts_socket.py
@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool
 router = APIRouter()
-@router.websocket('/ws/tts')
+@router.websocket('/paddlespeech/tts/streaming')
 async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()
--- a/tests/unit/server/offline/change_yaml.py
+++ b/tests/unit/server/offline/change_yaml.py
@ -19,7 +19,7 @@ def change_device(yamlfile: str, engine: str, device: str):
    if device == 'cpu':
        set_device = 'cpu'
    elif device == 'gpu':
-        set_device = 'gpu:0'
+        set_device = 'gpu:3'
    else:
        print("Please set correct device: cpu or gpu.")
--- a/tests/unit/server/offline/conf/application.yaml
+++ b/tests/unit/server/offline/conf/application.yaml
@ -1,4 +1,4 @@
-# This is the parameter configuration file for PaddleSpeech Serving.
+# This is the parameter configuration file for PaddleSpeech Offline Serving.
 #################################################################################
 #                             SERVER SETTING                                    #
@ -7,8 +7,8 @@ host: 127.0.0.1
 port: 8090
 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
-
+protocol: 'http'
 engine_list: ['asr_python', 'tts_python', 'cls_python']
--- a/tests/unit/server/offline/test_server_client.sh
+++ b/tests/unit/server/offline/test_server_client.sh
@ -21,6 +21,8 @@ StartService(){
 }
 ClientTest(){
    echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $server_ip"
    echo "aaaaaaaaaaaaaaaaaaaaaaaaaa $port"
    # Client test
    # test asr client
    paddlespeech_client asr --server_ip $server_ip --port $port --input ./zh.wav 
@ -39,6 +41,7 @@ ClientTest(){
    ((test_times+=1))
    paddlespeech_client cls --server_ip $server_ip --port $port --input ./zh.wav 
    ((test_times+=1)) 
 }
 GetTestResult() {
@ -58,6 +61,7 @@ rm -rf log/server.log.wf
 rm -rf log/server.log
 rm -rf log/test_result.log
 cp ../../../../demos/speech_server/conf/application.yaml ./conf/
 config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
@ -191,5 +195,4 @@ echo "***************** Here are all the test results ********************"
 cat ./log/test_result.log
 # Restoring conf is the same as demos/speech_server
-rm -rf ./conf
+cp ../../../../demos/speech_server/conf/application.yaml ./conf/
 cp ../../../demos/speech_server/conf/ ./ -rf
--- a/tests/unit/server/online/tts/check_server/conf/application.yaml
+++ b/tests/unit/server/online/tts/check_server/conf/application.yaml
@ -39,9 +39,9 @@ tts_online:
    # others
    lang: 'zh'
    device: 'cpu' # set 'gpu:id' or 'cpu'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
@ -80,9 +80,9 @@ tts_online-onnx:
    # others
    lang: 'zh'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
    voc_upsample: 300
--- a/tests/unit/server/online/tts/check_server/test_all.sh
+++ b/tests/unit/server/online/tts/check_server/test_all.sh
@ -10,7 +10,6 @@ bash test.sh tts_online $log_all_dir/log_tts_online_cpu
 python change_yaml.py --change_type engine_type --target_key engine_list --target_value tts_online-onnx
 bash test.sh tts_online-onnx $log_all_dir/log_tts_online-onnx_cpu
 python change_yaml.py --change_type device --target_key device --target_value gpu:3
 bash test.sh tts_online $log_all_dir/log_tts_online_gpu
--- a/tests/unit/server/online/tts/check_server/tts_online_application.yaml
+++ b/tests/unit/server/online/tts/check_server/tts_online_application.yaml
@ -39,9 +39,9 @@ tts_online:
    # others
    lang: 'zh'
    device: 'cpu' # set 'gpu:id' or 'cpu'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
@ -80,9 +80,9 @@ tts_online-onnx:
    # others
    lang: 'zh'
-    am_block: 42
+    am_block: 72
    am_pad: 12
-    voc_block: 14
+    voc_block: 36
    voc_pad: 14
    voc_upsample: 300
--- a/tests/unit/server/online/tts/test_server/test_http_client.py
+++ b/tests/unit/server/online/tts/test_server/test_http_client.py
@ -12,117 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
-import base64
+import asyncio
 import json
 import os
 import time
-import requests
+from paddlespeech.server.utils.util import compute_delay
 from paddlespeech.server.utils.audio_process import pcm2wav
 from paddlespeech.t2s.exps.syn_utils import get_sentences
 def save_audio(buffer, audio_path) -> bool:
    """Write an audio byte buffer to ``audio_path`` as pcm or wav.

    A path ending in "pcm" receives the buffer unchanged. A path ending in
    "wav" is produced by writing the buffer to "./tmp.pcm" and converting it
    with ``pcm2wav`` (1 channel, 16 bits, 24000 Hz); the temporary file is
    removed afterwards. Any other ending is rejected.

    Args:
        buffer (bytes): audio bytes to save.
        audio_path (str): destination path.

    Returns:
        bool: True if the audio was written, False for an unsupported format.
    """
    if audio_path.endswith("pcm"):
        with open(audio_path, "wb") as f:
            f.write(buffer)
        return True

    if audio_path.endswith("wav"):
        tmp_pcm = "./tmp.pcm"
        with open(tmp_pcm, "wb") as f:
            f.write(buffer)
        try:
            pcm2wav(
                tmp_pcm, audio_path, channels=1, bits=16, sample_rate=24000)
        finally:
            # os.remove is portable (no shell `rm`) and runs even when the
            # conversion fails, so the temporary file is never left behind.
            os.remove(tmp_pcm)
        return True

    print("Only supports saved audio format is pcm or wav")
    return False
 def test(args, text, utt_id):
-    params = {
+    output = str(args.output_dir + "/" + utt_id + ".wav")
-        "text": text,
+    if args.protocol == "http":
-        "spk_id": args.spk_id,
+        print("tts http client start")
-        "speed": args.speed,
+        from paddlespeech.server.utils.audio_handler import TTSHttpHandler
-        "volume": args.volume,
+        handler = TTSHttpHandler(args.server_ip, args.port, args.play)
-        "sample_rate": args.sample_rate,
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
-        "save_path": ''
+            text, args.spk_id, args.speed, args.volume, args.sample_rate,
-    }
+            output)
-
+
-    buffer = b''
+    elif args.protocol == "websocket":
-    flag = 1
+        from paddlespeech.server.utils.audio_handler import TTSWsHandler
-    url = "http://" + str(args.server) + ":" + str(
+        print("tts websocket client start")
-        args.port) + "/paddlespeech/streaming/tts"
+        handler = TTSWsHandler(args.server_ip, args.port, args.play)
-    st = time.time()
+        loop = asyncio.get_event_loop()
-    html = requests.post(url, json.dumps(params), stream=True)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
-    for chunk in html.iter_content(chunk_size=1024):
+            handler.run(text, output))
        chunk = base64.b64decode(chunk)  # bytes
        if flag:
            first_response = time.time() - st
            print(f"首包响应：{first_response} s")
            flag = 0
        buffer += chunk
    final_response = time.time() - st
    duration = len(buffer) / 2.0 / 24000
    print(f"sentence: {text}")
    print(f"尾包响应：{final_response} s")
    print(f"音频时长：{duration} s")
    print(f"RTF: {final_response / duration}")
    save_path = str(args.output_dir + "/" + utt_id + ".wav")
    save_audio(buffer, save_path)
    print("音频保存至：", save_path)
    return first_response, final_response, duration
 def count_engine(logfile: str="./nohup.out"):
    """For inference on the statistical engine side
    Args:
        logfile (str, optional): server log. Defaults to "./nohup.out".
    """
    first_response_list = []
    final_response_list = []
    duration_list = []
-    with open(logfile, "r") as f:
+    else:
-        for line in f.readlines():
+        print("Please set correct protocol, http or websocket")
            if "- first response time:" in line:
                first_response = float(line.splie(" ")[-2])
                first_response_list.append(first_response)
            elif "- final response time:" in line:
                final_response = float(line.splie(" ")[-2])
                final_response_list.append(final_response)
            elif "- The durations of audio is:" in line:
                duration = float(line.splie(" ")[-2])
                duration_list.append(duration)
-    assert (len(first_response_list) == len(final_response_list) and
+    return first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list
            len(final_response_list) == len(duration_list))
    avg_first_response = sum(first_response_list) / len(first_response_list)
    avg_final_response = sum(final_response_list) / len(final_response_list)
    avg_duration = sum(duration_list) / len(duration_list)
    RTF = sum(final_response_list) / sum(duration_list)
    print(
        "************************* engine result ***************************************"
    )
    print(
        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
    )
    print(
        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
    )
    print(
        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
    )
    print(
        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
    )
 if __name__ == "__main__":
@ -142,10 +60,18 @@ if __name__ == "__main__":
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
-        "--server", type=str, help="server ip", default="127.0.0.1")
+        "--server_ip", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    parser.add_argument(
        "--protocol",
        type=str,
        choices=['http', 'websocket'],
        help="server protocol",
        default="http")
    parser.add_argument(
        "--output_dir", type=str, default="./output", help="output dir")
    parser.add_argument(
        "--play", type=bool, help="whether to play audio", default=False)
    args = parser.parse_args()
@ -155,13 +81,35 @@ if __name__ == "__main__":
    first_response_list = []
    final_response_list = []
    duration_list = []
    all_delay_list = []
    packet_count = 0.0
    sentences = get_sentences(text_file=args.text, lang="zh")
    for utt_id, sentence in sentences:
-        first_response, final_response, duration = test(args, sentence, utt_id)
+        first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = test(
            args, sentence, utt_id)
        # Per-sentence bookkeeping: record response times and duration, then
        # report this sentence's streaming delays.
        delay_time_list = compute_delay(receive_time_list, chunk_duration_list)
        first_response_list.append(first_response)
        final_response_list.append(final_response)
        duration_list.append(duration)
        # Count each sentence's packets exactly once: packet_count is the
        # denominator of the overall delay ratio, so adding it twice per
        # sentence would halve that ratio.
        packet_count += len(receive_time_list)

        print(f"句子：{sentence}")
        print(f"首包响应时间：{first_response} s")
        print(f"尾包响应时间：{final_response} s")
        print(f"音频时长：{duration} s")
        print(f"该句RTF：{final_response/duration}")

        if delay_time_list:
            all_delay_list.extend(delay_time_list)
            print(
                f"该句流式合成的延迟情况：总包个数：{len(receive_time_list)}，延迟包个数：{len(delay_time_list)}, 最小延迟时间：{min(delay_time_list)} s, 最大延迟时间：{max(delay_time_list)} s, 平均延迟时间：{sum(delay_time_list)/len(delay_time_list)} s, 延迟率：{len(delay_time_list)/len(receive_time_list)}"
            )
        else:
            print("该句流式合成无延迟情况")
    assert (len(first_response_list) == len(final_response_list) and
            len(final_response_list) == len(duration_list))
@ -170,19 +118,35 @@ if __name__ == "__main__":
    avg_final_response = sum(final_response_list) / len(final_response_list)
    avg_duration = sum(duration_list) / len(duration_list)
    RTF = sum(final_response_list) / sum(duration_list)
    if all_delay_list != []:
        delay_count = len(all_delay_list)
        avg_delay = sum(all_delay_list) / len(all_delay_list)
        delay_ratio = len(all_delay_list) / packet_count
        min_delay = min(all_delay_list)
        max_delay = max(all_delay_list)
    else:
        delay_count = 0.0
        avg_delay = 0.0
        delay_ratio = 0.0
        min_delay = 0.0
        max_delay = 0.0
    print(
        "************************* server/client result ***************************************"
    )
    print(
-        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
+        f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}."
    )
    print(
        f"test num: {len(duration_list)}, packet count: {packet_count}, delay count: {delay_count}, avg delay time: {avg_delay} s, delay ratio: {delay_ratio} "
    )
    print(
        f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
    )
    print(
-        f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
+        f"min first response: {min(first_response_list)} s, max first response: {max(first_response_list)} s."
    )
    print(
-        f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
+        f"min final response: {min(final_response_list)} s, max final response: {max(final_response_list)} s."
    )
    print(f"min delay: {min_delay} s, max delay: {max_delay}")