commit
24f0a7d44b
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,36 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
import onnxruntime as ort
|
||||
|
||||
|
||||
def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None):
    """Create an onnxruntime inference session.

    Args:
        model_path: path of the onnx model file handed to InferenceSession.
        sess_conf: dict with keys "device" ('cpu' or 'gpu:id'), "use_trt"
            (bool) and "cpu_threads" (int). Defaults to a single-threaded
            CPU session when omitted.

    Returns:
        ort.InferenceSession: the configured session.
    """
    if sess_conf is None:
        # The original indexed a None default and crashed with a TypeError;
        # fall back to a sensible CPU configuration instead.
        sess_conf = {"device": "cpu", "use_trt": False, "cpu_threads": 1}

    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL

    # Default to CPU so `providers` is always bound, even for an
    # unrecognized device string (the original left it unbound).
    providers = ['CPUExecutionProvider']
    if "gpu" in sess_conf["device"]:
        # fastspeech2/mb_melgan can't use trt now!
        if sess_conf["use_trt"]:
            providers = ['TensorrtExecutionProvider']
        else:
            providers = ['CUDAExecutionProvider']
    elif sess_conf["device"] == "cpu":
        providers = ['CPUExecutionProvider']
    sess_options.intra_op_num_threads = sess_conf["cpu_threads"]
    sess = ort.InferenceSession(
        model_path, providers=providers, sess_options=sess_options)
    return sess
|
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/python
|
||||
import argparse
|
||||
import os
|
||||
|
||||
import yaml
|
||||
|
||||
def change_value(args):
    """Modify one key of the server yaml config file in place.

    The original file is copied to a ``*_tmp.yaml`` sibling, parsed,
    modified according to ``args.change_type`` and written back; the
    temporary copy is removed afterwards.

    Supported change_type values:
      * model       -- set ``y[engine_type][target_key]``; an "_onnx" suffix
                       is appended to the value for the onnx engine.
      * protocol    -- set ``y["protocol"]`` to "http" or "websocket".
      * engine_type -- set ``y["engine_list"]`` to ``[target_value]``.
      * device      -- set the device of the currently selected engine.
    """
    yamlfile = args.config_file
    change_type = args.change_type
    engine_type = args.engine_type
    target_key = args.target_key
    target_value = args.target_value

    # Snapshot the original so it can be re-opened for writing below.
    tmp_yamlfile = yamlfile.split(".yaml")[0] + "_tmp.yaml"
    # Plain file copy instead of shelling out to `cp` (portable and immune
    # to shell injection through the file name).
    with open(yamlfile, "rb") as src, open(tmp_yamlfile, "wb") as dst:
        dst.write(src.read())

    with open(tmp_yamlfile) as f, open(yamlfile, "w+", encoding="utf-8") as fw:
        y = yaml.safe_load(f)

        if change_type == "model":
            if engine_type == "tts_online-onnx":
                # Onnx engine models carry an "_onnx" suffix.
                target_value = target_value + "_onnx"
            y[engine_type][target_key] = target_value
        elif change_type == "protocol":
            assert (target_key == "protocol" and (
                target_value == "http" or target_value == "websocket"
            )), "if change_type is protocol, target_key must be set protocol."
            y[target_key] = target_value
        elif change_type == "engine_type":
            assert (
                target_key == "engine_list"
            ), "if change_type is engine_type, target_key must be set engine_list."
            y[target_key] = [target_value]
        elif change_type == "device":
            assert (
                target_key == "device"
            ), "if change_type is device, target_key must be set device."
            if y["engine_list"][0] == "tts_online":
                y["tts_online"]["device"] = target_value
            elif y["engine_list"][0] == "tts_online-onnx":
                # The onnx engine keeps one device per session config.
                y["tts_online-onnx"]["am_sess_conf"]["device"] = target_value
                y["tts_online-onnx"]["voc_sess_conf"]["device"] = target_value
            else:
                print(
                    "Error engine_list, please set tts_online or tts_online-onnx"
                )

        else:
            print("Error change_type, please set correct change_type.")

        print(yaml.dump(y, default_flow_style=False, sort_keys=False))
        yaml.dump(y, fw, allow_unicode=True)
    # Plain unlink instead of `os.system("rm ...")`.
    os.remove(tmp_yamlfile)
    print(f"Change key: {target_key} to value: {target_value} successfully.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point: parse the requested config change and
    # apply it with change_value().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--config_file',
        type=str,
        default='./conf/application.yaml',
        help='server yaml file.')
    parser.add_argument(
        '--change_type',
        type=str,
        default="model",
        choices=["model", "protocol", "engine_type", "device"],
        help='change protocol', )
    parser.add_argument(
        '--engine_type',
        type=str,
        default="tts_online",
        help='engine type',
        choices=["tts_online", "tts_online-onnx"])
    parser.add_argument(
        '--target_key',
        type=str,
        default=None,
        help='Change key',
        required=True)
    parser.add_argument(
        '--target_value',
        type=str,
        default=None,
        help='target value',
        required=True)

    args = parser.parse_args()

    change_value(args)
    # NOTE: a stale commented-out dispatch block (change_value/change_protocol)
    # that lived here as a dead triple-quoted string has been removed.
|
@ -0,0 +1,88 @@
|
||||
# This is the parameter configuration file for PaddleSpeech Serving.
|
||||
|
||||
#################################################################################
|
||||
# SERVER SETTING #
|
||||
#################################################################################
|
||||
host: 127.0.0.1
|
||||
port: 8092
|
||||
|
||||
# The task format in the engine_list is: <speech task>_<engine type>
|
||||
# task choices = ['tts_online', 'tts_online-onnx']
|
||||
# protocol = ['websocket', 'http'] (only one can be selected).
|
||||
protocol: 'http'
|
||||
engine_list: ['tts_online']
|
||||
|
||||
|
||||
#################################################################################
|
||||
# ENGINE CONFIG #
|
||||
#################################################################################
|
||||
|
||||
################################### TTS #########################################
|
||||
################### speech task: tts; engine_type: online #######################
|
||||
tts_online:
|
||||
# am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
|
||||
am: 'fastspeech2_cnndecoder_csmsc'
|
||||
am_config:
|
||||
am_ckpt:
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
# voc (vocoder) choices=['mb_melgan_csmsc', 'hifigan_csmsc']
|
||||
voc: 'mb_melgan_csmsc'
|
||||
voc_config:
|
||||
voc_ckpt:
|
||||
voc_stat:
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
am_block: 42
|
||||
am_pad: 12
|
||||
voc_block: 14
|
||||
voc_pad: 14
|
||||
|
||||
|
||||
|
||||
#################################################################################
|
||||
# ENGINE CONFIG #
|
||||
#################################################################################
|
||||
|
||||
################################### TTS #########################################
|
||||
################### speech task: tts; engine_type: online-onnx #######################
|
||||
tts_online-onnx:
|
||||
# am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
|
||||
am: 'fastspeech2_cnndecoder_csmsc_onnx'
|
||||
# am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
|
||||
# if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
|
||||
am_ckpt: # list
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
am_sample_rate: 24000
|
||||
am_sess_conf:
|
||||
device: "cpu" # set 'gpu:id' or 'cpu'
|
||||
use_trt: False
|
||||
cpu_threads: 1
|
||||
|
||||
# voc (vocoder) choices=['mb_melgan_csmsc_onnx', 'hifigan_csmsc_onnx']
|
||||
voc: 'mb_melgan_csmsc_onnx'
|
||||
voc_ckpt:
|
||||
voc_sample_rate: 24000
|
||||
voc_sess_conf:
|
||||
device: "cpu" # set 'gpu:id' or 'cpu'
|
||||
use_trt: False
|
||||
cpu_threads: 1
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
am_block: 42
|
||||
am_pad: 12
|
||||
voc_block: 14
|
||||
voc_pad: 14
|
||||
voc_upsample: 300
|
||||
|
@ -0,0 +1,100 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from paddlespeech.server.utils.audio_process import pcm2wav
|
||||
|
||||
|
||||
def save_audio(buffer, audio_path) -> bool:
    """Save raw pcm bytes to *audio_path* as a .pcm or .wav file.

    Args:
        buffer: raw pcm audio bytes (16-bit mono at 24 kHz when converting
            to wav).
        audio_path: destination path; must end with "pcm" or "wav".

    Returns:
        bool: True on success, False for an unsupported extension.
    """
    # Bug fix: the original ignored the ``audio_path`` parameter and wrote
    # to the module-global ``args.save_path`` instead.
    if audio_path.endswith("pcm"):
        with open(audio_path, "wb") as f:
            f.write(buffer)
    elif audio_path.endswith("wav"):
        # pcm2wav reads from a file, so stage the bytes in a temp file first.
        with open("./tmp.pcm", "wb") as f:
            f.write(buffer)
        pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
        os.remove("./tmp.pcm")  # plain unlink instead of shelling out to `rm`
    else:
        print("Only supports saved audio format is pcm or wav")
        return False

    return True
|
||||
|
||||
|
||||
def test(args):
    """Stream TTS audio over http and report latency statistics.

    Posts the synthesis request to the server's streaming endpoint, decodes
    the base64 chunks as they arrive, prints first-packet / final-packet
    latency and RTF, and optionally saves the audio via save_audio().
    """
    params = {
        "text": args.text,
        "spk_id": args.spk_id,
        "speed": args.speed,
        "volume": args.volume,
        "sample_rate": args.sample_rate,
        "save_path": ''
    }

    buffer = b''
    flag = 1  # true until the first audio chunk has arrived
    url = "http://" + str(args.server) + ":" + str(
        args.port) + "/paddlespeech/streaming/tts"
    st = time.time()
    html = requests.post(url, json.dumps(params), stream=True)
    for chunk in html.iter_content(chunk_size=1024):
        chunk = base64.b64decode(chunk)  # bytes
        if flag:
            # Time to the first audio packet ("首包响应").
            first_response = time.time() - st
            print(f"首包响应:{first_response} s")
            flag = 0
        buffer += chunk

    final_response = time.time() - st
    # 16-bit samples at 24 kHz -> seconds of audio (assumes mono pcm --
    # consistent with save_audio's pcm2wav parameters).
    duration = len(buffer) / 2.0 / 24000

    print(f"尾包响应:{final_response} s")
    print(f"音频时长:{duration} s")
    print(f"RTF: {final_response / duration}")

    if args.save_path is not None:
        if save_audio(buffer, args.save_path):
            print("音频保存至:", args.save_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command line entry point of the streaming TTS http client.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text',
        type=str,
        default="您好,欢迎使用语音合成服务。",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
    parser.add_argument(
        '--volume', type=float, default=1.0, help='Audio volume')
    parser.add_argument(
        '--sample_rate',
        type=int,
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    parser.add_argument(
        "--save_path", type=str, help="save audio path", default=None)

    args = parser.parse_args()
    test(args)
|
@ -0,0 +1,315 @@
|
||||
#!/bin/bash
|
||||
# bash test.sh
|
||||
|
||||
# Launch paddlespeech_server in the background with $config_file and poll the
# error log until one more successful start banner appears, or a known
# startup error is detected. Writes the server PID to ./pid and sets the
# globals start_num, error_time and flag ("normal" on success, "unnormal"
# on failure).
StartService(){
    # Start service
    paddlespeech_server start --config_file $config_file 1>>$log/server.log 2>>$log/server.log.wf &
    echo $! > pid

    start_num=$(cat $log/server.log.wf | grep "INFO: Uvicorn running on http://" -c)
    flag="normal"
    # Wait until the number of recorded startups reaches $target_start_num.
    while [[ $start_num -lt $target_start_num && $flag == "normal" ]]
    do
        start_num=$(cat $log/server.log.wf | grep "INFO: Uvicorn running on http://" -c)
        # start service failed
        if [ $(cat $log/server.log.wf | grep -i "Failed to warm up on tts engine." -c) -gt $error_time ];then
            echo "Service started failed." | tee -a $log/test_result.log
            error_time=$(cat $log/server.log.wf | grep -i "Failed to warm up on tts engine." -c)
            flag="unnormal"

        elif [ $(cat $log/server.log.wf | grep -i "AssertionError" -c) -gt $error_time ];then
            echo "Service started failed." | tee -a $log/test_result.log
            error_time+=$(cat $log/server.log.wf | grep -i "AssertionError" -c)
            flag="unnormal"
        fi
    done
}
|
||||
|
||||
# Run the http client three times against the running server; bumps the
# global http_test_times counter once per attempt.
ClientTest_http(){
    for ((i=1; i<=3;i++))
    do
        python http_client.py --save_path ./out_http.wav
        ((http_test_times+=1))
    done
}
|
||||
|
||||
# Run the websocket client three times against the running server; bumps the
# global ws_test_times counter once per attempt.
ClientTest_ws(){
    for ((i=1; i<=3;i++))
    do
        python ws_client.py
        ((ws_test_times+=1))
    done
}
|
||||
|
||||
# Compare the number of "200 OK" responses in the server log with the number
# of http client runs and log pass/fail for the current round ($info).
# Resets http_test_times to the observed success count for the next round.
GetTestResult_http() {
    # Determine if the test was successful
    http_response_success_time=$(cat $log/server.log | grep "200 OK" -c)
    if (( $http_response_success_time == $http_test_times )) ; then
        echo "Testing successfully. $info" | tee -a $log/test_result.log
    else
        echo "Testing failed. $info" | tee -a $log/test_result.log
    fi
    http_test_times=$http_response_success_time
}
|
||||
|
||||
# Compare the number of completed websocket audio transmissions in the error
# log with the number of ws client runs and log pass/fail for the current
# round ($info). Resets ws_test_times to the observed success count.
GetTestResult_ws() {
    # Determine if the test was successful
    ws_response_success_time=$(cat $log/server.log.wf | grep "Complete the transmission of audio streams" -c)
    if (( $ws_response_success_time == $ws_test_times )) ; then
        echo "Testing successfully. $info" | tee -a $log/test_result.log
    else
        echo "Testing failed. $info" | tee -a $log/test_result.log
    fi
    ws_test_times=$ws_response_success_time
}
|
||||
|
||||
|
||||
# ------------------------------- main -------------------------------
# Usage: bash test.sh <engine_type> <log_dir>
engine_type=$1
log=$2
mkdir -p $log
rm -rf $log/server.log.wf
rm -rf $log/server.log
rm -rf $log/test_result.log

config_file=./conf/application.yaml
server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')

echo "Service ip: $server_ip" | tee $log/test_result.log
echo "Service port: $port" | tee -a $log/test_result.log

# whether a process is listening on $port
pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
if [ "$pid" != "" ]; then
    echo "The port: $port is occupied, please change another port"
    exit
fi

target_start_num=0  # the number of start service
test_times=0        # The number of client test
error_time=0        # The number of error occurrences in the startup failure server.log.wf file

# RunRound <http|ws>: one full test round with the current config --
# start the server, run the matching client test, collect the result and
# shut the server down. Relies on the globals $info, $target_start_num,
# $start_num and $flag; the caller sets $info beforehand.
RunRound() {
    echo "$info" | tee -a $log/test_result.log
    ((target_start_num+=1))
    StartService

    if [[ $start_num -eq $target_start_num && $flag == "normal" ]]; then
        echo "Service started successfully." | tee -a $log/test_result.log
        if [ "$1" == "http" ]; then
            ClientTest_http
        else
            ClientTest_ws
        fi
        echo "This round of testing is over." | tee -a $log/test_result.log

        if [ "$1" == "http" ]; then
            GetTestResult_http
        else
            GetTestResult_ws
        fi
    else
        echo "Service failed to start, no client test."
        target_start_num=$start_num
    fi

    kill -9 `cat pid`
    rm -rf pid
    sleep 2s
    echo "**************************************************************************************" | tee -a $log/test_result.log
}

# ----------------------------- http rounds -----------------------------
info="start server: engine: $engine_type, protocol: http, am: fastspeech2_cnndecoder_csmsc, voc: mb_melgan_csmsc."
RunRound http

python change_yaml.py --engine_type $engine_type --target_key voc --target_value hifigan_csmsc # change voc: mb_melgan_csmsc -> hifigan_csmsc
info="start server: engine: $engine_type, protocol: http, am: fastspeech2_cnndecoder_csmsc, voc: hifigan_csmsc."
RunRound http

python change_yaml.py --engine_type $engine_type --target_key am --target_value fastspeech2_csmsc # change am: fastspeech2_cnndecoder_csmsc -> fastspeech2_csmsc
info="start server: engine: $engine_type, protocol: http, am: fastspeech2_csmsc, voc: hifigan_csmsc."
RunRound http

python change_yaml.py --engine_type $engine_type --target_key voc --target_value mb_melgan_csmsc # change voc: hifigan_csmsc -> mb_melgan_csmsc
info="start server: engine: $engine_type, protocol: http, am: fastspeech2_csmsc, voc: mb_melgan_csmsc."
RunRound http

# --------------------------- websocket rounds ---------------------------
echo "********************************************* websocket **********************************************************"

python change_yaml.py --engine_type $engine_type --change_type protocol --target_key protocol --target_value websocket
info="start server: engine: $engine_type, protocol: websocket, am: fastspeech2_csmsc, voc: mb_melgan_csmsc."
RunRound ws

python change_yaml.py --engine_type $engine_type --target_key voc --target_value hifigan_csmsc # change voc: mb_melgan_csmsc -> hifigan_csmsc
info="start server: engine: $engine_type, protocol: websocket, am: fastspeech2_csmsc, voc: hifigan_csmsc."
RunRound ws

python change_yaml.py --engine_type $engine_type --target_key am --target_value fastspeech2_cnndecoder_csmsc # change am: fastspeech2_csmsc -> fastspeech2_cnndecoder_csmsc
info="start server: engine: $engine_type, protocol: websocket, am: fastspeech2_cnndecoder_csmsc, voc: hifigan_csmsc."
RunRound ws

python change_yaml.py --engine_type $engine_type --target_key voc --target_value mb_melgan_csmsc # change voc: hifigan_csmsc -> mb_melgan_csmsc
info="start server: engine: $engine_type, protocol: websocket, am: fastspeech2_cnndecoder_csmsc, voc: mb_melgan_csmsc."
RunRound ws

echo "All tests completed." | tee -a $log/test_result.log

# show all the test results
echo "***************** Here are all the test results ********************"
cat $log/test_result.log

# Restoring conf is the same as demos/speech_server
cp ./tts_online_application.yaml ./conf/application.yaml -rf
sleep 2s
@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# bash test_all.sh
# Run the full test matrix: {tts_online, tts_online-onnx} x {cpu, gpu},
# switching the server config between rounds with change_yaml.py.
# (test.sh restores the default config when it finishes.)

log_all_dir=./log

# engine: tts_online, device: cpu (default config)
bash test.sh tts_online $log_all_dir/log_tts_online_cpu

# engine: tts_online-onnx, device: cpu
python change_yaml.py --change_type engine_type --target_key engine_list --target_value tts_online-onnx
bash test.sh tts_online-onnx $log_all_dir/log_tts_online-onnx_cpu


# engine: tts_online, device: gpu
python change_yaml.py --change_type device --target_key device --target_value gpu:3
bash test.sh tts_online $log_all_dir/log_tts_online_gpu

# engine: tts_online-onnx, device: gpu
python change_yaml.py --change_type engine_type --target_key engine_list --target_value tts_online-onnx
python change_yaml.py --change_type device --target_key device --target_value gpu:3
bash test.sh tts_online-onnx $log_all_dir/log_tts_online-onnx_gpu

echo "************************************** show all test results ****************************************"
cat $log_all_dir/log_tts_online_cpu/test_result.log
cat $log_all_dir/log_tts_online-onnx_cpu/test_result.log
cat $log_all_dir/log_tts_online_gpu/test_result.log
cat $log_all_dir/log_tts_online-onnx_gpu/test_result.log
@ -0,0 +1,88 @@
|
||||
# This is the parameter configuration file for PaddleSpeech Serving.
|
||||
|
||||
#################################################################################
|
||||
# SERVER SETTING #
|
||||
#################################################################################
|
||||
host: 127.0.0.1
|
||||
port: 8092
|
||||
|
||||
# The task format in the engine_list is: <speech task>_<engine type>
|
||||
# task choices = ['tts_online', 'tts_online-onnx']
|
||||
# protocol = ['websocket', 'http'] (only one can be selected).
|
||||
protocol: 'http'
|
||||
engine_list: ['tts_online']
|
||||
|
||||
|
||||
#################################################################################
|
||||
# ENGINE CONFIG #
|
||||
#################################################################################
|
||||
|
||||
################################### TTS #########################################
|
||||
################### speech task: tts; engine_type: online #######################
|
||||
tts_online:
|
||||
# am (acoustic model) choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc']
|
||||
am: 'fastspeech2_cnndecoder_csmsc'
|
||||
am_config:
|
||||
am_ckpt:
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
|
||||
# voc (vocoder) choices=['mb_melgan_csmsc', 'hifigan_csmsc']
|
||||
voc: 'mb_melgan_csmsc'
|
||||
voc_config:
|
||||
voc_ckpt:
|
||||
voc_stat:
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
device: 'cpu' # set 'gpu:id' or 'cpu'
|
||||
am_block: 42
|
||||
am_pad: 12
|
||||
voc_block: 14
|
||||
voc_pad: 14
|
||||
|
||||
|
||||
|
||||
#################################################################################
|
||||
# ENGINE CONFIG #
|
||||
#################################################################################
|
||||
|
||||
################################### TTS #########################################
|
||||
################### speech task: tts; engine_type: online-onnx #######################
|
||||
tts_online-onnx:
|
||||
# am (acoustic model) choices=['fastspeech2_csmsc_onnx', 'fastspeech2_cnndecoder_csmsc_onnx']
|
||||
am: 'fastspeech2_cnndecoder_csmsc_onnx'
|
||||
# am_ckpt is a list, if am is fastspeech2_cnndecoder_csmsc_onnx, am_ckpt = [encoder model, decoder model, postnet model];
|
||||
# if am is fastspeech2_csmsc_onnx, am_ckpt = [ckpt model];
|
||||
am_ckpt: # list
|
||||
am_stat:
|
||||
phones_dict:
|
||||
tones_dict:
|
||||
speaker_dict:
|
||||
spk_id: 0
|
||||
am_sample_rate: 24000
|
||||
am_sess_conf:
|
||||
device: "cpu" # set 'gpu:id' or 'cpu'
|
||||
use_trt: False
|
||||
cpu_threads: 1
|
||||
|
||||
# voc (vocoder) choices=['mb_melgan_csmsc_onnx', 'hifigan_csmsc_onnx']
|
||||
voc: 'mb_melgan_csmsc_onnx'
|
||||
voc_ckpt:
|
||||
voc_sample_rate: 24000
|
||||
voc_sess_conf:
|
||||
device: "cpu" # set 'gpu:id' or 'cpu'
|
||||
use_trt: False
|
||||
cpu_threads: 1
|
||||
|
||||
# others
|
||||
lang: 'zh'
|
||||
am_block: 42
|
||||
am_pad: 12
|
||||
voc_block: 14
|
||||
voc_pad: 14
|
||||
voc_upsample: 300
|
||||
|
@ -0,0 +1,126 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import _thread as thread
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import ssl
|
||||
import time
|
||||
|
||||
import websocket
|
||||
|
||||
flag = 1
|
||||
st = 0.0
|
||||
all_bytes = b''
|
||||
|
||||
|
||||
class WsParam(object):
    """Connection parameters for the streaming TTS websocket client."""

    def __init__(self, text, server="127.0.0.1", port=8090):
        self.text = text
        self.server = server
        self.port = port
        # Endpoint of the server's streaming TTS websocket handler.
        self.url = "ws://{}:{}/ws/tts".format(server, port)

    def create_url(self):
        """Return the websocket endpoint URL."""
        return self.url
|
||||
|
||||
|
||||
def on_message(ws, message):
    """Handle one streaming TTS message from the server.

    Each message is a JSON object with an "audio" field (base64-encoded pcm
    bytes) and a "status" field: 0 = session created, 1 = intermediate
    chunk, 2 = final chunk (all audio is flushed to ./out.pcm and the
    socket is closed). Latency statistics are accumulated in the module
    globals flag / st / all_bytes.
    """
    global flag
    global st
    global all_bytes

    try:
        message = json.loads(message)
        audio = message["audio"]
        audio = base64.b64decode(audio)  # bytes
        status = message["status"]
        all_bytes += audio

        if status == 0:
            print("create successfully.")
        elif status == 1:
            if flag:
                # First audio chunk: report time-to-first-packet.
                print(f"首包响应:{time.time() - st} s")
                flag = 0
        elif status == 2:
            final_response = time.time() - st
            # 16-bit samples at 24 kHz -> seconds of audio
            # (assumes mono pcm -- TODO confirm against the server).
            duration = len(all_bytes) / 2.0 / 24000
            print(f"尾包响应:{final_response} s")
            print(f"音频时长:{duration} s")
            print(f"RTF: {final_response / duration}")
            with open("./out.pcm", "wb") as f:
                f.write(all_bytes)
            print("ws is closed")
            ws.close()
        else:
            print("infer error")

    except Exception as e:
        print("receive msg,but parse exception:", e)
|
||||
|
||||
|
||||
# Callback invoked by websocket-client when an error occurs on the connection.
def on_error(ws, error):
    print("### error:", error)
|
||||
|
||||
|
||||
# 收到websocket关闭的处理
|
||||
def on_close(ws):
|
||||
print("### closed ###")
|
||||
|
||||
|
||||
# 收到websocket连接建立的处理
|
||||
def on_open(ws):
|
||||
def run(*args):
|
||||
global st
|
||||
text_base64 = str(
|
||||
base64.b64encode((wsParam.text).encode('utf-8')), "UTF8")
|
||||
d = {"text": text_base64}
|
||||
d = json.dumps(d)
|
||||
print("Start sending text data")
|
||||
st = time.time()
|
||||
ws.send(d)
|
||||
|
||||
thread.start_new_thread(run, ())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--text",
|
||||
type=str,
|
||||
help="A sentence to be synthesized",
|
||||
default="您好,欢迎使用语音合成服务。")
|
||||
parser.add_argument(
|
||||
"--server", type=str, help="server ip", default="127.0.0.1")
|
||||
parser.add_argument("--port", type=int, help="server port", default=8092)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("***************************************")
|
||||
print("Server ip: ", args.server)
|
||||
print("Server port: ", args.port)
|
||||
print("Sentence to be synthesized: ", args.text)
|
||||
print("***************************************")
|
||||
|
||||
wsParam = WsParam(text=args.text, server=args.server, port=args.port)
|
||||
|
||||
websocket.enableTrace(False)
|
||||
wsUrl = wsParam.create_url()
|
||||
ws = websocket.WebSocketApp(
|
||||
wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
|
||||
ws.on_open = on_open
|
||||
ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
|
@ -0,0 +1,188 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from paddlespeech.server.utils.audio_process import pcm2wav
|
||||
from paddlespeech.t2s.exps.syn_utils import get_sentences
|
||||
|
||||
|
||||
def save_audio(buffer, audio_path) -> bool:
|
||||
if audio_path.endswith("pcm"):
|
||||
with open(audio_path, "wb") as f:
|
||||
f.write(buffer)
|
||||
elif audio_path.endswith("wav"):
|
||||
with open("./tmp.pcm", "wb") as f:
|
||||
f.write(buffer)
|
||||
pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
|
||||
os.system("rm ./tmp.pcm")
|
||||
else:
|
||||
print("Only supports saved audio format is pcm or wav")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def test(args, text, utt_id):
|
||||
params = {
|
||||
"text": text,
|
||||
"spk_id": args.spk_id,
|
||||
"speed": args.speed,
|
||||
"volume": args.volume,
|
||||
"sample_rate": args.sample_rate,
|
||||
"save_path": ''
|
||||
}
|
||||
|
||||
buffer = b''
|
||||
flag = 1
|
||||
url = "http://" + str(args.server) + ":" + str(
|
||||
args.port) + "/paddlespeech/streaming/tts"
|
||||
st = time.time()
|
||||
html = requests.post(url, json.dumps(params), stream=True)
|
||||
for chunk in html.iter_content(chunk_size=1024):
|
||||
chunk = base64.b64decode(chunk) # bytes
|
||||
if flag:
|
||||
first_response = time.time() - st
|
||||
print(f"首包响应:{first_response} s")
|
||||
flag = 0
|
||||
buffer += chunk
|
||||
|
||||
final_response = time.time() - st
|
||||
duration = len(buffer) / 2.0 / 24000
|
||||
|
||||
print(f"sentence: {text}")
|
||||
print(f"尾包响应:{final_response} s")
|
||||
print(f"音频时长:{duration} s")
|
||||
print(f"RTF: {final_response / duration}")
|
||||
|
||||
save_path = str(args.output_dir + "/" + utt_id + ".wav")
|
||||
save_audio(buffer, save_path)
|
||||
print("音频保存至:", save_path)
|
||||
|
||||
return first_response, final_response, duration
|
||||
|
||||
|
||||
def count_engine(logfile: str="./nohup.out"):
|
||||
"""For inference on the statistical engine side
|
||||
|
||||
Args:
|
||||
logfile (str, optional): server log. Defaults to "./nohup.out".
|
||||
"""
|
||||
first_response_list = []
|
||||
final_response_list = []
|
||||
duration_list = []
|
||||
|
||||
with open(logfile, "r") as f:
|
||||
for line in f.readlines():
|
||||
if "- first response time:" in line:
|
||||
first_response = float(line.splie(" ")[-2])
|
||||
first_response_list.append(first_response)
|
||||
elif "- final response time:" in line:
|
||||
final_response = float(line.splie(" ")[-2])
|
||||
final_response_list.append(final_response)
|
||||
elif "- The durations of audio is:" in line:
|
||||
duration = float(line.splie(" ")[-2])
|
||||
duration_list.append(duration)
|
||||
|
||||
assert (len(first_response_list) == len(final_response_list) and
|
||||
len(final_response_list) == len(duration_list))
|
||||
|
||||
avg_first_response = sum(first_response_list) / len(first_response_list)
|
||||
avg_final_response = sum(final_response_list) / len(final_response_list)
|
||||
avg_duration = sum(duration_list) / len(duration_list)
|
||||
RTF = sum(final_response_list) / sum(duration_list)
|
||||
|
||||
print(
|
||||
"************************* engine result ***************************************"
|
||||
)
|
||||
print(
|
||||
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
|
||||
)
|
||||
print(
|
||||
f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
|
||||
)
|
||||
print(
|
||||
f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
|
||||
)
|
||||
print(
|
||||
f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--text",
|
||||
type=str,
|
||||
default="../../../../../../paddlespeech/t2s/exps/csmsc_test.txt",
|
||||
help="text to synthesize, a 'utt_id sentence' pair per line")
|
||||
parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
|
||||
parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
|
||||
parser.add_argument(
|
||||
'--volume', type=float, default=1.0, help='Audio volume')
|
||||
parser.add_argument(
|
||||
'--sample_rate',
|
||||
type=int,
|
||||
default=0,
|
||||
help='Sampling rate, the default is the same as the model')
|
||||
parser.add_argument(
|
||||
"--server", type=str, help="server ip", default="127.0.0.1")
|
||||
parser.add_argument("--port", type=int, help="server port", default=8092)
|
||||
parser.add_argument(
|
||||
"--output_dir", type=str, default="./output", help="output dir")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
os.system("rm -rf %s" % (args.output_dir))
|
||||
os.mkdir(args.output_dir)
|
||||
|
||||
first_response_list = []
|
||||
final_response_list = []
|
||||
duration_list = []
|
||||
|
||||
sentences = get_sentences(text_file=args.text, lang="zh")
|
||||
for utt_id, sentence in sentences:
|
||||
first_response, final_response, duration = test(args, sentence, utt_id)
|
||||
first_response_list.append(first_response)
|
||||
final_response_list.append(final_response)
|
||||
duration_list.append(duration)
|
||||
|
||||
assert (len(first_response_list) == len(final_response_list) and
|
||||
len(final_response_list) == len(duration_list))
|
||||
|
||||
avg_first_response = sum(first_response_list) / len(first_response_list)
|
||||
avg_final_response = sum(final_response_list) / len(final_response_list)
|
||||
avg_duration = sum(duration_list) / len(duration_list)
|
||||
RTF = sum(final_response_list) / sum(duration_list)
|
||||
|
||||
print(
|
||||
"************************* server/client result ***************************************"
|
||||
)
|
||||
print(
|
||||
f"test num: {len(duration_list)}, avg first response: {avg_first_response} s, avg final response: {avg_final_response} s, avg duration: {avg_duration}, RTF: {RTF}"
|
||||
)
|
||||
print(
|
||||
f"min duration: {min(duration_list)} s, max duration: {max(duration_list)} s"
|
||||
)
|
||||
print(
|
||||
f"max first response: {max(first_response_list)} s, min first response: {min(first_response_list)} s"
|
||||
)
|
||||
print(
|
||||
f"max final response: {max(final_response_list)} s, min final response: {min(final_response_list)} s"
|
||||
)
|
Loading…
Reference in new issue