add stream tts server, test=doc

pull/1652/head
lym0302 3 years ago
parent 3c8f30c7a4
commit 603e565ab1

@ -0,0 +1,46 @@
# This is the parameter configuration file for PaddleSpeech Serving.
#################################################################################
# SERVER SETTING #
#################################################################################
host: 127.0.0.1
port: 8092
# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_online', 'tts_online']
# protocol = ['websocket', 'http'] (only one can be selected).
protocol: 'http'
engine_list: ['tts_online']
#################################################################################
# ENGINE CONFIG #
#################################################################################
################################### TTS #########################################
################### speech task: tts; engine_type: online #######################
tts_online:
    # am (acoustic model) choices=['fastspeech2_csmsc']
    am: 'fastspeech2_csmsc'
    am_config:
    am_ckpt:
    am_stat:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0
    # voc (vocoder) choices=['mb_melgan_csmsc']
    voc: 'mb_melgan_csmsc'
    voc_config:
    voc_ckpt:
    voc_stat:
    # others
    lang: 'zh'
    device:       # set 'gpu:id' or 'cpu'
    am_block: 42
    am_pad: 12
    voc_block: 14
    voc_pad: 14
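The block/pad values above are counted in mel frames; the vocoder turns each frame into n_shift audio samples, so voc_block decides how much audio every streamed chunk carries. A back-of-the-envelope sketch, assuming mb_melgan_csmsc uses a hop size (n_shift) of 300 at 24 kHz (illustrative values, not taken from this diff; check the model's own config):

n_shift = 300          # audio samples per mel frame (assumed)
sample_rate = 24000    # Hz (assumed)
voc_block = 14         # mel frames kept per vocoder chunk (from this config)
voc_pad = 14           # mel frames of overlap on each side (from this config)

samples_per_chunk = voc_block * n_shift
print(samples_per_chunk)                  # 4200 samples
print(samples_per_chunk / sample_rate)    # 0.175 s of audio per streamed chunk
# Each chunk is synthesized from voc_block plus up to 2 * voc_pad frames of mel,
# but only the central voc_block frames' worth of audio is kept and streamed.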

@ -34,6 +34,9 @@ class EngineFactory(object):
        elif engine_name == 'tts' and engine_type == 'python':
            from paddlespeech.server.engine.tts.python.tts_engine import TTSEngine
            return TTSEngine()
        elif engine_name == 'tts' and engine_type == 'online':
            from paddlespeech.server.engine.tts.online.tts_engine import TTSEngine
            return TTSEngine()
        elif engine_name == 'cls' and engine_type == 'inference':
            from paddlespeech.server.engine.cls.paddleinference.cls_engine import CLSEngine
            return CLSEngine()
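For reference, the config's engine_list entries follow the "<speech task>_<engine type>" convention noted above; a minimal sketch of how an entry such as 'tts_online' maps onto the (engine_name, engine_type) pair this factory dispatches on (the split itself is an assumption about the server's bootstrap code, shown only for illustration):

entry = 'tts_online'
engine_name, engine_type = entry.split('_', 1)
print(engine_name, engine_type)   # tts online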

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,305 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import io
import math
import time

import librosa
import numpy as np
import paddle
import soundfile as sf
from scipy.io import wavfile

from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import change_speed
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.config import get_config
from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.exception import ServerBaseException
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks

__all__ = ['TTSEngine']
class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()

    @paddle.no_grad()
    def infer(self,
              text: str,
              lang: str='zh',
              am: str='fastspeech2_csmsc',
              spk_id: int=0,
              am_block: int=42,
              am_pad: int=12,
              voc_block: int=14,
              voc_pad: int=14):
        """
        Model inference and result stored in self.output.
        """
        am_name = am[:am.rindex('_')]
        am_dataset = am[am.rindex('_') + 1:]
        get_tone_ids = False
        merge_sentences = False
        frontend_st = time.time()
        if am_name == 'speedyspeech':
            get_tone_ids = True
        if lang == 'zh':
            input_ids = self.frontend.get_input_ids(
                text,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids)
            phone_ids = input_ids["phone_ids"]
            if get_tone_ids:
                tone_ids = input_ids["tone_ids"]
        elif lang == 'en':
            input_ids = self.frontend.get_input_ids(
                text, merge_sentences=merge_sentences)
            phone_ids = input_ids["phone_ids"]
        else:
            print("lang should be in {'zh', 'en'}!")
        self.frontend_time = time.time() - frontend_st

        for i in range(len(phone_ids)):
            am_st = time.time()
            part_phone_ids = phone_ids[i]
            # am
            if am_name == 'speedyspeech':
                part_tone_ids = tone_ids[i]
                mel = self.am_inference(part_phone_ids, part_tone_ids)
            # fastspeech2
            else:
                # multi speaker
                if am_dataset in {"aishell3", "vctk"}:
                    mel = self.am_inference(
                        part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                else:
                    mel = self.am_inference(part_phone_ids)
            am_et = time.time()

            # voc streaming
            voc_upsample = self.voc_config.n_shift
            mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc")
            chunk_num = len(mel_chunks)
            voc_st = time.time()
            for chunk_id, mel_chunk in enumerate(mel_chunks):
                sub_wav = self.voc_inference(mel_chunk)
                front_pad = min(chunk_id * voc_block, voc_pad)
                # keep only the un-padded, central part of each synthesized chunk
                if chunk_id == 0:
                    sub_wav = sub_wav[:voc_block * voc_upsample]
                elif chunk_id == chunk_num - 1:
                    sub_wav = sub_wav[front_pad * voc_upsample:]
                else:
                    sub_wav = sub_wav[front_pad * voc_upsample:(
                        front_pad + voc_block) * voc_upsample]

                yield sub_wav
class TTSEngine(BaseEngine):
    """TTS server engine

    Args:
        metaclass: Defaults to Singleton.
    """

    def __init__(self, name=None):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()

    def init(self, config: dict) -> bool:
        self.executor = TTSServerExecutor()

        try:
            self.config = config
            if self.config.device:
                self.device = self.config.device
            else:
                self.device = paddle.get_device()
            paddle.set_device(self.device)
        except Exception as e:
            logger.error(
                "Failed to set device. Please check whether the device is already in use and whether the 'device' parameter in the yaml file is valid."
            )
            logger.error("Initialize TTS server engine Failed on device: %s." %
                         (self.device))
            return False

        try:
            self.executor._init_from_path(
                am=self.config.am,
                am_config=self.config.am_config,
                am_ckpt=self.config.am_ckpt,
                am_stat=self.config.am_stat,
                phones_dict=self.config.phones_dict,
                tones_dict=self.config.tones_dict,
                speaker_dict=self.config.speaker_dict,
                voc=self.config.voc,
                voc_config=self.config.voc_config,
                voc_ckpt=self.config.voc_ckpt,
                voc_stat=self.config.voc_stat,
                lang=self.config.lang)
        except Exception as e:
            logger.error("Failed to get model related files.")
            logger.error("Initialize TTS server engine Failed on device: %s." %
                         (self.device))
            return False

        self.am_block = self.config.am_block
        self.am_pad = self.config.am_pad
        self.voc_block = self.config.voc_block
        self.voc_pad = self.config.voc_pad

        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.device))
        return True
    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
        # Convert bytes to text
        if text_bese64:
            text_bytes = base64.b64decode(text_bese64)  # base64 to bytes
        text = text_bytes.decode('utf-8')  # bytes to text
        return text

    def postprocess(self,
                    wav,
                    original_fs: int,
                    target_fs: int=0,
                    volume: float=1.0,
                    speed: float=1.0,
                    audio_path: str=None):
        """Post-processing operations, including speed, volume and sample rate conversion, and saving the audio file.

        Args:
            wav (numpy(float)): Synthesized audio sample points
            original_fs (int): original audio sample rate
            target_fs (int): target audio sample rate
            volume (float): target volume
            speed (float): target speed

        Raises:
            ServerBaseException: Throws an exception if changing the speed fails.

        Returns:
            target_fs: target sample rate for synthesized audio.
            wav_base64: The base64 format of the synthesized audio.
        """
        # transform sample_rate
        if target_fs == 0 or target_fs > original_fs:
            target_fs = original_fs
            wav_tar_fs = wav
            logger.info(
                "The sample rate of the synthesized audio is the same as the model's, which is {} Hz.".
                format(original_fs))
        else:
            wav_tar_fs = librosa.resample(
                np.squeeze(wav), original_fs, target_fs)
            logger.info(
                "The sample rate of the model is {} Hz and the target sample rate is {} Hz. The sample rate of the synthesized audio was converted successfully.".
                format(original_fs, target_fs))

        # transform volume
        wav_vol = wav_tar_fs * volume
        logger.info("Transform the volume of the audio successfully.")

        # transform speed
        try:  # windows does not support soxbindings
            wav_speed = change_speed(wav_vol, speed, target_fs)
            logger.info("Transform the speed of the audio successfully.")
        except ServerBaseException:
            raise ServerBaseException(
                ErrorCode.SERVER_INTERNAL_ERR,
                "Failed to transform speed. Can not install soxbindings on your system. \
                 You need to set speed value 1.0.")
        except BaseException:
            logger.error("Failed to transform speed.")
            wav_speed = wav_vol  # fall back to the unmodified audio so later steps do not fail

        # wav to base64
        buf = io.BytesIO()
        wavfile.write(buf, target_fs, wav_speed)
        base64_bytes = base64.b64encode(buf.read())
        wav_base64 = base64_bytes.decode('utf-8')
        logger.info("Audio to string successfully.")

        # save audio
        if audio_path is not None:
            if audio_path.endswith(".wav"):
                sf.write(audio_path, wav_speed, target_fs)
            elif audio_path.endswith(".pcm"):
                wav_norm = wav_speed * (32767 / max(0.001,
                                                    np.max(np.abs(wav_speed))))
                with open(audio_path, "wb") as f:
                    f.write(wav_norm.astype(np.int16))
            logger.info("Save audio to {} successfully.".format(audio_path))
        else:
            logger.info("There is no need to save audio.")

        return target_fs, wav_base64
    def run(self,
            sentence: str,
            spk_id: int=0,
            speed: float=1.0,
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None):
        """Run inference and postprocess.

        Args:
            sentence (str): text to be synthesized
            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
            speed (float, optional): speed. Defaults to 1.0.
            volume (float, optional): volume. Defaults to 1.0.
            sample_rate (int, optional): target sample rate for synthesized audio,
                0 means the same as the model sampling rate. Defaults to 0.
            save_path (str, optional): The save path of the synthesized audio.
                None means do not save audio. Defaults to None.

        Raises:
            ServerBaseException: Throws an exception if TTS inference fails.
            ServerBaseException: Throws an exception if postprocessing fails.

        Yields:
            wav_base64: base64-encoded int16 PCM chunks of the synthesized audio.
        """
        lang = self.config.lang
        wav_list = []

        for wav in self.executor.infer(
                text=sentence,
                lang=lang,
                am=self.config.am,
                spk_id=spk_id,
                am_block=self.am_block,
                am_pad=self.am_pad,
                voc_block=self.voc_block,
                voc_pad=self.voc_pad):
            # wav type: <class 'numpy.ndarray'> float32, convert to pcm (base64)
            wav = float2pcm(wav)  # float32 to int16
            wav_bytes = wav.tobytes()  # to bytes
            wav_base64 = base64.b64encode(wav_bytes).decode('utf8')  # to base64
            wav_list.append(wav)

            yield wav_base64

        wav_all = np.concatenate(wav_list, axis=0)
        logger.info("The duration of the audio is: {} s".format(
            len(wav_all) / self.executor.am_config.fs))

@ -15,6 +15,7 @@ import traceback
from typing import Union

from fastapi import APIRouter
from fastapi.responses import StreamingResponse

from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import get_engine_pool
@ -125,3 +126,14 @@ def tts(request_body: TTSRequest):
        traceback.print_exc()

    return response


@router.post("/paddlespeech/streaming/tts")
async def stream_tts(request_body: TTSRequest):
    text = request_body.text

    engine_pool = get_engine_pool()
    tts_engine = engine_pool['tts']
    logger.info("Get tts engine successfully.")

    return StreamingResponse(tts_engine.run(sentence=text))

@ -33,7 +33,8 @@ def tts_client(args):
        text: A sentence to be synthesized
        outfile: Synthetic audio file
    """
    url = "http://" + str(args.server) + ":" + str(
        args.port) + "/paddlespeech/tts"
    request = {
        "text": args.text,
        "spk_id": args.spk_id,
@ -72,7 +73,7 @@ if __name__ == "__main__":
    parser.add_argument(
        '--text',
        type=str,
        default="您好,欢迎使用语音合成服务。",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
@ -88,6 +89,9 @@ if __name__ == "__main__":
        type=str,
        default="./out.wav",
        help='Synthesized audio file')
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8090)
    args = parser.parse_args()

    st = time.time()

@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import json
import os
import time

import requests

from paddlespeech.server.utils.audio_process import pcm2wav


def save_audio(buffer, audio_path) -> bool:
    if audio_path.endswith("pcm"):
        with open(audio_path, "wb") as f:
            f.write(buffer)
    elif audio_path.endswith("wav"):
        with open("./tmp.pcm", "wb") as f:
            f.write(buffer)
        pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
        os.system("rm ./tmp.pcm")
    else:
        print("Only saving audio in pcm or wav format is supported.")
        return False
    return True


def test(args):
    params = {
        "text": args.text,
        "spk_id": args.spk_id,
        "speed": args.speed,
        "volume": args.volume,
        "sample_rate": args.sample_rate,
        "save_path": ''
    }

    buffer = b''
    flag = 1
    url = "http://" + str(args.server) + ":" + str(
        args.port) + "/paddlespeech/streaming/tts"
    st = time.time()
    html = requests.post(url, json.dumps(params), stream=True)
    for chunk in html.iter_content(chunk_size=1024):
        chunk = base64.b64decode(chunk)  # bytes
        if flag:
            first_response = time.time() - st
            print(f"First-packet latency: {first_response} s")
            flag = 0
        buffer += chunk

    final_response = time.time() - st
    duration = len(buffer) / 2.0 / 24000

    print(f"Final-packet latency: {final_response} s")
    print(f"Audio duration: {duration} s")
    print(f"RTF: {final_response / duration}")

    if args.save_path is not None:
        if save_audio(buffer, args.save_path):
            print("Audio saved to:", args.save_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text',
        type=str,
        default="您好,欢迎使用语音合成服务。",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
    parser.add_argument(
        '--volume', type=float, default=1.0, help='Audio volume')
    parser.add_argument(
        '--sample_rate',
        type=int,
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    parser.add_argument(
        "--save_path", type=str, help="save audio path", default=None)

    args = parser.parse_args()
    test(args)
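Assuming this client is saved as, say, test_http_client.py (the file name is not shown in this diff), it can be pointed at the server configured above with `python test_http_client.py --server 127.0.0.1 --port 8092 --save_path ./out.wav`. The hard-coded 24000 Hz in save_audio is presumably meant to match the 24 kHz csmsc models used by the default config.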

@ -0,0 +1,112 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import base64
import json
import threading
import time

import pyaudio
import requests

mutex = threading.Lock()
buffer = b''
p = pyaudio.PyAudio()
stream = p.open(
    format=p.get_format_from_width(2), channels=1, rate=24000, output=True)
max_fail = 50


def play_audio():
    global stream
    global buffer
    global max_fail
    while True:
        if not buffer:
            max_fail -= 1
            time.sleep(0.05)
            if max_fail < 0:
                break
        mutex.acquire()
        stream.write(buffer)
        buffer = b''
        mutex.release()


def test(args):
    global mutex
    global buffer
    params = {
        "text": args.text,
        "spk_id": args.spk_id,
        "speed": args.speed,
        "volume": args.volume,
        "sample_rate": args.sample_rate,
        "save_path": ''
    }

    all_bytes = 0.0
    t = threading.Thread(target=play_audio)
    flag = 1
    url = "http://" + str(args.server) + ":" + str(
        args.port) + "/paddlespeech/streaming/tts"
    st = time.time()
    html = requests.post(url, json.dumps(params), stream=True)
    for chunk in html.iter_content(chunk_size=1024):
        mutex.acquire()
        chunk = base64.b64decode(chunk)  # bytes
        buffer += chunk
        mutex.release()
        if flag:
            first_response = time.time() - st
            print(f"First-packet latency: {first_response} s")
            flag = 0
            t.start()
        all_bytes += len(chunk)

    final_response = time.time() - st
    duration = all_bytes / 2 / 24000

    print(f"Final-packet latency: {final_response} s")
    print(f"Audio duration: {duration} s")
    print(f"RTF: {final_response / duration}")

    t.join()
    stream.stop_stream()
    stream.close()
    p.terminate()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text',
        type=str,
        default="您好,欢迎使用语音合成服务。",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
    parser.add_argument(
        '--volume', type=float, default=1.0, help='Audio volume')
    parser.add_argument(
        '--sample_rate',
        type=int,
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)

    args = parser.parse_args()
    test(args)

@ -0,0 +1,126 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import _thread as thread
import argparse
import base64
import json
import ssl
import time

import websocket

flag = 1
st = 0.0
all_bytes = b''


class Ws_Param(object):
    # initialization
    def __init__(self, text, server="127.0.0.1", port=8090):
        self.server = server
        self.port = port
        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
        self.text = text

    # build the websocket url
    def create_url(self):
        return self.url


def on_message(ws, message):
    global flag
    global st
    global all_bytes

    try:
        message = json.loads(message)
        audio = message["audio"]
        audio = base64.b64decode(audio)  # bytes
        status = message["status"]
        all_bytes += audio

        if status == 0:
            print("create successfully.")
        elif status == 1:
            if flag:
                print(f"First-packet latency: {time.time() - st} s")
                flag = 0
        elif status == 2:
            final_response = time.time() - st
            duration = len(all_bytes) / 2.0 / 24000
            print(f"Final-packet latency: {final_response} s")
            print(f"Audio duration: {duration} s")
            print(f"RTF: {final_response / duration}")
            with open("./out.pcm", "wb") as f:
                f.write(all_bytes)
            print("ws is closed")
            ws.close()
        else:
            print("infer error")

    except Exception as e:
        print("Received a message, but parsing it raised an exception:", e)


# handle websocket errors
def on_error(ws, error):
    print("### error:", error)


# handle websocket close
def on_close(ws):
    print("### closed ###")


# handle the established websocket connection
def on_open(ws):
    def run(*args):
        global st
        text_base64 = str(
            base64.b64encode((wsParam.text).encode('utf-8')), "UTF8")
        d = {"text": text_base64}
        d = json.dumps(d)
        print("Start sending text data")
        st = time.time()
        ws.send(d)

    thread.start_new_thread(run, ())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        help="A sentence to be synthesized",
        default="您好,欢迎使用语音合成服务。")
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    args = parser.parse_args()

    print("***************************************")
    print("Server ip: ", args.server)
    print("Server port: ", args.port)
    print("Sentence to be synthesized: ", args.text)
    print("***************************************")

    wsParam = Ws_Param(text=args.text, server=args.server, port=args.port)

    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(
        wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})
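The ./out.pcm written by this client is headerless 16-bit mono audio; judging by the duration computation (bytes / 2.0 / 24000), it is assumed to be sampled at 24 kHz. A minimal sketch for turning it into a playable wav with soundfile:

import numpy as np
import soundfile as sf

with open("./out.pcm", "rb") as f:
    pcm = np.frombuffer(f.read(), dtype=np.int16)
sf.write("./out_ws.wav", pcm, 24000)   # 24 kHz mono, assumed to match the server's models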

@ -0,0 +1,160 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import _thread as thread
import argparse
import base64
import json
import ssl
import threading
import time

import pyaudio
import websocket

mutex = threading.Lock()
buffer = b''
p = pyaudio.PyAudio()
stream = p.open(
    format=p.get_format_from_width(2), channels=1, rate=24000, output=True)
flag = 1
st = 0.0
all_bytes = 0.0


class Ws_Param(object):
    # initialization
    def __init__(self, text, server="127.0.0.1", port=8090):
        self.server = server
        self.port = port
        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
        self.text = text

    # build the websocket url
    def create_url(self):
        return self.url


def play_audio():
    global stream
    global buffer
    while True:
        time.sleep(0.05)
        if not buffer:  # buffer is empty
            break
        mutex.acquire()
        stream.write(buffer)
        buffer = b''
        mutex.release()


t = threading.Thread(target=play_audio)


def on_message(ws, message):
    global flag
    global t
    global buffer
    global st
    global all_bytes

    try:
        message = json.loads(message)
        audio = message["audio"]
        audio = base64.b64decode(audio)  # bytes
        status = message["status"]
        all_bytes += len(audio)

        if status == 0:
            print("create successfully.")
        elif status == 1:
            mutex.acquire()
            buffer += audio
            mutex.release()
            if flag:
                print(f"First-packet latency: {time.time() - st} s")
                flag = 0
                print("Start playing audio")
                t.start()
        elif status == 2:
            final_response = time.time() - st
            duration = all_bytes / 2 / 24000
            print(f"Final-packet latency: {final_response} s")
            print(f"Audio duration: {duration} s")
            print(f"RTF: {final_response / duration}")
            print("ws is closed")
            ws.close()
        else:
            print("infer error")

    except Exception as e:
        print("Received a message, but parsing it raised an exception:", e)


# handle websocket errors
def on_error(ws, error):
    print("### error:", error)


# handle websocket close
def on_close(ws):
    print("### closed ###")


# handle the established websocket connection
def on_open(ws):
    def run(*args):
        global st
        text_base64 = str(
            base64.b64encode((wsParam.text).encode('utf-8')), "UTF8")
        d = {"text": text_base64}
        d = json.dumps(d)
        print("Start sending text data")
        st = time.time()
        ws.send(d)

    thread.start_new_thread(run, ())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        help="A sentence to be synthesized",
        default="您好,欢迎使用语音合成服务。")
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    args = parser.parse_args()

    print("***************************************")
    print("Server ip: ", args.server)
    print("Server port: ", args.port)
    print("Sentence to be synthesized: ", args.text)
    print("***************************************")

    wsParam = Ws_Param(text=args.text, server=args.server, port=args.port)

    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(
        wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

    t.join()
    print("End of playing audio")
    stream.stop_stream()
    stream.close()
    p.terminate()

@ -103,3 +103,26 @@ def change_speed(sample_raw, speed_rate, sample_rate):
        sample_rate_in=sample_rate).squeeze(-1).astype(np.float32).copy()
    return sample_speed
def float2pcm(sig, dtype='int16'):
    """Convert a floating point signal with a range from -1 to 1 to PCM.

    Args:
        sig (array): Input array, must have floating point type.
        dtype (str, optional): Desired (integer) data type. Defaults to 'int16'.

    Returns:
        numpy.ndarray: Integer data, scaled and clipped to the range of the given dtype.
    """
    sig = np.asarray(sig)
    if sig.dtype.kind != 'f':
        raise TypeError("'sig' must be a float array")
    dtype = np.dtype(dtype)
    if dtype.kind not in 'iu':
        raise TypeError("'dtype' must be an integer type")

    i = np.iinfo(dtype)
    abs_max = 2**(i.bits - 1)
    offset = i.min + abs_max
    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
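A quick sanity check of the scaling above with the default int16 dtype (abs_max = 32768, offset = 0):

import numpy as np

sig = np.array([0.0, 0.5, -1.0, 1.0], dtype=np.float32)
print(float2pcm(sig))   # [     0  16384 -32768  32767]  (1.0 is clipped to i.max)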

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
import base64
import math


def wav2base64(wav_file: str):
@ -31,3 +32,29 @@ def self_check():
""" self check resource """ self check resource
""" """
return True return True
def denorm(data, mean, std):
    return data * std + mean


def get_chunks(data, block_size, pad_size, step):
    if step == "am":
        data_len = data.shape[1]
    elif step == "voc":
        data_len = data.shape[0]
    else:
        print("Please set a correct type to get chunks, am or voc")

    chunks = []
    n = math.ceil(data_len / block_size)
    for i in range(n):
        start = max(0, i * block_size - pad_size)
        end = min((i + 1) * block_size + pad_size, data_len)
        if step == "am":
            chunks.append(data[:, start:end, :])
        elif step == "voc":
            chunks.append(data[start:end, :])
        else:
            print("Please set a correct type to get chunks, am or voc")

    return chunks
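A worked example of the chunking used by the streaming vocoder path: a mel of 35 frames with block_size=14 and pad_size=14 yields three overlapping chunks, and infer() later trims the padded regions back off the synthesized audio.

import numpy as np

mel = np.zeros((35, 80))   # (frames, mel bins), illustrative shape
chunks = get_chunks(mel, block_size=14, pad_size=14, step="voc")
print([c.shape[0] for c in chunks])   # [28, 35, 21] -> frame ranges [0:28], [0:35], [14:35]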

@ -16,6 +16,7 @@ from typing import List
from fastapi import APIRouter

from paddlespeech.server.ws.asr_socket import router as asr_router
from paddlespeech.server.ws.tts_socket import router as tts_router

_router = APIRouter()
@ -31,7 +32,7 @@ def setup_router(api_list: List):
        if api_name == 'asr':
            _router.include_router(asr_router)
        elif api_name == 'tts':
            _router.include_router(tts_router)
        else:
            pass

@ -0,0 +1,62 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json

from fastapi import APIRouter
from fastapi import WebSocket
from fastapi import WebSocketDisconnect
from starlette.websockets import WebSocketState as WebSocketState

from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import get_engine_pool

router = APIRouter()


@router.websocket('/ws/tts')
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()

    try:
        # careful here, changed the source code from starlette.websockets
        assert websocket.application_state == WebSocketState.CONNECTED
        message = await websocket.receive()
        websocket._raise_on_disconnect(message)

        # get engine
        engine_pool = get_engine_pool()
        tts_engine = engine_pool['tts']

        # parse the received message and decode it to text
        message = json.loads(message["text"])
        text_bese64 = message["text"]
        sentence = tts_engine.preprocess(text_bese64=text_bese64)

        # run
        wav = tts_engine.run(sentence)

        while True:
            try:
                tts_results = next(wav)
                resp = {"status": 1, "audio": tts_results}
                await websocket.send_json(resp)
                logger.info("streaming audio...")
            except StopIteration:
                resp = {"status": 2, "audio": ''}
                await websocket.send_json(resp)
                logger.info("Completed the transmission of the audio stream")
                break

    except WebSocketDisconnect:
        pass