Merge pull request #1627 from WilliamZhang06/ws-develop

[websocket] added online asr engine
r0.2
Hui Zhang 2 years ago committed by GitHub
commit 61941d14b0

@ -208,6 +208,18 @@ class AudioSegment():
io.BytesIO(bytes), dtype='float32')
return cls(samples, sample_rate)
@classmethod
def from_pcm(cls, samples, sample_rate):
"""Create audio segment from a byte string containing audio samples.
:param samples: Audio samples [num_samples x num_channels].
:type samples: numpy.ndarray
:param sample_rate: Audio sample rate.
:type sample_rate: int
:return: Audio segment instance.
:rtype: AudioSegment
"""
return cls(samples, sample_rate)
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of audio segments together.

@ -107,6 +107,22 @@ class SpeechSegment(AudioSegment):
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
@classmethod
def from_pcm(cls, samples, sample_rate, transcript, tokens=None, token_ids=None):
"""Create speech segment from pcm on online mode
Args:
samples (numpy.ndarray): Audio samples [num_samples x num_channels].
sample_rate (int): Audio sample rate.
transcript (str): Transcript text for the speech.
tokens (List[str], optional): text tokens. Defaults to None.
token_ids (List[int], optional): text token ids. Defaults to None.
Returns:
SpeechSegment: Speech segment instance.
"""
audio = AudioSegment.from_pcm(samples, sample_rate)
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of speech segments together, both

@ -17,7 +17,8 @@ import uvicorn
from fastapi import FastAPI
from paddlespeech.server.engine.engine_pool import init_engine_pool
from paddlespeech.server.restful.api import setup_router
from paddlespeech.server.restful.api import setup_router as setup_http_router
from paddlespeech.server.ws.api import setup_router as setup_ws_router
from paddlespeech.server.utils.config import get_config
app = FastAPI(
@ -35,7 +36,12 @@ def init(config):
"""
# init api
api_list = list(engine.split("_")[0] for engine in config.engine_list)
api_router = setup_router(api_list)
if config.protocol == "websocket":
api_router = setup_ws_router(api_list)
elif config.protocol == "http":
api_router = setup_http_router(api_list)
else:
raise Exception("unsupported protocol")
app.include_router(api_router)
if not init_engine_pool(config):
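A sketch of how the new branch is driven (the YAML path is a placeholder): the protocol field of the loaded server config selects which router gets mounted.

from paddlespeech.server.utils.config import get_config

config = get_config("./conf/ws_application.yaml")   # placeholder path
# config.protocol == 'websocket' -> setup_ws_router(api_list)
# config.protocol == 'http'      -> setup_http_router(api_list)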

@ -8,7 +8,9 @@ port: 8090
# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
# protocol = ['websocket', 'http'] (only one can be selected).
# http only supports offline engine types.
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python']
@ -48,6 +50,24 @@ asr_inference:
summary: True # False -> do not show predictor config
################### speech task: asr; engine_type: online #######################
asr_online:
model_type: 'deepspeech2online_aishell'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
sample_rate: 16000
cfg_path:
decode_method:
force_yes: True
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:

@ -0,0 +1,51 @@
# This is the parameter configuration file for PaddleSpeech Serving.
#################################################################################
# SERVER SETTING #
#################################################################################
host: 0.0.0.0
port: 8091
# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_online', 'tts_online']
# protocol = ['websocket', 'http'] (only one can be selected).
# websocket only supports online engine types.
protocol: 'websocket'
engine_list: ['asr_online']
#################################################################################
# ENGINE CONFIG #
#################################################################################
################################### ASR #########################################
################### speech task: asr; engine_type: online #######################
asr_online:
model_type: 'deepspeech2online_aishell'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
sample_rate: 16000
cfg_path:
decode_method:
force_yes: True
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
chunk_buffer_conf:
frame_duration_ms: 80
shift_ms: 40
sample_rate: 16000
sample_width: 2
vad_conf:
aggressiveness: 2
sample_rate: 16000
frame_duration_ms: 20
sample_width: 2
padding_ms: 200
padding_ratio: 0.9

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,355 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import os
import time
from typing import Optional
import pickle
import numpy as np
from numpy import float32
import soundfile
import paddle
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.speech import SpeechSegment
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.config import get_config
from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model
__all__ = ['ASREngine']
pretrained_models = {
"deepspeech2online_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz',
'md5':
'd5e076217cf60486519f72c217d21b9b',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2_online/checkpoints/avg_1',
'model':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
def _init_from_path(self,
model_type: str='wenetspeech',
am_model: Optional[os.PathLike]=None,
am_params: Optional[os.PathLike]=None,
lang: str='zh',
sample_rate: int=16000,
cfg_path: Optional[os.PathLike]=None,
decode_method: str='attention_rescoring',
am_predictor_conf: dict=None):
"""
Init model and other resources from a specific path.
"""
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
tag = model_type + '-' + lang + '-' + sample_rate_str
# the tag is needed below for the LM download even when explicit paths are given
if cfg_path is None or am_model is None or am_params is None:
res_path = self._get_pretrained_path(tag)
self.res_path = res_path
self.cfg_path = os.path.join(res_path,
pretrained_models[tag]['cfg_path'])
self.am_model = os.path.join(res_path,
pretrained_models[tag]['model'])
self.am_params = os.path.join(res_path,
pretrained_models[tag]['params'])
logger.info(res_path)
logger.info(self.cfg_path)
logger.info(self.am_model)
logger.info(self.am_params)
else:
self.cfg_path = os.path.abspath(cfg_path)
self.am_model = os.path.abspath(am_model)
self.am_params = os.path.abspath(am_params)
self.res_path = os.path.dirname(
os.path.dirname(os.path.abspath(self.cfg_path)))
# Init body.
self.config = CfgNode(new_allowed=True)
self.config.merge_from_file(self.cfg_path)
with UpdateConfig(self.config):
if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
from paddlespeech.s2t.io.collator import SpeechCollator
self.vocab = self.config.vocab_filepath
self.config.decode.lang_model_path = os.path.join(
MODEL_HOME, 'language_model',
self.config.decode.lang_model_path)
self.collate_fn_test = SpeechCollator.from_config(self.config)
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)
lm_url = pretrained_models[tag]['lm_url']
lm_md5 = pretrained_models[tag]['lm_md5']
self.download_lm(
lm_url,
os.path.dirname(self.config.decode.lang_model_path), lm_md5)
elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
raise Exception("wrong type")
else:
raise Exception("wrong type")
# AM predictor
self.am_predictor_conf = am_predictor_conf
self.am_predictor = init_predictor(
model_file=self.am_model,
params_file=self.am_params,
predictor_conf=self.am_predictor_conf)
# decoder
self.decoder = CTCDecoder(
odim=self.config.output_dim, # <blank> is in vocab
enc_n_units=self.config.rnn_layer_size * 2,
blank_id=self.config.blank_id,
dropout_rate=0.0,
reduction=True, # sum
batch_average=True, # sum / batch_size
grad_norm_type=self.config.get('ctc_grad_norm_type', None))
# init decoder
cfg = self.config.decode
decode_batch_size = 1 # for online
self.decoder.init_decoder(
decode_batch_size, self.text_feature.vocab_list,
cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
cfg.num_proc_bsearch)
# init state box
self.chunk_state_h_box = np.zeros(
(self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
dtype=float32)
self.chunk_state_c_box = np.zeros(
(self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
dtype=float32)
def reset_decoder_and_chunk(self):
"""reset decoder and chunk state for an new audio
"""
self.decoder.reset_decoder(batch_size=1)
# init state box, for new audio request
self.chunk_state_h_box = np.zeros(
(self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
dtype=float32)
self.chunk_state_c_box = np.zeros(
(self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
dtype=float32)
def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str):
"""decode one chunk
Args:
x_chunk (numpy.array): shape[B, T, D]
x_chunk_lens (numpy.array): shape[B]
model_type (str): online model type
Returns:
str: the best transcription result for this chunk
"""
if "deepspeech2online" in model_type :
input_names = self.am_predictor.get_input_names()
audio_handle = self.am_predictor.get_input_handle(input_names[0])
audio_len_handle = self.am_predictor.get_input_handle(input_names[1])
h_box_handle = self.am_predictor.get_input_handle(input_names[2])
c_box_handle = self.am_predictor.get_input_handle(input_names[3])
audio_handle.reshape(x_chunk.shape)
audio_handle.copy_from_cpu(x_chunk)
audio_len_handle.reshape(x_chunk_lens.shape)
audio_len_handle.copy_from_cpu(x_chunk_lens)
h_box_handle.reshape(self.chunk_state_h_box.shape)
h_box_handle.copy_from_cpu(self.chunk_state_h_box)
c_box_handle.reshape(self.chunk_state_c_box.shape)
c_box_handle.copy_from_cpu(self.chunk_state_c_box)
output_names = self.am_predictor.get_output_names()
output_handle = self.am_predictor.get_output_handle(output_names[0])
output_lens_handle = self.am_predictor.get_output_handle(output_names[1])
output_state_h_handle = self.am_predictor.get_output_handle(
output_names[2])
output_state_c_handle = self.am_predictor.get_output_handle(
output_names[3])
self.am_predictor.run()
output_chunk_probs = output_handle.copy_to_cpu()
output_chunk_lens = output_lens_handle.copy_to_cpu()
self.chunk_state_h_box = output_state_h_handle.copy_to_cpu()
self.chunk_state_c_box = output_state_c_handle.copy_to_cpu()
self.decoder.next(output_chunk_probs, output_chunk_lens)
trans_best, trans_beam = self.decoder.decode()
return trans_best[0]
elif "conformer" in model_type or "transformer" in model_type:
raise Exception("invalid model name")
else:
raise Exception("invalid model name")
def _pcm16to32(self, audio):
"""pcm int16 to float32
Args:
audio(numpy.array): numpy.int16
Returns:
audio(numpy.array): numpy.float32
"""
if audio.dtype == np.int16:
audio = audio.astype("float32")
bits = np.iinfo(np.int16).bits
audio = audio / (2**(bits - 1))
return audio
def extract_feat(self, samples, sample_rate):
"""extract feat
Args:
samples (numpy.array): numpy.float32
sample_rate (int): sample rate
Returns:
x_chunk (numpy.array): shape[B, T, D]
x_chunk_lens (numpy.array): shape[B]
"""
# int16 PCM -> float32 PCM
samples = self._pcm16to32(samples)
# wrap the raw samples; the transcript is a placeholder in online mode
speech_segment = SpeechSegment.from_pcm(
samples, sample_rate, transcript=" ")
# audio augment
self.collate_fn_test.augmentation.transform_audio(speech_segment)
# extract speech feature
spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize(
speech_segment, self.collate_fn_test.keep_transcription_text)
# CMVN spectrum
if self.collate_fn_test._normalizer:
spectrum = self.collate_fn_test._normalizer.apply(spectrum)
# spectrum augment
audio = self.collate_fn_test.augmentation.transform_feature(spectrum)
audio_len = audio.shape[0]
audio = paddle.to_tensor(audio, dtype='float32')
# audio_len = paddle.to_tensor(audio_len)
audio = paddle.unsqueeze(audio, axis=0)
x_chunk = audio.numpy()
x_chunk_lens = np.array([audio_len])
return x_chunk, x_chunk_lens
class ASREngine(BaseEngine):
"""ASR server engine
Args:
metaclass: Defaults to Singleton.
"""
def __init__(self):
super(ASREngine, self).__init__()
def init(self, config: dict) -> bool:
"""init engine resource
Args:
config_file (str): config file
Returns:
bool: init failed or success
"""
self.input = None
self.output = ""
self.executor = ASRServerExecutor()
self.config = config
self.executor._init_from_path(
model_type=self.config.model_type,
am_model=self.config.am_model,
am_params=self.config.am_params,
lang=self.config.lang,
sample_rate=self.config.sample_rate,
cfg_path=self.config.cfg_path,
decode_method=self.config.decode_method,
am_predictor_conf=self.config.am_predictor_conf)
logger.info("Initialize ASR server engine successfully.")
return True
def preprocess(self, samples, sample_rate):
"""preprocess
Args:
samples (numpy.array): numpy.float32
sample_rate (int): sample rate
Returns:
x_chunk (numpy.array): shape[B, T, D]
x_chunk_lens (numpy.array): shape[B]
"""
x_chunk, x_chunk_lens = self.executor.extract_feat(samples, sample_rate)
return x_chunk, x_chunk_lens
def run(self, x_chunk, x_chunk_lens, decoder_chunk_size=1):
"""run online engine
Args:
x_chunk (numpy.array): shape[B, T, D]
x_chunk_lens (numpy.array): shape[B]
decoder_chunk_size (int): decoding chunk size. Defaults to 1.
"""
self.output = self.executor.decode_one_chunk(x_chunk, x_chunk_lens, self.config.model_type)
def postprocess(self):
"""postprocess
"""
return self.output
def reset(self):
"""reset engine decoder and inference state
"""
self.executor.reset_decoder_and_chunk()
self.output = ""

@ -25,6 +25,9 @@ class EngineFactory(object):
elif engine_name == 'asr' and engine_type == 'python':
from paddlespeech.server.engine.asr.python.asr_engine import ASREngine
return ASREngine()
elif engine_name == 'asr' and engine_type == 'online':
from paddlespeech.server.engine.asr.online.asr_engine import ASREngine
return ASREngine()
elif engine_name == 'tts' and engine_type == 'inference':
from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine
return TTSEngine()

@ -0,0 +1,161 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Record audio from the microphone and stream it to the ASR websocket server.
"""
import asyncio
import json
import logging
import threading
import wave
from signal import SIGINT
from signal import SIGTERM
import pyaudio
import websockets
class ASRAudioHandler(threading.Thread):
def __init__(self, url="127.0.0.1", port=8091):
threading.Thread.__init__(self)
self.port = port
self.url = "ws://" + url + ":" + str(port) + "/ws/asr"
self.fileName = "./output.wav"
self.chunk = 5120
self.format = pyaudio.paInt16
self.channels = 1
self.rate = 16000
self._running = True
self._frames = []
self.data_backup = []
def startrecord(self):
"""
start a new thread to record from the microphone
"""
threading.Thread(target=self.recording, daemon=True).start()
def recording(self):
"""
recording wave
"""
self._running = True
self._frames = []
p = pyaudio.PyAudio()
stream = p.open(
format=self.format,
channels=self.channels,
rate=self.rate,
input=True,
frames_per_buffer=self.chunk)
while (self._running):
data = stream.read(self.chunk)
self._frames.append(data)
self.data_backup.append(data)
stream.stop_stream()
stream.close()
p.terminate()
def save(self):
"""
save wave data
"""
p = pyaudio.PyAudio()
wf = wave.open(self.fileName, 'wb')
wf.setnchannels(self.channels)
wf.setsampwidth(p.get_sample_size(self.format))
wf.setframerate(self.rate)
wf.writeframes(b''.join(self.data_backup))
wf.close()
p.terminate()
def stoprecord(self):
"""
stop recording
"""
self._running = False
async def run(self):
answer = input("Start recording? (y/n) ")
if answer.strip() == "y":
self.startrecord()
logging.info("*" * 10 + " recording started, please speak")
async with websockets.connect(self.url) as ws:
# send the start signal
audio_info = json.dumps(
{
"name": "test.wav",
"signal": "start",
"nbest": 5
},
sort_keys=True,
indent=4,
separators=(',', ': '))
await ws.send(audio_info)
msg = await ws.recv()
logging.info("receive msg={}".format(msg))
# send bytes data
logging.info("结束录音请: Ctrl + c。继续请按回车。")
try:
while True:
while len(self._frames) > 0:
await ws.send(self._frames.pop(0))
msg = await ws.recv()
logging.info("receive msg={}".format(msg))
except asyncio.CancelledError:
# quit
# send finished
audio_info = json.dumps(
{
"name": "test.wav",
"signal": "end",
"nbest": 5
},
sort_keys=True,
indent=4,
separators=(',', ': '))
await ws.send(audio_info)
msg = await ws.recv()
logging.info("receive msg={}".format(msg))
self.stoprecord()
logging.info("*" * 10 + " recording finished")
self.save()
elif answer.strip() == "n":
exit()
else:
print("Invalid input!")
exit()
exit()
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
logging.info("asr websocket client start")
handler = ASRAudioHandler("127.0.0.1", 8091)
loop = asyncio.get_event_loop()
main_task = asyncio.ensure_future(handler.run())
for signal in [SIGINT, SIGTERM]:
loop.add_signal_handler(signal, main_task.cancel)
try:
loop.run_until_complete(main_task)
finally:
loop.close()
logging.info("asr websocket client finished")

@ -0,0 +1,115 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import argparse
import asyncio
import json
import logging
import numpy as np
import soundfile
import websockets
class ASRAudioHandler:
def __init__(self, url="127.0.0.1", port=8090):
self.port = port
self.url = "ws://" + url + ":" + str(port) + "/ws/asr"
def read_wave(self, wavfile_path: str):
samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
x_len = len(samples)
chunk_stride = 40 * 16  # 40 ms stride at a 16 kHz sample rate
chunk_size = 80 * 16  # 80 ms chunk at a 16 kHz sample rate
if (x_len - chunk_size) % chunk_stride != 0:
padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride
else:
padding_len_x = 0
padding = np.zeros((padding_len_x), dtype=samples.dtype)
padded_x = np.concatenate([samples, padding], axis=0)
num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
num_chunk = int(num_chunk)
for i in range(0, num_chunk):
start = i * chunk_stride
end = start + chunk_size
x_chunk = padded_x[start:end]
yield x_chunk
async def run(self, wavfile_path: str):
logging.info("send a message to the server")
# connecting performs the websocket handshake with the server
async with websockets.connect(self.url) as ws:
# the server has accepted the handshake; send the start signal
audio_info = json.dumps(
{
"name": "test.wav",
"signal": "start",
"nbest": 5
},
sort_keys=True,
indent=4,
separators=(',', ': '))
await ws.send(audio_info)
msg = await ws.recv()
logging.info("receive msg={}".format(msg))
# send chunk audio data to engine
for chunk_data in self.read_wave(wavfile_path):
await ws.send(chunk_data.tobytes())
msg = await ws.recv()
logging.info("receive msg={}".format(msg))
# finished
audio_info = json.dumps(
{
"name": "test.wav",
"signal": "end",
"nbest": 5
},
sort_keys=True,
indent=4,
separators=(',', ': '))
await ws.send(audio_info)
msg = await ws.recv()
logging.info("receive msg={}".format(msg))
def main(args):
logging.basicConfig(level=logging.INFO)
logging.info("asr websocket client start")
handler = ASRAudioHandler("127.0.0.1", 8091)
loop = asyncio.get_event_loop()
loop.run_until_complete(handler.run(args.wavfile))
logging.info("asr websocket client finished")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--wavfile",
action="store",
help="wav file path ",
default="./16_audio.wav")
args = parser.parse_args()
main(args)
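A quick check of read_wave's padding arithmetic under assumed numbers:

x_len, chunk_size, chunk_stride = 5000, 1280, 640   # hypothetical wav of 5000 samples
pad = (chunk_stride - (x_len - chunk_size) % chunk_stride) % chunk_stride   # 120 samples
num_chunk = (x_len + pad - chunk_size) // chunk_stride + 1                  # 7 chunks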

@ -0,0 +1,59 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class Frame(object):
"""Represents a "frame" of audio data."""
def __init__(self, bytes, timestamp, duration):
self.bytes = bytes
self.timestamp = timestamp
self.duration = duration
class ChunkBuffer(object):
def __init__(self,
frame_duration_ms=80,
shift_ms=40,
sample_rate=16000,
sample_width=2):
self.sample_rate = sample_rate
self.frame_duration_ms = frame_duration_ms
self.shift_ms = shift_ms
self.remained_audio = b''
self.sample_width = sample_width # int16 = 2; float32 = 4
def frame_generator(self, audio):
"""Generates audio frames from PCM audio data.
Takes the desired frame duration in milliseconds, the PCM data, and
the sample rate.
Yields Frames of the requested duration.
"""
audio = self.remained_audio + audio
self.remained_audio = b''
n = int(self.sample_rate *
(self.frame_duration_ms / 1000.0) * self.sample_width)
shift_n = int(self.sample_rate *
(self.shift_ms / 1000.0) * self.sample_width)
offset = 0
timestamp = 0.0
duration = (float(n) / self.sample_rate) / self.sample_width
shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width
while offset + n <= len(audio):
yield Frame(audio[offset:offset + n], timestamp, duration)
timestamp += shift_duration
offset += shift_n
self.remained_audio += audio[offset:]
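A short usage sketch with the defaults above; leftover bytes are carried over to the next call:

buf = ChunkBuffer(frame_duration_ms=80, shift_ms=40, sample_rate=16000, sample_width=2)
frames = list(buf.frame_generator(b"\x00" * 4000))   # hypothetical 4000 bytes of silence
# two frames: windows [0, 2560) and [1280, 3840) both fit within 4000 bytes
# the trailing 1440 bytes stay in buf.remained_audio for the next call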

@ -0,0 +1,78 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import webrtcvad
class VADAudio():
def __init__(self,
aggressiveness=2,
rate=16000,
frame_duration_ms=20,
sample_width=2,
padding_ms=200,
padding_ratio=0.9):
"""Initializes VAD with given aggressivenes and sets up internal queues"""
self.vad = webrtcvad.Vad(aggressiveness)
self.rate = rate
self.sample_width = sample_width
self.frame_duration_ms = frame_duration_ms
self._frame_length = int(rate * (frame_duration_ms / 1000.0) *
self.sample_width)
self._buffer_queue = collections.deque()
self.ring_buffer = collections.deque(maxlen=padding_ms //
frame_duration_ms)
self._ratio = padding_ratio
self.triggered = False
def add_audio(self, audio):
"""Adds new audio to internal queue"""
for x in audio:
self._buffer_queue.append(x)
def frame_generator(self):
"""Generator that yields audio frames of frame_duration_ms"""
while len(self._buffer_queue) >= self._frame_length:
frame = bytearray()
for _ in range(self._frame_length):
frame.append(self._buffer_queue.popleft())
yield bytes(frame)
def vad_collector(self):
"""Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
|---utterence---| |---utterence---|
"""
for frame in self.frame_generator():
is_speech = self.vad.is_speech(frame, self.rate)
if not self.triggered:
self.ring_buffer.append((frame, is_speech))
num_voiced = len(
[f for f, speech in self.ring_buffer if speech])
if num_voiced > self._ratio * self.ring_buffer.maxlen:
self.triggered = True
for f, s in self.ring_buffer:
yield f
self.ring_buffer.clear()
else:
yield frame
self.ring_buffer.append((frame, is_speech))
num_unvoiced = len(
[f for f, speech in self.ring_buffer if not speech])
if num_unvoiced > self._ratio * self.ring_buffer.maxlen:
self.triggered = False
yield None
self.ring_buffer.clear()
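A short driving sketch matching how the websocket handler below uses it; note webrtcvad only accepts 10/20/30 ms frames, hence frame_duration_ms=20:

vad = VADAudio(aggressiveness=2, rate=16000, frame_duration_ms=20)
vad.add_audio(pcm_bytes)   # pcm_bytes: hypothetical int16 mono audio bytes
voiced = b''.join(f for f in vad.vad_collector() if f is not None)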

@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

@ -0,0 +1,38 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from fastapi import APIRouter
from paddlespeech.server.ws.asr_socket import router as asr_router
_router = APIRouter()
def setup_router(api_list: List):
"""setup router for fastapi
Args:
api_list (List): [asr, tts]
Returns:
APIRouter
"""
for api_name in api_list:
if api_name == 'asr':
_router.include_router(asr_router)
elif api_name == 'tts':
pass
else:
pass
return _router

@ -0,0 +1,100 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import numpy as np
from fastapi import APIRouter
from fastapi import WebSocket
from fastapi import WebSocketDisconnect
from starlette.websockets import WebSocketState as WebSocketState
from paddlespeech.server.engine.engine_pool import get_engine_pool
from paddlespeech.server.utils.buffer import ChunkBuffer
from paddlespeech.server.utils.vad import VADAudio
router = APIRouter()
@router.websocket('/ws/asr')
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
engine_pool = get_engine_pool()
asr_engine = engine_pool['asr']
# init buffer
chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
chunk_buffer = ChunkBuffer(
frame_duration_ms=chunk_buffer_conf['frame_duration_ms'],
shift_ms=chunk_buffer_conf['shift_ms'],
sample_rate=chunk_buffer_conf['sample_rate'],
sample_width=chunk_buffer_conf['sample_width'])
# init vad
vad_conf = asr_engine.config.vad_conf
vad = VADAudio(
aggressiveness=vad_conf['aggressiveness'],
rate=vad_conf['sample_rate'],
frame_duration_ms=vad_conf['frame_duration_ms'])
try:
while True:
# note: mirrors starlette's WebSocket.receive_text internals so that
# text and bytes messages can both be handled on one socket
assert websocket.application_state == WebSocketState.CONNECTED
message = await websocket.receive()
websocket._raise_on_disconnect(message)
if "text" in message:
message = json.loads(message["text"])
if 'signal' not in message:
resp = {"status": "ok", "message": "no valid json data"}
await websocket.send_json(resp)
continue
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
# do something at the beginning here
await websocket.send_json(resp)
elif message['signal'] == 'end':
engine_pool = get_engine_pool()
asr_engine = engine_pool['asr']
# reset the engine for a new connection
asr_engine.reset()
resp = {"status": "ok", "signal": "finished"}
await websocket.send_json(resp)
break
else:
resp = {"status": "ok", "message": "no valid json data"}
await websocket.send_json(resp)
elif "bytes" in message:
message = message["bytes"]
# vad for input bytes audio
vad.add_audio(message)
message = b''.join(f for f in vad.vad_collector()
if f is not None)
engine_pool = get_engine_pool()
asr_engine = engine_pool['asr']
asr_results = ""
frames = chunk_buffer.frame_generator(message)
for frame in frames:
samples = np.frombuffer(frame.bytes, dtype=np.int16)
sample_rate = asr_engine.config.sample_rate
x_chunk, x_chunk_lens = asr_engine.preprocess(samples,
sample_rate)
asr_engine.run(x_chunk, x_chunk_lens)
asr_results = asr_engine.postprocess()
resp = {'asr_results': asr_results}
await websocket.send_json(resp)
except WebSocketDisconnect:
pass
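For reference, the message flow this endpoint implements, as the two clients in this PR exercise it:

# text  {"name": ..., "signal": "start", "nbest": 5}  -> {"status": "ok", "signal": "server_ready"}
# bytes <int16 PCM chunk>                             -> {"asr_results": "<partial text>"}   (repeated)
# text  {"name": ..., "signal": "end", "nbest": 5}    -> {"status": "ok", "signal": "finished"}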