From 7aa5a3df2bf5288d6d3a5fbd2a54810ede97129a Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Sat, 23 Apr 2022 00:16:12 +0800
Subject: [PATCH 1/5] fix the streaming asr server bug, server client, test=doc

---
 paddlespeech/server/utils/audio_handler.py | 86 ++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 paddlespeech/server/utils/audio_handler.py

diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
new file mode 100644
index 00000000..3a864a78
--- /dev/null
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -0,0 +1,86 @@
+import numpy as np
+import logging
+import argparse
+import asyncio
+import codecs
+import json
+import logging
+import os
+
+import numpy as np
+import soundfile
+import websockets
+from paddlespeech.cli.log import logger
+class ASRAudioHandler:
+    def __init__(self, url="127.0.0.1", port=8090):
+        self.url = url
+        self.port = port
+        self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
+
+    def read_wave(self, wavfile_path: str):
+        samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
+        x_len = len(samples)
+
+        chunk_size = 85 * 16  #80ms, sample_rate = 16kHz
+        if x_len % chunk_size!= 0:
+            padding_len_x = chunk_size - x_len % chunk_size
+        else:
+            padding_len_x = 0
+
+        padding = np.zeros((padding_len_x), dtype=samples.dtype)
+        padded_x = np.concatenate([samples, padding], axis=0)
+
+        assert (x_len + padding_len_x) % chunk_size == 0
+        num_chunk = (x_len + padding_len_x) / chunk_size
+        num_chunk = int(num_chunk)
+        for i in range(0, num_chunk):
+            start = i * chunk_size
+            end = start + chunk_size
+            x_chunk = padded_x[start:end]
+            yield x_chunk
+
+    async def run(self, wavfile_path: str):
+        logging.info("send a message to the server")
+        # self.read_wave()
+        # send websocket handshake protocal
+        async with websockets.connect(self.url) as ws:
+            # server has already received handshake protocal
+            # client start to send the command
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "start",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
+            await ws.send(audio_info)
+            msg = await ws.recv()
+            logger.info("receive msg={}".format(msg))
+
+            # send chunk audio data to engine
+            for chunk_data in self.read_wave(wavfile_path):
+                await ws.send(chunk_data.tobytes())
+                msg = await ws.recv()
+                msg = json.loads(msg)
+                logger.info("receive msg={}".format(msg))
+
+            # finished 
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "end",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
+            await ws.send(audio_info)
+            msg = await ws.recv()
+            
+            # decode the bytes to str
+            msg = json.loads(msg)
+            logger.info("final receive msg={}".format(msg))
+            result = msg
+            return result
\ No newline at end of file

From 2f2cb7eaaf6dd906b2868d696484712165cda21c Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Sat, 23 Apr 2022 00:37:05 +0800
Subject: [PATCH 2/5] update the audio handler note, test=doc

---
 paddlespeech/server/utils/audio_handler.py | 63 ++++++++++++++++------
 1 file changed, 48 insertions(+), 15 deletions(-)

diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py
index 3a864a78..dce7d09d 100644
--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -1,28 +1,53 @@
-import numpy as np
-import logging
-import argparse
-import asyncio
-import codecs
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import json
 import logging
-import os
 
 import numpy as np
 import soundfile
 import websockets
+
 from paddlespeech.cli.log import logger
+
+
 class ASRAudioHandler:
     def __init__(self, url="127.0.0.1", port=8090):
+        """PaddleSpeech Online ASR Server Client  audio handler
+           Online asr server use the websocket protocal
+        Args:
+            url (str, optional): the server ip. Defaults to "127.0.0.1".
+            port (int, optional): the server port. Defaults to 8090.
+        """
         self.url = url
         self.port = port
         self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
 
     def read_wave(self, wavfile_path: str):
+        """read the audio file from specific wavfile path
+
+        Args:
+            wavfile_path (str): the audio wavfile, 
+                                 we assume that audio sample rate matches the model
+
+        Yields:
+            numpy.array: the samall package audio pcm data
+        """
         samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
         x_len = len(samples)
 
         chunk_size = 85 * 16  #80ms, sample_rate = 16kHz
-        if x_len % chunk_size!= 0:
+        if x_len % chunk_size != 0:
             padding_len_x = chunk_size - x_len % chunk_size
         else:
             padding_len_x = 0
@@ -40,11 +65,19 @@ class ASRAudioHandler:
             yield x_chunk
 
     async def run(self, wavfile_path: str):
+        """Send a audio file to online server
+
+        Args:
+            wavfile_path (str): audio path
+
+        Returns:
+            str: the final asr result
+        """
         logging.info("send a message to the server")
-        # self.read_wave()
-        # send websocket handshake protocal
+
+        # 1. send websocket handshake protocal
         async with websockets.connect(self.url) as ws:
-            # server has already received handshake protocal
+            # 2. server has already received handshake protocal
             # client start to send the command
             audio_info = json.dumps(
                 {
@@ -59,14 +92,14 @@ class ASRAudioHandler:
             msg = await ws.recv()
             logger.info("receive msg={}".format(msg))
 
-            # send chunk audio data to engine
+            # 3. send chunk audio data to engine
             for chunk_data in self.read_wave(wavfile_path):
                 await ws.send(chunk_data.tobytes())
                 msg = await ws.recv()
                 msg = json.loads(msg)
                 logger.info("receive msg={}".format(msg))
 
-            # finished 
+            # 4. we must send finished signal to the server
             audio_info = json.dumps(
                 {
                     "name": "test.wav",
@@ -78,9 +111,9 @@ class ASRAudioHandler:
                 separators=(',', ': '))
             await ws.send(audio_info)
             msg = await ws.recv()
-            
-            # decode the bytes to str
+
+            # 5. decode the bytes to str
             msg = json.loads(msg)
             logger.info("final receive msg={}".format(msg))
             result = msg
-            return result
\ No newline at end of file
+            return result

From 13a37b48927406108494539902e361f034eebc03 Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Sat, 23 Apr 2022 01:02:43 +0800
Subject: [PATCH 3/5] update the online protocal note, test=doc

---
 paddlespeech/server/ws/asr_socket.py | 48 ++++++++++++++++------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py
index a865703d..10967f28 100644
--- a/paddlespeech/server/ws/asr_socket.py
+++ b/paddlespeech/server/ws/asr_socket.py
@@ -20,50 +20,52 @@ from starlette.websockets import WebSocketState as WebSocketState
 
 from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.engine.engine_pool import get_engine_pool
-from paddlespeech.server.utils.buffer import ChunkBuffer
-from paddlespeech.server.utils.vad import VADAudio
 
 router = APIRouter()
 
 
 @router.websocket('/ws/asr')
 async def websocket_endpoint(websocket: WebSocket):
+    """PaddleSpeech Online ASR Server api
+
+    Args:
+        websocket (WebSocket): the websocket instance
+    """
+
+    #1. the interface wait to accept the websocket protocal header
+    #   and only we receive the header, it establish the connection with specific thread
     await websocket.accept()
 
+    #2. if we accept the websocket headers, we will get the online asr engine instance
     engine_pool = get_engine_pool()
     asr_engine = engine_pool['asr']
-    connection_handler = None
-    # init buffer
-    # each websocekt connection has its own chunk buffer
-    chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
-    chunk_buffer = ChunkBuffer(
-        window_n=chunk_buffer_conf.window_n,
-        shift_n=chunk_buffer_conf.shift_n,
-        window_ms=chunk_buffer_conf.window_ms,
-        shift_ms=chunk_buffer_conf.shift_ms,
-        sample_rate=chunk_buffer_conf.sample_rate,
-        sample_width=chunk_buffer_conf.sample_width)
 
-    # init vad
-    vad_conf = asr_engine.config.get('vad_conf', None)
-    if vad_conf:
-        vad = VADAudio(
-            aggressiveness=vad_conf['aggressiveness'],
-            rate=vad_conf['sample_rate'],
-            frame_duration_ms=vad_conf['frame_duration_ms'])
+    #3. each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio
+    #   and each connection has its own connection instance to process the request
+    #   and only if client send the start signal, we create the PaddleASRConnectionHanddler instance
+    connection_handler = None
 
     try:
+        #4. we do a loop to process the audio package by package according the protocal
+        #   and only if the client send finished signal, we will break the loop
         while True:
             # careful here, changed the source code from starlette.websockets
+            # 4.1 we wait for the client signal for the specific action
             assert websocket.application_state == WebSocketState.CONNECTED
             message = await websocket.receive()
             websocket._raise_on_disconnect(message)
+
+            #4.2 text for the action command and bytes for pcm data
             if "text" in message:
+                # we first parse the specific command
                 message = json.loads(message["text"])
                 if 'signal' not in message:
                     resp = {"status": "ok", "message": "no valid json data"}
                     await websocket.send_json(resp)
 
+                # start command, we create the PaddleASRConnectionHanddler instance to process the audio data
+                # end command, we process the all the last audio pcm and return the final result
+                #              and we break the loop
                 if message['signal'] == 'start':
                     resp = {"status": "ok", "signal": "server_ready"}
                     # do something at begining here
@@ -72,6 +74,7 @@ async def websocket_endpoint(websocket: WebSocket):
                     await websocket.send_json(resp)
                 elif message['signal'] == 'end':
                     # reset single  engine for an new connection
+                    # and we will destroy the connection
                     connection_handler.decode(is_finished=True)
                     connection_handler.rescoring()
                     asr_results = connection_handler.get_result()
@@ -88,12 +91,17 @@ async def websocket_endpoint(websocket: WebSocket):
                     resp = {"status": "ok", "message": "no valid json data"}
                     await websocket.send_json(resp)
             elif "bytes" in message:
+                # bytes for the pcm data
                 message = message["bytes"]
 
+                # we extract the remained audio pcm 
+                # and decode for the result in this package data
                 connection_handler.extract_feat(message)
                 connection_handler.decode(is_finished=False)
                 asr_results = connection_handler.get_result()
 
+                # return the current period result
+                # if the engine create the vad instance, this connection will have many period results 
                 resp = {'asr_results': asr_results}
                 await websocket.send_json(resp)
     except WebSocketDisconnect:

From 429965ca0efeb92142ca1071fbf29add41e63f95 Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Sat, 23 Apr 2022 11:47:04 +0800
Subject: [PATCH 4/5] update the streaming asr server readme, test=doc

---
 demos/README.md                         | 1 +
 demos/README_cn.md                      | 1 +
 demos/streaming_asr_server/README.md    | 4 ++--
 demos/streaming_asr_server/README_cn.md | 4 ++--
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/demos/README.md b/demos/README.md
index 84f4de41..021e4750 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -11,6 +11,7 @@ The directory containes many speech applications in multi scenarios.
 * punctuation_restoration - restore punctuation from raw text
 * speech recogintion - recognize text of an audio file 
 * speech server - Server for Speech Task, e.g. ASR,TTS,CLS
+* streaming asr server - input the audio streaming to recognize the text
 * speech translation - end to end speech translation  
 * story talker - book reader based on OCR and TTS  
 * style_fs2 - multi style control for FastSpeech2 model  
diff --git a/demos/README_cn.md b/demos/README_cn.md
index 692b8468..47134212 100644
--- a/demos/README_cn.md
+++ b/demos/README_cn.md
@@ -11,6 +11,7 @@
 * 标点恢复 - 通常作为语音识别的文本后处理任务，为一段无标点的纯文本添加相应的标点符号。
 * 语音识别 - 识别一段音频中包含的语音文字。
 * 语音服务 - 离线语音服务，包括ASR、TTS、CLS等
+* 流式语音识别服务 - 流式输入语音数据流识别音频中的文字
 * 语音翻译 - 实时识别音频中的语言，并同时翻译成目标语言。
 * 会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书。
 * 个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成。 
diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 0eed8e56..6a2f21aa 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -40,8 +40,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
   paddlespeech_server start --help
   ```
   Arguments:
-  - `config_file`: yaml file of the app, defalut: ./conf/ws_conformer_application.yaml
-  - `log_file`: log file. Default: ./log/paddlespeech.log
+  - `config_file`: yaml file of the app, defalut: `./conf/application.yaml`
+  - `log_file`: log file. Default: `./log/paddlespeech.log`
 
   Output:
   ```bash
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index bf122bb3..9224206b 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -40,8 +40,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
   paddlespeech_server start --help
   ```
   参数:
-  - `config_file`: 服务的配置文件，默认： ./conf/ws_conformer_application.yaml
-  - `log_file`: log 文件. 默认：./log/paddlespeech.log
+  - `config_file`: 服务的配置文件，默认： `./conf/application.yaml`
+  - `log_file`: log 文件. 默认：`./log/paddlespeech.log`
 
   输出:
   ```bash

From 9760d70b327c7132ff0ad2b294b758e61a7cd061 Mon Sep 17 00:00:00 2001
From: xiongxinlei <xiongxinlei@baidu.com>
Date: Sat, 23 Apr 2022 13:25:17 +0800
Subject: [PATCH 5/5] update the streaming asr english note, test=doc

---
 demos/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/demos/README.md b/demos/README.md
index 021e4750..8abd6724 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -11,7 +11,7 @@ The directory containes many speech applications in multi scenarios.
 * punctuation_restoration - restore punctuation from raw text
 * speech recogintion - recognize text of an audio file 
 * speech server - Server for Speech Task, e.g. ASR,TTS,CLS
-* streaming asr server - input the audio streaming to recognize the text
+* streaming asr server - receive audio stream from websocket, and recognize to transcript.
 * speech translation - end to end speech translation  
 * story talker - book reader based on OCR and TTS  
 * style_fs2 - multi style control for FastSpeech2 model