From 7be6b0e8cf46315ba09c7ca7d49247ab4862d9b6 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 7 May 2022 07:08:42 +0000 Subject: [PATCH 1/3] unify name style & frame with abs timestamp --- paddlespeech/server/bin/main.py | 77 --------------- paddlespeech/server/restful/api.py | 10 +- paddlespeech/server/utils/audio_handler.py | 10 +- paddlespeech/server/utils/audio_process.py | 2 +- paddlespeech/server/utils/buffer.py | 21 ++-- paddlespeech/server/ws/api.py | 4 +- paddlespeech/server/ws/asr_socket.py | 110 --------------------- paddlespeech/server/ws/tts_socket.py | 61 ------------ 8 files changed, 31 insertions(+), 264 deletions(-) delete mode 100644 paddlespeech/server/bin/main.py delete mode 100644 paddlespeech/server/ws/asr_socket.py delete mode 100644 paddlespeech/server/ws/tts_socket.py diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py deleted file mode 100644 index 81824c85..00000000 --- a/paddlespeech/server/bin/main.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - -import uvicorn -from fastapi import FastAPI - -from paddlespeech.server.engine.engine_pool import init_engine_pool -from paddlespeech.server.restful.api import setup_router as setup_http_router -from paddlespeech.server.utils.config import get_config -from paddlespeech.server.ws.api import setup_router as setup_ws_router - -app = FastAPI( - title="PaddleSpeech Serving API", description="Api", version="0.0.1") - - -def init(config): - """system initialization - - Args: - config (CfgNode): config object - - Returns: - bool: - """ - # init api - api_list = list(engine.split("_")[0] for engine in config.engine_list) - if config.protocol == "websocket": - api_router = setup_ws_router(api_list) - elif config.protocol == "http": - api_router = setup_http_router(api_list) - else: - raise Exception("unsupported protocol") - app.include_router(api_router) - - if not init_engine_pool(config): - return False - - return True - - -def main(args): - """main function""" - - config = get_config(args.config_file) - - if init(config): - uvicorn.run(app, host=config.host, port=config.port, debug=True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--config_file", - action="store", - help="yaml file of the app", - default="./conf/application.yaml") - - parser.add_argument( - "--log_file", - action="store", - help="log file", - default="./log/paddlespeech.log") - args = parser.parse_args() - - main(args) diff --git a/paddlespeech/server/restful/api.py b/paddlespeech/server/restful/api.py index f1e4ffc8..63f865e8 100644 --- a/paddlespeech/server/restful/api.py +++ b/paddlespeech/server/restful/api.py @@ -29,19 +29,19 @@ def setup_router(api_list: List): """setup router for fastapi Args: - api_list (List): [asr, tts, cls] + api_list (List): [asr, tts, cls, text, vecotr] Returns: APIRouter """ for api_name in api_list: - if api_name == 'asr': + if api_name.lower() == 'asr': _router.include_router(asr_router) - elif api_name == 'tts': + elif api_name.lower() == 'tts': _router.include_router(tts_router) - elif api_name == 'cls': + elif api_name.lower() == 'cls': _router.include_router(cls_router) - elif api_name == 'text': + elif api_name.lower() == 'text': _router.include_router(text_router) elif api_name.lower() == 'vector': _router.include_router(vec_router) diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 75f4a10b..2bce28e3 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -43,6 +43,7 @@ class TextHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/text' + logger.info(f"endpoint: {self.url}") def run(self, text): """Call the text server to process the specific text @@ -107,8 +108,10 @@ class ASRWsAudioHandler: """ samples, sample_rate = soundfile.read(wavfile_path, dtype='int16') x_len = len(samples) + assert sample_rate == 16000 + + chunk_size = int(85 * sample_rate / 1000) # 85ms, sample_rate = 16kHz - chunk_size = 85 * 16 #80ms, sample_rate = 16kHz if x_len % chunk_size != 0: padding_len_x = chunk_size - x_len % chunk_size else: @@ -217,6 +220,7 @@ class ASRHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/asr' + logger.info(f"endpoint: {self.url}") def run(self, input, audio_format, sample_rate, lang): """Call the http asr to process the audio @@ -275,6 +279,7 @@ class TTSWsHandler: self.start_play = True self.t = threading.Thread(target=self.play_audio) self.max_fail = 50 + logger.info(f"endpoint: {self.url}") def play_audio(self): while True: @@ -383,6 +388,7 @@ class TTSHttpHandler: self.start_play = True self.t = threading.Thread(target=self.play_audio) self.max_fail = 50 + logger.info(f"endpoint: {self.url}") def play_audio(self): while True: @@ -483,6 +489,7 @@ class VectorHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/vector' + logger.info(f"endpoint: {self.url}") def run(self, input, audio_format, sample_rate, task="spk"): """Call the http asr to process the audio @@ -529,6 +536,7 @@ class VectorScoreHttpHandler: else: self.url = 'http://' + self.server_ip + ":" + str( self.port) + '/paddlespeech/vector/score' + logger.info(f"endpoint: {self.url}") def run(self, enroll_audio, test_audio, audio_format, sample_rate): """Call the http asr to process the audio diff --git a/paddlespeech/server/utils/audio_process.py b/paddlespeech/server/utils/audio_process.py index 6fb5bb83..bb02d664 100644 --- a/paddlespeech/server/utils/audio_process.py +++ b/paddlespeech/server/utils/audio_process.py @@ -107,7 +107,7 @@ def change_speed(sample_raw, speed_rate, sample_rate): def float2pcm(sig, dtype='int16'): - """Convert floating point signal with a range from -1 to 1 to PCM. + """Convert floating point signal with a range from -1 to 1 to PCM16. Args: sig (array): Input array, must have floating point type. diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py index d4e6cd49..f56db752 100644 --- a/paddlespeech/server/utils/buffer.py +++ b/paddlespeech/server/utils/buffer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - class Frame(object): """Represents a "frame" of audio data.""" @@ -46,8 +45,7 @@ class ChunkBuffer(object): self.shift_ms = shift_ms self.sample_rate = sample_rate self.sample_width = sample_width # int16 = 2; float32 = 4 - self.remained_audio = b'' - + self.window_sec = float((self.window_n - 1) * self.shift_ms + self.window_ms) / 1000.0 self.shift_sec = float(self.shift_n * self.shift_ms / 1000.0) @@ -57,22 +55,31 @@ class ChunkBuffer(object): self.shift_bytes = int(self.shift_sec * self.sample_rate * self.sample_width) + self.remained_audio = b'' + # abs timestamp from `start` or latest `reset` + self.timestamp = 0.0 + + def reset(self): + """ + reset buffer state. + """ + self.timestamp = 0.0 + self.remained_audio = b'' + def frame_generator(self, audio): """Generates audio frames from PCM audio data. Takes the desired frame duration in milliseconds, the PCM data, and the sample rate. Yields Frames of the requested duration. """ - audio = self.remained_audio + audio self.remained_audio = b'' offset = 0 - timestamp = 0.0 while offset + self.window_bytes <= len(audio): - yield Frame(audio[offset:offset + self.window_bytes], timestamp, + yield Frame(audio[offset:offset + self.window_bytes], self.timestamp, self.window_sec) - timestamp += self.shift_sec + self.timestamp += self.shift_sec offset += self.shift_bytes self.remained_audio += audio[offset:] diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py index 313fd16f..83d542a1 100644 --- a/paddlespeech/server/ws/api.py +++ b/paddlespeech/server/ws/api.py @@ -15,8 +15,8 @@ from typing import List from fastapi import APIRouter -from paddlespeech.server.ws.asr_socket import router as asr_router -from paddlespeech.server.ws.tts_socket import router as tts_router +from paddlespeech.server.ws.asr_api import router as asr_router +from paddlespeech.server.ws.tts_api import router as tts_router _router = APIRouter() diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py deleted file mode 100644 index 0f7dcddd..00000000 --- a/paddlespeech/server/ws/asr_socket.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json - -from fastapi import APIRouter -from fastapi import WebSocket -from fastapi import WebSocketDisconnect -from starlette.websockets import WebSocketState as WebSocketState - -from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler -from paddlespeech.server.engine.engine_pool import get_engine_pool - -router = APIRouter() - - -@router.websocket('/paddlespeech/asr/streaming') -async def websocket_endpoint(websocket: WebSocket): - """PaddleSpeech Online ASR Server api - - Args: - websocket (WebSocket): the websocket instance - """ - - #1. the interface wait to accept the websocket protocal header - # and only we receive the header, it establish the connection with specific thread - await websocket.accept() - - #2. if we accept the websocket headers, we will get the online asr engine instance - engine_pool = get_engine_pool() - asr_engine = engine_pool['asr'] - - #3. each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio - # and each connection has its own connection instance to process the request - # and only if client send the start signal, we create the PaddleASRConnectionHanddler instance - connection_handler = None - - try: - #4. we do a loop to process the audio package by package according the protocal - # and only if the client send finished signal, we will break the loop - while True: - # careful here, changed the source code from starlette.websockets - # 4.1 we wait for the client signal for the specific action - assert websocket.application_state == WebSocketState.CONNECTED - message = await websocket.receive() - websocket._raise_on_disconnect(message) - - #4.2 text for the action command and bytes for pcm data - if "text" in message: - # we first parse the specific command - message = json.loads(message["text"]) - if 'signal' not in message: - resp = {"status": "ok", "message": "no valid json data"} - await websocket.send_json(resp) - - # start command, we create the PaddleASRConnectionHanddler instance to process the audio data - # end command, we process the all the last audio pcm and return the final result - # and we break the loop - if message['signal'] == 'start': - resp = {"status": "ok", "signal": "server_ready"} - # do something at begining here - # create the instance to process the audio - connection_handler = PaddleASRConnectionHanddler(asr_engine) - await websocket.send_json(resp) - elif message['signal'] == 'end': - # reset single engine for an new connection - # and we will destroy the connection - connection_handler.decode(is_finished=True) - connection_handler.rescoring() - asr_results = connection_handler.get_result() - word_time_stamp = connection_handler.get_word_time_stamp() - connection_handler.reset() - - resp = { - "status": "ok", - "signal": "finished", - 'result': asr_results, - 'times': word_time_stamp - } - await websocket.send_json(resp) - break - else: - resp = {"status": "ok", "message": "no valid json data"} - await websocket.send_json(resp) - elif "bytes" in message: - # bytes for the pcm data - message = message["bytes"] - - # we extract the remained audio pcm - # and decode for the result in this package data - connection_handler.extract_feat(message) - connection_handler.decode(is_finished=False) - asr_results = connection_handler.get_result() - - # return the current period result - # if the engine create the vad instance, this connection will have many period results - resp = {'result': asr_results} - await websocket.send_json(resp) - except WebSocketDisconnect: - pass diff --git a/paddlespeech/server/ws/tts_socket.py b/paddlespeech/server/ws/tts_socket.py deleted file mode 100644 index 482aeb79..00000000 --- a/paddlespeech/server/ws/tts_socket.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import json - -from fastapi import APIRouter -from fastapi import WebSocket -from fastapi import WebSocketDisconnect -from starlette.websockets import WebSocketState as WebSocketState - -from paddlespeech.cli.log import logger -from paddlespeech.server.engine.engine_pool import get_engine_pool - -router = APIRouter() - - -@router.websocket('/paddlespeech/tts/streaming') -async def websocket_endpoint(websocket: WebSocket): - await websocket.accept() - - try: - # careful here, changed the source code from starlette.websockets - assert websocket.application_state == WebSocketState.CONNECTED - message = await websocket.receive() - websocket._raise_on_disconnect(message) - - # get engine - engine_pool = get_engine_pool() - tts_engine = engine_pool['tts'] - - # 获取 message 并转文本 - message = json.loads(message["text"]) - text_bese64 = message["text"] - sentence = tts_engine.preprocess(text_bese64=text_bese64) - - # run - wav_generator = tts_engine.run(sentence) - - while True: - try: - tts_results = next(wav_generator) - resp = {"status": 1, "audio": tts_results} - await websocket.send_json(resp) - except StopIteration as e: - resp = {"status": 2, "audio": ''} - await websocket.send_json(resp) - logger.info("Complete the transmission of audio streams") - break - - except WebSocketDisconnect: - pass From 175c67b75ea34274e0f1162a7fc6e0e0139cde22 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 7 May 2022 07:11:22 +0000 Subject: [PATCH 2/3] asr socket to asr api --- paddlespeech/server/ws/asr_api.py | 110 ++++++++++++++++++++++++++++++ paddlespeech/server/ws/tts_api.py | 61 +++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 paddlespeech/server/ws/asr_api.py create mode 100644 paddlespeech/server/ws/tts_api.py diff --git a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py new file mode 100644 index 00000000..0f7dcddd --- /dev/null +++ b/paddlespeech/server/ws/asr_api.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json + +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler +from paddlespeech.server.engine.engine_pool import get_engine_pool + +router = APIRouter() + + +@router.websocket('/paddlespeech/asr/streaming') +async def websocket_endpoint(websocket: WebSocket): + """PaddleSpeech Online ASR Server api + + Args: + websocket (WebSocket): the websocket instance + """ + + #1. the interface wait to accept the websocket protocal header + # and only we receive the header, it establish the connection with specific thread + await websocket.accept() + + #2. if we accept the websocket headers, we will get the online asr engine instance + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + + #3. each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio + # and each connection has its own connection instance to process the request + # and only if client send the start signal, we create the PaddleASRConnectionHanddler instance + connection_handler = None + + try: + #4. we do a loop to process the audio package by package according the protocal + # and only if the client send finished signal, we will break the loop + while True: + # careful here, changed the source code from starlette.websockets + # 4.1 we wait for the client signal for the specific action + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + + #4.2 text for the action command and bytes for pcm data + if "text" in message: + # we first parse the specific command + message = json.loads(message["text"]) + if 'signal' not in message: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + + # start command, we create the PaddleASRConnectionHanddler instance to process the audio data + # end command, we process the all the last audio pcm and return the final result + # and we break the loop + if message['signal'] == 'start': + resp = {"status": "ok", "signal": "server_ready"} + # do something at begining here + # create the instance to process the audio + connection_handler = PaddleASRConnectionHanddler(asr_engine) + await websocket.send_json(resp) + elif message['signal'] == 'end': + # reset single engine for an new connection + # and we will destroy the connection + connection_handler.decode(is_finished=True) + connection_handler.rescoring() + asr_results = connection_handler.get_result() + word_time_stamp = connection_handler.get_word_time_stamp() + connection_handler.reset() + + resp = { + "status": "ok", + "signal": "finished", + 'result': asr_results, + 'times': word_time_stamp + } + await websocket.send_json(resp) + break + else: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + elif "bytes" in message: + # bytes for the pcm data + message = message["bytes"] + + # we extract the remained audio pcm + # and decode for the result in this package data + connection_handler.extract_feat(message) + connection_handler.decode(is_finished=False) + asr_results = connection_handler.get_result() + + # return the current period result + # if the engine create the vad instance, this connection will have many period results + resp = {'result': asr_results} + await websocket.send_json(resp) + except WebSocketDisconnect: + pass diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py new file mode 100644 index 00000000..699ee412 --- /dev/null +++ b/paddlespeech/server/ws/tts_api.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json + +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import get_engine_pool + +router = APIRouter() + + +@router.websocket('/ws/tts') +async def websocket_endpoint(websocket: WebSocket): + await websocket.accept() + + try: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + + # get engine + engine_pool = get_engine_pool() + tts_engine = engine_pool['tts'] + + # 获取 message 并转文本 + message = json.loads(message["text"]) + text_bese64 = message["text"] + sentence = tts_engine.preprocess(text_bese64=text_bese64) + + # run + wav_generator = tts_engine.run(sentence) + + while True: + try: + tts_results = next(wav_generator) + resp = {"status": 1, "audio": tts_results} + await websocket.send_json(resp) + except StopIteration as e: + resp = {"status": 2, "audio": ''} + await websocket.send_json(resp) + logger.info("Complete the transmission of audio streams") + break + + except WebSocketDisconnect: + pass From 12ae137c83f5b3d804dffd6c40ac9d5e128bd883 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 7 May 2022 07:12:57 +0000 Subject: [PATCH 3/3] update tts_api for ws --- paddlespeech/server/ws/tts_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/server/ws/tts_api.py b/paddlespeech/server/ws/tts_api.py index 699ee412..20a63d4c 100644 --- a/paddlespeech/server/ws/tts_api.py +++ b/paddlespeech/server/ws/tts_api.py @@ -24,7 +24,7 @@ from paddlespeech.server.engine.engine_pool import get_engine_pool router = APIRouter() -@router.websocket('/ws/tts') +@router.websocket('/paddlespeech/tts/streaming') async def websocket_endpoint(websocket: WebSocket): await websocket.accept() @@ -58,4 +58,4 @@ async def websocket_endpoint(websocket: WebSocket): break except WebSocketDisconnect: - pass + pass \ No newline at end of file