From 3535079434bf8c51d16474e275c2ef6e0f0f7691 Mon Sep 17 00:00:00 2001 From: xiongxinlei Date: Sun, 15 May 2022 12:21:50 +0800 Subject: [PATCH] update the acs engine doc, test=doc --- demos/audio_content_search/README.md | 69 ++++++++++++ demos/audio_content_search/README_cn.md | 68 ++++++++++++ .../conf/acs_application.yaml | 2 +- .../audio_content_search/{ => conf}/words.txt | 0 demos/audio_content_search/run.sh | 6 ++ .../ws_conformer_wenetspeech_application.yaml | 2 +- .../server/bin/paddlespeech_client.py | 85 +++++++++++++++ .../server/engine/acs/python/acs_engine.py | 46 +++++++- paddlespeech/server/restful/acs_api.py | 101 ++++++++++++++++++ paddlespeech/server/restful/response.py | 31 +++++- paddlespeech/server/utils/audio_handler.py | 4 +- paddlespeech/server/ws/asr_api.py | 62 +---------- 12 files changed, 406 insertions(+), 70 deletions(-) create mode 100644 demos/audio_content_search/README.md create mode 100644 demos/audio_content_search/README_cn.md rename demos/audio_content_search/{ => conf}/words.txt (100%) create mode 100755 demos/audio_content_search/run.sh create mode 100644 paddlespeech/server/restful/acs_api.py diff --git a/demos/audio_content_search/README.md b/demos/audio_content_search/README.md new file mode 100644 index 00000000..040d215f --- /dev/null +++ b/demos/audio_content_search/README.md @@ -0,0 +1,69 @@ +([简体中文](./README_cn.md)|English) +# ACS (Audio Content Search) + +## Introduction +ACS, or Audio Content Search, refers to the problem of getting the timestamps of keywords from automatically transcribed spoken language (speech-to-text). + +This demo is an implementation that gets the keyword timestamps from the transcription of a specific audio file. It can be done by a single command or a few lines in python using `PaddleSpeech`. + +## Usage +### 1. Installation +see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose either the medium or the hard way to install paddlespeech. 
+ +### 2. Prepare Input File +The input of this demo should be a WAV file(`.wav`), and the sample rate must be the same as the model. + +Here are sample files for this demo that can be downloaded: +```bash +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +``` + +### 3. Usage +- Command Line(Recommended) + ```bash + # Chinese + paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav + ``` + + Usage: + ```bash + paddlespeech_client acs --help + ``` + Arguments: + - `input`(required): Audio file to recognize. + - `server_ip`: the server ip. + - `port`: the server port. + - `lang`: the language type of the model. Default: `zh_cn`. + - `sample_rate`: Sample rate of the model. Default: `16000`. + - `audio_format`: The audio format. + + Output: + ```bash + [2022-05-15 15:00:58,185] [ INFO] - acs http client start + [2022-05-15 15:00:58,185] [ INFO] - endpoint: http://127.0.0.1:8490/paddlespeech/asr/search + [2022-05-15 15:01:03,220] [ INFO] - acs http client finished + [2022-05-15 15:01:03,221] [ INFO] - ACS result: {'transcription': '我认为跑步最重要的就是给我带来了身体健康', 'acs': [{'w': '我', 'bg': 0, 'ed': 1.6800000000000002}, {'w': '我', 'bg': 2.1, 'ed': 4.28}, {'w': '康', 'bg': 3.2, 'ed': 4.92}]} + [2022-05-15 15:01:03,221] [ INFO] - Response time 5.036084 s. 
+ ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import ACSClientExecutor + + acs_executor = ACSClientExecutor() + res = acs_executor( + input='./zh.wav', + server_ip="127.0.0.1", + port=8490,) + print(res) + ``` + + Output: + ```bash + [2022-05-15 15:08:13,955] [ INFO] - acs http client start + [2022-05-15 15:08:13,956] [ INFO] - endpoint: http://127.0.0.1:8490/paddlespeech/asr/search + [2022-05-15 15:08:19,026] [ INFO] - acs http client finished + {'transcription': '我认为跑步最重要的就是给我带来了身体健康', 'acs': [{'w': '我', 'bg': 0, 'ed': 1.6800000000000002}, {'w': '我', 'bg': 2.1, 'ed': 4.28}, {'w': '康', 'bg': 3.2, 'ed': 4.92}]} + ``` diff --git a/demos/audio_content_search/README_cn.md b/demos/audio_content_search/README_cn.md new file mode 100644 index 00000000..6ccf328d --- /dev/null +++ b/demos/audio_content_search/README_cn.md @@ -0,0 +1,68 @@ +(简体中文|[English](./README.md)) + +# 语音内容搜索 +## 介绍 +语音内容搜索是一项用计算机程序获取转录语音内容关键词时间戳的技术。 + +这个 demo 是一个从给定音频文件获取其文本中关键词时间戳的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 + +## 使用方法 +### 1. 安装 +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 medium,hard 两种方式中选择一种方式安装。 + +### 2. 准备输入 +这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 + +可以下载此 demo 的示例音频: +```bash +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav +``` +### 3. 
使用方法 +- 命令行 (推荐使用) + ```bash + # 中文 + paddlespeech_client acs --server_ip 127.0.0.1 --port 8490 --input ./zh.wav + ``` + + 使用方法: + ```bash + paddlespeech_client acs --help + ``` + 参数: + - `input`(必须输入):用于识别的音频文件。 + - `server_ip`: 服务的ip。 + - `port`:服务的端口。 + - `lang`:模型语言,默认值:`zh_cn`。 + - `sample_rate`:音频采样率,默认值:`16000`。 + - `audio_format`: 音频的格式。 + + 输出: + ```bash + [2022-05-15 15:00:58,185] [ INFO] - acs http client start + [2022-05-15 15:00:58,185] [ INFO] - endpoint: http://127.0.0.1:8490/paddlespeech/asr/search + [2022-05-15 15:01:03,220] [ INFO] - acs http client finished + [2022-05-15 15:01:03,221] [ INFO] - ACS result: {'transcription': '我认为跑步最重要的就是给我带来了身体健康', 'acs': [{'w': '我', 'bg': 0, 'ed': 1.6800000000000002}, {'w': '我', 'bg': 2.1, 'ed': 4.28}, {'w': '康', 'bg': 3.2, 'ed': 4.92}]} + [2022-05-15 15:01:03,221] [ INFO] - Response time 5.036084 s. + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import ACSClientExecutor + + acs_executor = ACSClientExecutor() + res = acs_executor( + input='./zh.wav', + server_ip="127.0.0.1", + port=8490,) + print(res) + ``` + + 输出: + ```bash + [2022-05-15 15:08:13,955] [ INFO] - acs http client start + [2022-05-15 15:08:13,956] [ INFO] - endpoint: http://127.0.0.1:8490/paddlespeech/asr/search + [2022-05-15 15:08:19,026] [ INFO] - acs http client finished + {'transcription': '我认为跑步最重要的就是给我带来了身体健康', 'acs': [{'w': '我', 'bg': 0, 'ed': 1.6800000000000002}, {'w': '我', 'bg': 2.1, 'ed': 4.28}, {'w': '康', 'bg': 3.2, 'ed': 4.92}]} + ``` diff --git a/demos/audio_content_search/conf/acs_application.yaml b/demos/audio_content_search/conf/acs_application.yaml index 010661e3..b862931a 100644 --- a/demos/audio_content_search/conf/acs_application.yaml +++ b/demos/audio_content_search/conf/acs_application.yaml @@ -27,7 +27,7 @@ acs_python: asr_server_ip: 127.0.0.1 asr_server_port: 8390 lang: 'zh' - word_list: "words.txt" + word_list: "./conf/words.txt" sample_rate: 16000 device: 'cpu' # set 'gpu:id' or 'cpu' diff --git 
a/demos/audio_content_search/words.txt b/demos/audio_content_search/conf/words.txt similarity index 100% rename from demos/audio_content_search/words.txt rename to demos/audio_content_search/conf/words.txt diff --git a/demos/audio_content_search/run.sh b/demos/audio_content_search/run.sh new file mode 100755 index 00000000..ddebdfee --- /dev/null +++ b/demos/audio_content_search/run.sh @@ -0,0 +1,6 @@ +export CUDA_VISIBLE_DEVICE=0,1,2,3 +#nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_application.yaml &> streaming_asr.log & + +# nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 & +paddlespeech_server start --config_file conf/acs_application.yaml + diff --git a/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml b/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml index c23680bd..e9a89c19 100644 --- a/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml +++ b/demos/streaming_asr_server/conf/ws_conformer_wenetspeech_application.yaml @@ -4,7 +4,7 @@ # SERVER SETTING # ################################################################################# host: 0.0.0.0 -port: 8390 +port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_online'] diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index 3adf8015..745143f6 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -752,3 +752,88 @@ class VectorClientExecutor(BaseExecutor): logger.info(f"The vector score is: {res}") else: logger.error(f"Sorry, we have not support such task {task}") + + +@cli_client_register( + name='paddlespeech_client.acs', description='visit acs service') +class ACSClientExecutor(BaseExecutor): + def __init__(self): + super(ACSClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.acs', 
add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8090, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Audio file to be recognized', + required=True) + self.parser.add_argument( + '--sample_rate', type=int, default=16000, help='audio sample rate') + self.parser.add_argument( + '--lang', type=str, default="zh_cn", help='language') + self.parser.add_argument( + '--audio_format', type=str, default="wav", help='audio format') + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + sample_rate = args.sample_rate + lang = args.lang + audio_format = args.audio_format + + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + sample_rate=sample_rate, + lang=lang, + audio_format=audio_format, ) + time_end = time.time() + logger.info(f"ACS result: {res}") + logger.info("Response time %f s." % (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to speech recognition.") + logger.error(e) + return False + + @stats_wrapper + def __call__( + self, + input: str, + server_ip: str="127.0.0.1", + port: int=8090, + sample_rate: int=16000, + lang: str="zh_cn", + audio_format: str="wav", ): + """Python API to call an executor. + + Args: + input (str): The input audio file path + server_ip (str, optional): The ASR server ip. Defaults to "127.0.0.1". + port (int, optional): The ASR server port. Defaults to 8090. + sample_rate (int, optional): The audio sample rate. Defaults to 16000. + lang (str, optional): The audio language type. Defaults to "zh_cn". + audio_format (str, optional): The audio format information. Defaults to "wav". 
+ + Returns: + str: The ACS results + """ + # we use the acs server to get the key word time stamp in audio text content + logger.info("asr http client start") + from paddlespeech.server.utils.audio_handler import ASRHttpHandler + handler = ASRHttpHandler( + server_ip=server_ip, port=port, endpoint="/paddlespeech/asr/search") + res = handler.run(input, audio_format, sample_rate, lang) + res = res['result'] + logger.info("asr http client finished") + + return res diff --git a/paddlespeech/server/engine/acs/python/acs_engine.py b/paddlespeech/server/engine/acs/python/acs_engine.py index 42cdbb0a..30deeeb5 100644 --- a/paddlespeech/server/engine/acs/python/acs_engine.py +++ b/paddlespeech/server/engine/acs/python/acs_engine.py @@ -62,6 +62,7 @@ class ACSEngine(BaseEngine): self.read_search_words() + # init the asr url self.url = "ws://" + self.config.asr_server_ip + ":" + str( self.config.asr_server_port) + "/paddlespeech/asr/streaming" @@ -81,11 +82,19 @@ class ACSEngine(BaseEngine): return with open(word_list, 'r') as fp: - self.word_list = fp.readlines() + self.word_list = [line.strip() for line in fp.readlines()] logger.info(f"word list: {self.word_list}") def get_asr_content(self, audio_data): + """Get the streaming asr result + + Args: + audio_data (_type_): _description_ + + Returns: + _type_: _description_ + """ logger.info("send a message to the server") if self.url is None: logger.error("No asr server, please input valid ip and port") @@ -134,17 +143,46 @@ class ACSEngine(BaseEngine): return msg def get_macthed_word(self, msg): + """Get the matched info in msg + + Args: + msg (dict): the asr info, including the asr result and time stamp + + Returns: + acs_result, asr_result: the acs result and the asr result + """ asr_result = msg['result'] time_stamp = msg['times'] + acs_result = [] + # search for each word in self.word_list + offset = self.config.offset + max_ed = time_stamp[-1]['ed'] for w in self.word_list: + # search the w in asr_result and the index in 
asr_result for m in re.finditer(w, asr_result): - start = time_stamp[m.start(0)]['bg'] - end = time_stamp[m.end(0) - 1]['ed'] + start = max(time_stamp[m.start(0)]['bg'] - offset, 0) + + end = min(time_stamp[m.end(0) - 1]['ed'] + offset, max_ed) logger.info(f'start: {start}, end: {end}') + acs_result.append({'w': w, 'bg': start, 'ed': end}) + + return acs_result, asr_result def run(self, audio_data): + """process the audio data in acs engine + the engine does not store any data, so all the request use the self.run api + + Args: + audio_data (str): the audio data + + Returns: + acs_result, asr_result: the acs result and the asr result + """ logger.info("start to process the audio content search") msg = self.get_asr_content(io.BytesIO(audio_data)) - self.get_macthed_word(msg) + acs_result, asr_result = self.get_macthed_word(msg) + logger.info(f'the asr result {asr_result}') + logger.info(f'the acs result: {acs_result}') + return acs_result, asr_result diff --git a/paddlespeech/server/restful/acs_api.py b/paddlespeech/server/restful/acs_api.py new file mode 100644 index 00000000..61cb34d9 --- /dev/null +++ b/paddlespeech/server/restful/acs_api.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import base64 +from typing import Union + +from fastapi import APIRouter + +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.restful.request import ASRRequest +from paddlespeech.server.restful.response import ACSResponse +from paddlespeech.server.restful.response import ErrorResponse +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.errors import failed_response +from paddlespeech.server.utils.exception import ServerBaseException + +router = APIRouter() + + +@router.get('/paddlespeech/asr/search/help') +def help(): + """help + + Returns: + json: the audio content search result + """ + response = { + "success": "True", + "code": 200, + "message": { + "global": "success" + }, + "result": { + "description": "acs server", + "input": "base64 string of wavfile", + "output": { + "asr_result": "你好", + "acs_result": [{ + 'w': '你', + 'bg': 0.0, + 'ed': 1.2 + }] + } + } + } + return response + + +@router.post( + "/paddlespeech/asr/search", + response_model=Union[ACSResponse, ErrorResponse]) +def acs(request_body: ASRRequest): + """acs api + + Args: + request_body (ASRRequest): the acs request, we reuse the http ASRRequest + + Returns: + json: the acs result + """ + try: + # 1. get the audio data via base64 decoding + audio_data = base64.b64decode(request_body.audio) + + # 2. get single engine from engine pool + engine_pool = get_engine_pool() + acs_engine = engine_pool['acs'] + + # 3. 
no data stored in acs_engine, so we need to create the another instance process the data + acs_result, asr_result = acs_engine.run(audio_data) + + response = { + "success": True, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "transcription": asr_result, + "acs": acs_result + } + } + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException as e: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + logger.error(e) + + return response diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index c91b3899..3d991de4 100644 --- a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -17,7 +17,7 @@ from pydantic import BaseModel __all__ = [ 'ASRResponse', 'TTSResponse', 'CLSResponse', 'TextResponse', - 'VectorResponse', 'VectorScoreResponse' + 'VectorResponse', 'VectorScoreResponse', 'ACSResponse' ] @@ -231,3 +231,32 @@ class ErrorResponse(BaseModel): success: bool code: int message: Message + + +#****************************************************************************************/ +#************************************ ACS response **************************************/ +#****************************************************************************************/ +class AcsResult(BaseModel): + transcription: str + acs: list + + +class ACSResponse(BaseModel): + """ + response example + { + "success": true, + "code": 0, + "message": { + "description": "success" + }, + "result": { + "transcription": "你好,飞桨" + "acs": [(你好, 0.0, 0.45)] + } + } + """ + success: bool + code: int + message: Message + result: AcsResult diff --git a/paddlespeech/server/utils/audio_handler.py b/paddlespeech/server/utils/audio_handler.py index 0fcdd08a..baa7b934 100644 --- a/paddlespeech/server/utils/audio_handler.py +++ b/paddlespeech/server/utils/audio_handler.py @@ -96,7 +96,7 @@ class ASRWsAudioHandler: self.punc_server = 
TextHttpHandler(punc_server_ip, punc_server_port) logger.info(f"endpoint: {self.url}") - def read_wave(self, wavfile_path): + def read_wave(self, wavfile_path: str): """read the audio file from specific wavfile path Args: @@ -129,7 +129,7 @@ class ASRWsAudioHandler: x_chunk = padded_x[start:end] yield x_chunk - async def run(self, wavfile_path): + async def run(self, wavfile_path: str): """Send a audio file to online server Args: diff --git a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py index bf6e912c..0faa131a 100644 --- a/paddlespeech/server/ws/asr_api.py +++ b/paddlespeech/server/ws/asr_api.py @@ -12,24 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import json -import base64 -from typing import Union + from fastapi import APIRouter from fastapi import WebSocket -import soundfile -import io from fastapi import WebSocketDisconnect from starlette.websockets import WebSocketState as WebSocketState from paddlespeech.cli.log import logger from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler from paddlespeech.server.engine.engine_pool import get_engine_pool -from paddlespeech.server.restful.response import ASRResponse -from paddlespeech.server.restful.response import ErrorResponse -from paddlespeech.server.restful.request import ASRRequest -from paddlespeech.server.utils.exception import ServerBaseException -from paddlespeech.server.utils.errors import failed_response -from paddlespeech.server.utils.errors import ErrorCode router = APIRouter() @@ -117,54 +108,3 @@ async def websocket_endpoint(websocket: WebSocket): await websocket.send_json(resp) except WebSocketDisconnect as e: logger.error(e) - - -# @router.post( -# "/paddlespeech/asr/search/", response_model=Union[ASRResponse, ErrorResponse]) -# def asr(request_body: ASRRequest): -# """asr api - -# Args: -# request_body (ASRRequest): [description] - -# Returns: -# json: [description] -# """ 
-# try: -# audio_data = base64.b64decode(request_body.audio) - -# # get single engine from engine pool -# engine_pool = get_engine_pool() -# asr_engine = engine_pool['asr'] - -# samples, sample_rate = soundfile.read(io.BytesIO(audio_data), dtype='int16') -# # print(samples.shape) -# # print(sample_rate) -# connection_handler = PaddleASRConnectionHanddler(asr_engine) -# connection_handler.extract_feat(samples) - -# connection_handler.decode(is_finished=True) -# asr_results = connection_handler.rescoring() -# asr_results = connection_handler.get_result() -# word_time_stamp = connection_handler.get_word_time_stamp() - -# response = { -# "success": True, -# "code": 200, -# "message": { -# "description": "success" -# }, -# "result": { -# "transcription": asr_results, -# "times": word_time_stamp -# } -# } - - -# except ServerBaseException as e: -# response = failed_response(e.error_code, e.msg) -# except BaseException as e: -# response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) -# print(e) - -# return response \ No newline at end of file